├── utils
    ├── __init__.py
    ├── dataset.py
    ├── length_model.py
    ├── grammar.py
    ├── network.py
    └── viterbi.py
├── LICENSE
├── README.md
├── eval.py
└── main.py


/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Alexander Richard
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Action Sets: Weakly Supervised Action Segmentation without Ordering Constraints
 2 | Code for the paper Action Sets: Weakly Supervised Action Segmentation without Ordering Constraints
 3 | 
 4 | ### Preparation:
 5 | 
 6 | * download the data from https://uni-bonn.sciebo.de/s/wOxTiWe5kfeY4Vd
 7 | * extract it so that you have the `data` folder in the same directory as `main.py`
 8 | * create a `results` directory in the same directory where you also find `main.py`: `mkdir results`
 9 | 
10 | Requirements: Python3.x with the libraries numpy, pytorch (version 0.4.1), and scipy
11 | 
12 | ### Training:
13 | 
14 | Run `python main.py training`
15 | 
16 | ### Inference:
17 | 
18 | Run `python main.py inference --n_threads=NUM_THREADS`, where `NUM_THREADS` should be replaced with the number of parallel CPU threads you want to use for Viterbi decoding.
19 | 
20 | ### Evaluation:
21 | 
22 | In the inference step, recognition files are written to the `results` directory. The frame-level ground truth is available in `data/groundTruth`. Run `python eval.py --recog_dir=results --ground_truth_dir=data/groundTruth` to evaluate the frame accuracy of the trained model.
23 | 
24 | ### Remarks:
25 | 
26 | We provide a python/pytorch implementation for easy usage. In the paper, we used a faster, in-house C++ implementation, so results can be slightly different. Running the provided setup on split1 of Breakfast should lead to roughly 23% frame accuracy.
27 | 
28 | If you use the code, please cite
29 | 
30 |     A. Richard, H. Kuehne, J. Gall:
31 |     Action Sets: Weakly Supervised Action Segmentation without Ordering Constraints
32 |     in IEEE Int. Conf. on Computer Vision and Pattern Recognition, 2018
33 | 


--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | import argparse
 4 | import glob
 5 | import re
 6 | 
 7 | 
 8 | def recog_file(filename, ground_truth_path):
 9 | 
10 |     # read ground truth
11 |     gt_file = ground_truth_path + re.sub('.*/','/',filename) + '.txt'
12 |     with open(gt_file, 'r') as f:
13 |         ground_truth = f.read().split('\n')[0:-1]
14 |         f.close()
15 |     # read recognized sequence
16 |     with open(filename, 'r') as f:
17 |         recognized = f.read().split('\n')[5].split() # framelevel recognition is in 6-th line of file
18 |         f.close()
19 | 
20 |     n_frame_errors = 0
21 |     for i in range(len(recognized)):
22 |         if not recognized[i] == ground_truth[i]:
23 |             n_frame_errors += 1
24 | 
25 |     return n_frame_errors, len(recognized)
26 | 
27 | 
28 | ### MAIN #######################################################################
29 | 
### arguments ###
### --recog_dir: the directory where the recognition files written by `main.py inference` are placed
### --ground_truth_dir: the directory where the framelevel ground truth can be found
parser = argparse.ArgumentParser()
parser.add_argument('--recog_dir', default='results')
parser.add_argument('--ground_truth_dir', default='data/groundTruth')
args = parser.parse_args()

# only files whose name starts with 'P' are treated as recognition files
filelist = glob.glob(args.recog_dir + '/P*')

print('Evaluate %d video files...' % len(filelist))

n_frames = 0
n_errors = 0
# loop over all recognition files and evaluate the frame error
for filename in filelist:
    errors, frames = recog_file(filename, args.ground_truth_dir)
    n_errors += errors
    n_frames += frames

# without any frame the accuracy is undefined; fail with a clear message
# instead of a ZeroDivisionError
if n_frames == 0:
    raise SystemExit('no recognized frames found in %s' % args.recog_dir)

# print frame accuracy (1.0 - frame error rate)
print('frame accuracy: %f' % (1.0 - float(n_errors) / n_frames))
52 | 


--------------------------------------------------------------------------------
/utils/dataset.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python2.7
 2 | 
 3 | import numpy as np
 4 | 
 5 | # reads the data
 6 | #
 7 | # @base_path: path to the data directory
 8 | # @video_list: list of video names to load
 9 | # @ label2index: mapping from labels (class names) to label indices
10 | #
11 | # self.features[video]: the feature array of the given video (dimension x frames)
12 | # self.action_set[video]: a set containing all occurring actions
13 | # self.ground_truth[video]: the ground truth labels of the video
14 | # self.input_dimension: dimension of video features
15 | # self.n_classes: number of classes
class Dataset(object):
    """Features, action sets and ground truth of a list of videos.

    For every video the constructor reads
      <base_path>/features/<video>.npy      -> self.features[video]
      <base_path>/transcripts/<video>.txt   -> self.action_set[video] (set of label indices)
      <base_path>/groundTruth/<video>.txt   -> self.ground_truth[video] (list of label indices)
    The text files hold one label per line; the text after the last newline
    is ignored.
    """

    def __init__(self, base_path, video_list, label2index):
        self.features = dict()
        self.action_set = dict()
        self.ground_truth = dict()
        for video in video_list:
            # video features, indexed as (dimension x frames)
            feature_file = base_path + '/features/' + video + '.npy'
            self.features[video] = np.load(feature_file)
            # the set of actions occurring in the video
            with open(base_path + '/transcripts/' + video + '.txt') as f:
                transcript_lines = f.read().split('\n')[0:-1]
                self.action_set[video] = set(label2index[name] for name in transcript_lines)
            # the frame-level ground truth labels
            with open(base_path + '/groundTruth/' + video + '.txt') as f:
                gt_lines = f.read().split('\n')[0:-1]
                self.ground_truth[video] = [label2index[name] for name in gt_lines]
        # the feature dimension is taken from the first loaded video
        all_features = list(self.features.values())
        self.input_dimension = all_features[0].shape[0]
        self.n_classes = len(label2index)
        self.n_frames = sum(data.shape[1] for data in all_features)

    def videos(self):
        """Return the names of all loaded videos (a dict key view)."""
        return self.features.keys()

    def length(self, video):
        """Return the number of frames of the given video."""
        n_frames = self.features[video].shape[1]
        return n_frames
42 | 


--------------------------------------------------------------------------------
/utils/length_model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python2.7
 2 | 
 3 | import numpy as np
 4 | 
 5 | 
 6 | class LengthModel(object):
 7 |     
 8 |     def n_classes(self):
 9 |         return 0
10 | 
11 |     def score(self, length, label):
12 |         return 0.0
13 | 
14 |     def max_length(self):
15 |         return np.inf
16 | 
17 | 
18 | class PoissonModel(LengthModel):
19 |     
20 |     def __init__(self, model_file, max_length = 5000, renormalize = True):
21 |         super(PoissonModel, self).__init__()
22 |         self.mean_lengths = np.loadtxt(model_file)
23 |         self.num_classes = self.mean_lengths.shape[0]
24 |         self.max_len = max_length
25 |         self.poisson = np.zeros((max_length, self.num_classes))
26 | 
27 |         # precompute normalizations for mean length model
28 |         self.norms = np.zeros(self.mean_lengths.shape)
29 |         if renormalize:
30 |             self.norms = np.round(self.mean_lengths) * np.log(np.round(self.mean_lengths)) - np.round(self.mean_lengths)
31 |             for c in range(len(self.mean_lengths)):
32 |                 logFak = 0
33 |                 for k in range(2, int(self.mean_lengths[c])+1):
34 |                     logFak += np.log(k)
35 |                 self.norms[c] = self.norms[c] - logFak
36 |         # precompute Poisson distribution
37 |         self.poisson[0, :] = -np.inf # length zero can not happen
38 |         logFak = 0
39 |         for l in range(1, self.max_len):
40 |             logFak += np.log(l)
41 |             self.poisson[l, :] = l * np.log(self.mean_lengths) - self.mean_lengths - logFak - self.norms
42 | 
43 |     def n_classes(self):
44 |         return self.num_classes
45 | 
46 |     def score(self, length, label):
47 |         if length > self.max_len:
48 |             return -np.inf
49 |         else:
50 |             return self.poisson[length, label]
51 | 
52 |     def max_lengths(self):
53 |         return self.max_len
54 | 
55 | 


--------------------------------------------------------------------------------
/utils/grammar.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python2.7
 2 | 
 3 | import numpy as np
 4 | 
 5 | 
 6 | class Grammar(object):
 7 |     
 8 |     # @context: tuple containing the previous label indices
 9 |     # @label: the current label index
10 |     # @return: the log probability of label given context p(label|context)
11 |     def score(self, context, label): # score is a log probability
12 |         return 0.0
13 | 
14 |     # @return: the number of classes
15 |     def n_classes(self):
16 |         return 0
17 | 
18 |     # @return sequence start symbol
19 |     def start_symbol(self):
20 |         return -1
21 | 
22 |     # @return sequence end symbol
23 |     def end_symbol(self):
24 |         return -2
25 | 
26 |     # @context: tuple containing the previous label indices
27 |     # @return: list of all possible successor labels for the given context
28 |     def possible_successors(context):
29 |         return set()
30 | 
31 | 
class PathGrammar(Grammar):
    """Grammar that allows exactly the label sequences listed in a transcript file.

    @transcript_file: text file with one whitespace separated label sequence per line
    @label2index_map: mapping from label names to label indices
    """

    def __init__(self, transcript_file, label2index_map):
        self.num_classes = len(label2index_map)
        paths = self._read_transcripts(transcript_file, label2index_map)
        # successors[context] is the set of symbols that may follow the context
        self.successors = dict()
        prefix_start = (self.start_symbol(),)
        for path in paths:
            path = path + [self.end_symbol()]
            for position, symbol in enumerate(path):
                context = prefix_start + tuple(path[0:position])
                known = self.successors.get(context, set())
                self.successors[context] = set([symbol]).union(known)

    def _read_transcripts(self, transcript_file, label2index_map):
        # the text after the last newline of the file is ignored
        with open(transcript_file, 'r') as f:
            lines = f.read().split('\n')[0:-1]
        return [ [ label2index_map[label] for label in line.split() ] for line in lines ]

    def n_classes(self):
        return self.num_classes

    def possible_successors(self, context):
        # unknown contexts have no successors
        if context in self.successors:
            return self.successors[context]
        return set()

    def score(self, context, label):
        # log probability: 0.0 for allowed transitions, -inf otherwise
        allowed = label in self.possible_successors(context)
        return 0.0 if allowed else -np.inf
64 | 
65 | 


--------------------------------------------------------------------------------
/utils/network.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python2.7
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | from torch.autograd import Variable
  6 | import torch.utils.data
  7 | import torch.nn as nn
  8 | import torch.optim as optim
  9 | from .dataset import Dataset
 10 | 
 11 | 
 12 | # wrapper class to provide videos from the dataset as pytorch tensors
 13 | class DatasetWrapper(torch.utils.data.Dataset):
 14 | 
 15 |     def __init__(self, dataset):
 16 |         self.dataset = dataset
 17 |         # datastructure for frame indexing
 18 |         self.selectors = []
 19 |         for video in self.dataset.features:
 20 |             self.selectors += [ (video, i) for i in range(self.dataset.features[video].shape[1]) ]
 21 | 
 22 |     def __len__(self):
 23 |         return len(self.selectors)
 24 | 
 25 |     def __getitem__(self, idx):
 26 |         assert idx < len(self)
 27 |         video = self.selectors[idx][0]
 28 |         frame = self.selectors[idx][1]
 29 |         features = torch.from_numpy( self.dataset.features[video][:, frame] )
 30 |         labels = []
 31 |         for c in range(self.dataset.n_classes):
 32 |             labels.append( torch.LongTensor([1 if c in self.dataset.action_set[video] else 0]) )
 33 |         return features, labels
 34 | 
 35 | 
 36 | # the neural network
 37 | class Net(nn.Module):
 38 | 
 39 |     def __init__(self, input_dim, n_classes):
 40 |         super(Net, self).__init__()
 41 |         self.n_classes = n_classes
 42 |         self.fc = nn.Linear(input_dim, 256)
 43 |         self.out_fc = []
 44 |         for c in range(n_classes):
 45 |             self.out_fc.append( nn.Linear(256, 2) )
 46 |         self.out_fc = nn.Sequential(*self.out_fc)
 47 | 
 48 |     def forward(self, x):
 49 |         x = nn.functional.relu(self.fc(x))
 50 |         outputs = []
 51 |         for c in range(self.n_classes):
 52 |             tmp = self.out_fc[c](x)
 53 |             tmp = nn.functional.log_softmax(tmp, dim = 1)
 54 |             outputs.append(tmp)
 55 |         return outputs
 56 | 
 57 | 
 58 | # class for network training
 59 | class Trainer(object):
 60 | 
 61 |     def __init__(self, dataset):
 62 |         self.dataset_wrapper = DatasetWrapper(dataset)
 63 |         self.net = Net(dataset.input_dimension, dataset.n_classes)
 64 |         self.net.cuda()
 65 | 
 66 |     def train(self, batch_size = 512, n_epochs = 2, learning_rate = 0.1):
 67 |         dataloader = torch.utils.data.DataLoader(self.dataset_wrapper, batch_size = batch_size, shuffle = True)
 68 |         criterion = nn.NLLLoss()
 69 |         optimizer = optim.SGD(self.net.parameters(), lr = learning_rate)
 70 |         # run for n epochs
 71 |         for epoch in range(n_epochs):
 72 |             # loop over all training data
 73 |             for i, data in enumerate(dataloader, 0):
 74 |                 optimizer.zero_grad()
 75 |                 input, target = data
 76 |                 input = Variable(input.cuda())
 77 |                 outputs = self.net(input)
 78 |                 loss = 0
 79 |                 for c, output in enumerate(outputs):
 80 |                     labels = Variable(target[c].cuda())
 81 |                     labels = labels.view(-1)
 82 |                     loss += criterion(output, labels)
 83 |                 loss.backward()
 84 |                 optimizer.step()
 85 |             print(float(loss))
 86 | 
 87 |     def save_model(self, model_file):
 88 |         torch.save(self.net.state_dict(), model_file)
 89 | 
 90 | 
 91 | # class to forward videos through a trained network
 92 | class Forwarder(object):
 93 | 
 94 |     def __init__(self, model_file):
 95 |         self.model_file = model_file
 96 |         self.net = None
 97 | 
 98 |     def forward(self, dataset):
 99 |         # read the data
100 |         dataset_wrapper = DatasetWrapper(dataset)
101 |         dataloader = torch.utils.data.DataLoader(dataset_wrapper, batch_size = 512, shuffle = False)
102 |         # load net if not yet done
103 |         if self.net == None:
104 |             self.net = Net(dataset.input_dimension, dataset.n_classes)
105 |             self.net.load_state_dict( torch.load(self.model_file) )
106 |             self.net.cuda()
107 |         # output probability container
108 |         log_probs = np.zeros( (dataset.n_frames, dataset.n_classes), dtype=np.float32 )
109 |         offset = 0
110 |         # forward all frames
111 |         for data in dataloader:
112 |             input, _ = data
113 |             input = Variable(input.cuda())
114 |             outputs = self.net(input)
115 |             for c, output in enumerate(outputs):
116 |                 log_probs[offset : offset + output.shape[0], c] = output.data.cpu()[:, 1]
117 |             offset += output.shape[0]
118 |         return log_probs
119 | 
120 | 


--------------------------------------------------------------------------------
/utils/viterbi.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python2.7
  2 | 
  3 | import numpy as np
  4 | from .grammar import PathGrammar
  5 | from .length_model import PoissonModel
  6 | import glob
  7 | import re
  8 | 
  9 | # Viterbi decoding
 10 | class Viterbi(object):
 11 | 
 12 |     ### helper structure ###
 13 |     class TracebackNode(object):
 14 |         def __init__(self, label, predecessor, boundary = False):
 15 |             self.label = label
 16 |             self.predecessor = predecessor
 17 |             self.boundary = boundary
 18 | 
 19 |     ### helper structure ###
 20 |     class HypDict(dict):
 21 |         class Hypothesis(object):
 22 |             def __init__(self, score, traceback):
 23 |                 self.score = score
 24 |                 self.traceback = traceback
 25 |         def update(self, key, score, traceback):
 26 |             if (not key in self) or (self[key].score <= score):
 27 |                 self[key] = self.Hypothesis(score, traceback)
 28 | 
 29 |     # @grammar: the grammar to use, must inherit from class Grammar
 30 |     # @length_model: the length model to use, must inherit from class LengthModel
 31 |     # @frame_sampling: generate hypotheses every frame_sampling frames
 32 |     # @max_hypotheses: maximal number of hypotheses. Smaller values result in stronger pruning
 33 |     def __init__(self, grammar, length_model, frame_sampling = 1, max_hypotheses = np.inf):
 34 |         self.grammar = grammar
 35 |         self.length_model = length_model
 36 |         self.frame_sampling = frame_sampling
 37 |         self.max_hypotheses = max_hypotheses
 38 | 
 39 |     # Viterbi decoding of a sequence
 40 |     # @log_frame_probs: logarithmized frame probabilities
 41 |     #                   (usually log(network_output) - log(prior) - max_val, where max_val ensures negativity of all log scores)
 42 |     # @return: the score of the best sequence,
 43 |     #          the corresponding framewise labels (len(labels) = len(sequence))
 44 |     #          and the inferred segments in the form (label, length)
 45 |     def decode(self, log_frame_probs):
 46 |         assert log_frame_probs.shape[1] == self.grammar.n_classes()
 47 |         frame_scores = np.cumsum(log_frame_probs, axis=0) # cumulative frame scores allow for quick lookup if frame_sampling > 1
 48 |         # create initial hypotheses
 49 |         hyps = self.init_decoding(frame_scores)
 50 |         # decode each following time step
 51 |         for t in range(2 * self.frame_sampling - 1, frame_scores.shape[0], self.frame_sampling):
 52 |             hyps = self.decode_frame(t, hyps, frame_scores)
 53 |             self.prune(hyps)
 54 |         # transition to end symbol
 55 |         final_hyp = self.finalize_decoding(hyps)
 56 |         labels, segments = self.traceback(final_hyp, frame_scores.shape[0])
 57 |         return final_hyp.score, labels, segments
 58 | 
 59 | 
 60 |     ### helper functions ###
 61 |     def frame_score(self, frame_scores, t, label):
 62 |         if t >= self.frame_sampling:
 63 |             return frame_scores[t, label] - frame_scores[t - self.frame_sampling, label]
 64 |         else:
 65 |             return frame_scores[t, label]
 66 | 
 67 |     def prune(self, hyps):
 68 |         if len(hyps) > self.max_hypotheses:
 69 |             tmp = sorted( [ (hyps[key].score, key) for key in hyps ] )
 70 |             del_keys = [ x[1] for x in tmp[0 : -self.max_hypotheses] ]
 71 |             for key in del_keys:
 72 |                 del hyps[key]
 73 | 
 74 |     def init_decoding(self, frame_scores):
 75 |         hyps = self.HypDict()
 76 |         context = (self.grammar.start_symbol(),)
 77 |         for label in self.grammar.possible_successors(context):
 78 |             key = context + (label, self.frame_sampling)
 79 |             score = self.grammar.score(context, label) + self.frame_score(frame_scores, self.frame_sampling - 1, label)
 80 |             hyps.update(key, score, self.TracebackNode(label, None, boundary = True))
 81 |         return hyps
 82 | 
 83 |     def decode_frame(self, t, old_hyp, frame_scores):
 84 |         new_hyp = self.HypDict()
 85 |         for key, hyp in old_hyp.items():
 86 |             context, label, length = key[0:-2], key[-2], key[-1]
 87 |             # stay in the same label...
 88 |             if length + self.frame_sampling <= self.length_model.max_length():
 89 |                 new_key = context + (label, length + self.frame_sampling)
 90 |                 score = hyp.score + self.frame_score(frame_scores, t, label)
 91 |                 new_hyp.update(new_key, score, self.TracebackNode(label, hyp.traceback, boundary = False))
 92 |             # ... or go to the next label
 93 |             context = context + (label,)
 94 |             for new_label in self.grammar.possible_successors(context):
 95 |                 if new_label == self.grammar.end_symbol():
 96 |                     continue
 97 |                 new_key = context + (new_label, self.frame_sampling)
 98 |                 score = hyp.score + self.frame_score(frame_scores, t, label) + self.length_model.score(length, label) + self.grammar.score(context, new_label)
 99 |                 new_hyp.update(new_key, score, self.TracebackNode(new_label, hyp.traceback, boundary = True))
100 |         # return new hypotheses
101 |         return new_hyp
102 | 
103 |     def finalize_decoding(self, old_hyp):
104 |         final_hyp = self.HypDict.Hypothesis(-np.inf, None)
105 |         for key, hyp in old_hyp.items():
106 |             context, label, length = key[0:-2], key[-2], key[-1]
107 |             context = context + (label,)
108 |             score = hyp.score + self.length_model.score(length, label) + self.grammar.score(context, self.grammar.end_symbol())
109 |             if score >= final_hyp.score:
110 |                 final_hyp.score, final_hyp.traceback = score, hyp.traceback
111 |         # return final hypothesis
112 |         return final_hyp
113 | 
114 |     def traceback(self, hyp, n_frames):
115 |         class Segment(object):
116 |             def __init__(self, label):
117 |                 self.label, self.length = label, 0
118 |         traceback = hyp.traceback
119 |         labels = []
120 |         segments = [Segment(traceback.label)]
121 |         while not traceback == None:
122 |             segments[-1].length += self.frame_sampling
123 |             labels += [traceback.label] * self.frame_sampling
124 |             if traceback.boundary and not traceback.predecessor == None:
125 |                 segments.append( Segment(traceback.predecessor.label) )
126 |             traceback = traceback.predecessor
127 |         segments[0].length += n_frames - len(labels) # append length of missing frames
128 |         labels += [hyp.traceback.label] * (n_frames - len(labels)) # append labels for missing frames
129 |         return list(reversed(labels)), list(reversed(segments))
130 | 
131 | 
132 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | import numpy as np
  4 | import scipy.optimize
  5 | import random
  6 | import argparse
  7 | import multiprocessing as mp
  8 | import queue
  9 | from utils.dataset import Dataset
 10 | from utils.network import Trainer, Forwarder
 11 | from utils.grammar import PathGrammar
 12 | from utils.length_model import PoissonModel
 13 | from utils.viterbi import Viterbi
 14 | 
 15 | 
 16 | ### prior ######################################################################
 17 | def estimate_prior(dataset):
 18 |     prior = np.zeros( (dataset.n_classes,), dtype=np.float32 )
 19 |     for video in dataset.videos():
 20 |         for c in range(dataset.n_classes):
 21 |             prior[c] += dataset.features[video].shape[1] if c in dataset.action_set[video] else 0
 22 |     return prior / np.sum(prior)
 23 | 
 24 | 
 25 | ### loss based lengths #########################################################
 26 | def loss_based_lengths(dataset):
 27 |     # definition of objective function
 28 |     def objective(x, A, l):
 29 |         return 0.5 * np.sum( (np.dot(A, x) - l) ** 2 )
 30 |     # number of frames per video
 31 |     vid_lengths = np.array( [dataset.length(video) for video in dataset.videos()] )
 32 |     # binary data matrix: (n_videos x n_classes), A[video, class] = 1 iff class in action_set[video]
 33 |     A = np.zeros((len(dataset.videos()), dataset.n_classes))
 34 |     for i, video in enumerate(dataset.videos()):
 35 |         for c in dataset.action_set[video]:
 36 |            A[i, c] = 1
 37 |     # constraints: each mean length is at least 50 frames
 38 |     constr = [ lambda x, i=i : x[i] - 50 for i in range(dataset.n_classes) ]
 39 |     # optimize
 40 |     x0 = np.ones((dataset.n_classes)) * 450.0 # some initial value
 41 |     mean_lengths = scipy.optimize.fmin_cobyla(objective, x0, constr, args=(A, vid_lengths), consargs=(), maxfun=10000, disp=False)
 42 |     return mean_lengths
 43 | 
 44 | 
 45 | ### monte-carlo grammar ########################################################
 46 | def monte_carlo_grammar(dataset, mean_lengths, index2label, max_paths = 1000):
 47 |     monte_carlo_grammar = []
 48 |     sil_length = mean_lengths[0]
 49 |     while len(monte_carlo_grammar) < max_paths:
 50 |         for video in dataset.videos():
 51 |             action_set = dataset.action_set[video] - set([0]) # exclude SIL
 52 |             seq = []
 53 |             while sum( [ mean_lengths[label] for label in seq ] ) + 2 * sil_length < dataset.length(video):
 54 |                 seq.append( random.choice(list(action_set)) )
 55 |             if len(seq) == 0: # omit empty sequences
 56 |                 continue
 57 |             monte_carlo_grammar.append('SIL ' + ' '.join( [index2label[idx] for idx in seq] ) + ' SIL')
 58 |     random.shuffle(monte_carlo_grammar)
 59 |     return monte_carlo_grammar[0:max_paths]
 60 | 
 61 | 
 62 | ################################################################################
 63 | ### TRAINING                                                                 ###
 64 | ################################################################################
 65 | def train(label2index, index2label):
 66 |     # list of train videos
 67 |     with open('data/split1.train', 'r') as f:
 68 |         video_list = f.read().split('\n')[0:-1]
 69 |     # read train set
 70 |     print('read data...')
 71 |     dataset = Dataset('data', video_list, label2index)
 72 |     print('done')
 73 |     # train the network
 74 |     trainer = Trainer(dataset)
 75 |     trainer.train(batch_size = 512, n_epochs = 2, learning_rate = 0.1)
 76 |     trainer.save_model('results/net.model')
 77 |     # estimate prior, loss-based lengths, and monte-carlo grammar
 78 |     prior = estimate_prior(dataset)
 79 |     mean_lengths = loss_based_lengths(dataset)
 80 |     grammar = monte_carlo_grammar(dataset, mean_lengths, index2label)
 81 |     np.savetxt('results/prior', prior)
 82 |     np.savetxt('results/mean_lengths', mean_lengths, fmt='%.3f')
 83 |     with open('results/grammar', 'w') as f:
 84 |         f.write('\n'.join(grammar) + '\n')
 85 | 
 86 | 
 87 | ################################################################################
 88 | ### INFERENCE                                                                ###
 89 | ################################################################################
 90 | def infer(label2index, index2label, n_threads):
 91 |     # load models
 92 |     log_prior = np.log( np.loadtxt('results/prior') )
 93 |     grammar = PathGrammar('results/grammar', label2index)
 94 |     length_model = PoissonModel('results/mean_lengths', max_length = 2000)
 95 |     forwarder = Forwarder('results/net.model')
 96 |     # Viterbi decoder (max_hypotheses = n: at each time step, prune all hypotheses worse than the top n)
 97 |     viterbi_decoder = Viterbi(grammar, length_model, frame_sampling = 30, max_hypotheses = 50000 )
 98 |     # create list of test videos
 99 |     with open('data/split1.test', 'r') as f:
100 |         video_list = f.read().split('\n')[0:-1]
101 |     # forward each video
102 |     log_probs = dict()
103 |     queue = mp.Queue()
104 |     for video in video_list:
105 |         queue.put(video)
106 |         dataset = Dataset('data', [video], label2index)
107 |         log_probs[video] = forwarder.forward(dataset) - log_prior
108 |         log_probs[video] = log_probs[video] - np.max(log_probs[video])
109 |     # Viterbi decoding
110 |     procs = []
111 |     for i in range(n_threads):
112 |         p = mp.Process(target = decode, args = (queue, log_probs, viterbi_decoder, index2label) )
113 |         procs.append(p)
114 |         p.start()
115 |     for p in procs:
116 |         p.join()
117 | 
118 | 
119 | ### helper function for parallelized Viterbi decoding ##########################
def decode(queue, log_probs, decoder, index2label):
    """Decode videos taken from a queue until the queue reports to be empty.

    @queue: queue of video names (provides empty() and get(timeout))
    @log_probs: mapping from video name to the frame scores passed to the decoder
    @decoder: object whose decode(scores) returns (score, labels, segments)
    @index2label: mapping from label indices to label names

    For every video the result is written to 'results/<video>'.
    """
    # the parameter `queue` shadows the queue module inside this function, so
    # the exception class has to be imported under its own name
    from queue import Empty

    while not queue.empty():
        try:
            video = queue.get(timeout = 3)
        except Empty:
            # another worker took the last item; check the queue again
            continue
        score, labels, segments = decoder.decode( log_probs[video] )
        # save result
        with open('results/' + video, 'w') as f:
            f.write( '### Recognized sequence: ###\n' )
            f.write( ' '.join( [index2label[s.label] for s in segments] ) + '\n' )
            f.write( '### Score: ###\n' + str(score) + '\n')
            f.write( '### Frame level recognition: ###\n')
            f.write( ' '.join( [index2label[l] for l in labels] ) + '\n' )
134 | 
135 | 
136 | 
137 | ################################################################################
138 | ### MAIN                                                                     ###
139 | ################################################################################
if __name__ == '__main__':

    # read label2index mapping and index2label mapping; every line of
    # data/mapping.txt holds a label index followed by the label name
    label2index = dict()
    index2label = dict()
    with open('data/mapping.txt', 'r') as f:
        for line in f.read().split('\n')[0:-1]:
            fields = line.split()
            index = int(fields[0])
            name = fields[1]
            label2index[name] = index
            index2label[index] = name

    ### command line arguments ###
    ### mode: either training or inference
    ### --n_threads: number of threads to use for inference (not used in training mode)
    parser = argparse.ArgumentParser()
    parser.add_argument('mode', choices = ['training', 'inference'])
    parser.add_argument('--n_threads', default = 1, type = int)
    args = parser.parse_args()

    if args.mode != 'training':
        infer(label2index, index2label, args.n_threads)
    else:
        train(label2index, index2label)
163 | 
164 | 


--------------------------------------------------------------------------------