├── utils ├── __init__.py ├── dataset.py ├── length_model.py ├── grammar.py ├── network.py └── viterbi.py ├── LICENSE ├── README.md ├── eval.py └── main.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Alexander Richard 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Action Sets: Weakly Supervised Action Segmentation without Ordering Constraints 2 | Code for the paper Action Sets: Weakly Supervised Action Segmentation without Ordering Constraints 3 | 4 | ### Preparation: 5 | 6 | * download the data from https://uni-bonn.sciebo.de/s/wOxTiWe5kfeY4Vd 7 | * extract it so that you have the `data` folder in the same directory as `main.py` 8 | * create a `results` directory in the same directory where you also find `main.py`: `mkdir results` 9 | 10 | Requirements: Python3.x with the libraries numpy, pytorch (version 0.4.1), and scipy 11 | 12 | ### Training: 13 | 14 | Run `python main.py training` 15 | 16 | ### Inference: 17 | 18 | Run `python main.py inference --n_threads=NUM_THREADS`, where `NUM_THREADS` should be replaced with the number of parallel CPU threads you want to use for Viterbi decoding. 19 | 20 | ### Evaluation: 21 | 22 | In the inference step, recognition files are written to the `results` directory. The frame-level ground truth is available in `data/groundTruth`. Run `python eval.py --recog_dir=results --ground_truth_dir=data/groundTruth` to evaluate the frame accuracy of the trained model. 23 | 24 | ### Remarks: 25 | 26 | We provide a python/pytorch implementation for easy usage. In the paper, we used a faster, in-house C++ implementation, so results can be slightly different. Running the provided setup on split1 of Breakfast should lead to roughly 23% frame accuracy. 27 | 28 | If you use the code, please cite 29 | 30 | A. Richard, H. Kuehne, J. Gall: 31 | Action Sets: Weakly Supervised Action Segmentation without Ordering Constraints 32 | in IEEE Int. Conf. 
def recog_file(filename, ground_truth_path):
    """Count the frame errors of one recognition file against its ground truth.

    @filename: path of a recognition file; the frame-level recognition is
               expected as whitespace-separated labels in the 6-th line
    @ground_truth_path: directory holding '<basename of filename>.txt' with
                        one ground truth label per line
    @return: tuple (number of wrongly labelled frames, number of recognized frames)

    Raises IndexError if the recognition file has fewer than six lines or if
    more frames were recognized than ground truth labels exist.
    """
    # local import: eval.py itself only imports argparse, glob and re
    import os

    # read ground truth; basename/join instead of a '/'-regex so that bare
    # file names and non-POSIX separators are handled as well
    gt_file = os.path.join(ground_truth_path, os.path.basename(filename) + '.txt')
    with open(gt_file, 'r') as f:
        # splitlines keeps the last label even without a trailing newline
        ground_truth = f.read().splitlines()
    # read recognized sequence
    with open(filename, 'r') as f:
        recognized = f.read().split('\n')[5].split() # framelevel recognition is in 6-th line of file

    if len(recognized) > len(ground_truth):
        raise IndexError('%s: %d recognized frames but only %d ground truth labels in %s'
                         % (filename, len(recognized), len(ground_truth), gt_file))

    # zip stops after len(recognized) frames, surplus ground truth is ignored
    n_frame_errors = sum(1 for rec, gt in zip(recognized, ground_truth) if rec != gt)

    return n_frame_errors, len(recognized)
% len(filelist)) 41 | 42 | n_frames = 0 43 | n_errors = 0 44 | # loop over all recognition files and evaluate the frame error 45 | for filename in filelist: 46 | errors, frames = recog_file(filename, args.ground_truth_dir) 47 | n_errors += errors 48 | n_frames += frames 49 | 50 | # print frame accuracy (1.0 - frame error rate) 51 | print('frame accuracy: %f' % (1.0 - float(n_errors) / n_frames)) 52 | -------------------------------------------------------------------------------- /utils/dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2.7 2 | 3 | import numpy as np 4 | 5 | # reads the data 6 | # 7 | # @base_path: path to the data directory 8 | # @video_list: list of video names to load 9 | # @ label2index: mapping from labels (class names) to label indices 10 | # 11 | # self.features[video]: the feature array of the given video (dimension x frames) 12 | # self.action_set[video]: a set containing all occurring actions 13 | # self.ground_truth[video]: the ground truth labels of the video 14 | # self.input_dimension: dimension of video features 15 | # self.n_classes: number of classes 16 | class Dataset(object): 17 | 18 | def __init__(self, base_path, video_list, label2index): 19 | self.features = dict() 20 | self.action_set = dict() 21 | self.ground_truth = dict() 22 | # read features for each video 23 | for video in video_list: 24 | # video features 25 | self.features[video] = np.load(base_path + '/features/' + video + '.npy') 26 | # action set 27 | with open(base_path + '/transcripts/' + video + '.txt') as f: 28 | self.action_set[video] = set([ label2index[line] for line in f.read().split('\n')[0:-1] ]) 29 | # ground truth 30 | with open(base_path + '/groundTruth/' + video + '.txt') as f: 31 | self.ground_truth[video] = [ label2index[line] for line in f.read().split('\n')[0:-1] ] 32 | # set input dimension and number of classes 33 | self.input_dimension = list(self.features.values())[0].shape[0] 34 | 
class LengthModel(object):
    """Interface of a segment length model; this default one is uninformative."""

    # @return: the number of classes the model knows about
    def n_classes(self):
        return 0

    # @return: log score of a segment of the given length and label
    def score(self, length, label):
        return 0.0

    # @return: the maximal segment length the model allows
    def max_length(self):
        return np.inf


class PoissonModel(LengthModel):
    """Poisson length model with one mean length per class.

    @model_file: anything np.loadtxt can read, containing one mean length per class
    @max_length: longest segment length that still gets a finite score
    @renormalize: if True, subtract a per-class normalization term (self.norms)
                  from every table entry

    self.poisson[l, c] holds the precomputed log score of length l for class c,
    for l = 0..max_length (row 0 is -inf).
    """

    def __init__(self, model_file, max_length = 5000, renormalize = True):
        super(PoissonModel, self).__init__()
        # ndmin=1: a model file with a single class would otherwise give a 0-d array
        self.mean_lengths = np.loadtxt(model_file, ndmin=1)
        self.num_classes = self.mean_lengths.shape[0]
        self.max_len = max_length
        # max_length + 1 rows so that length == max_length is a valid index
        self.poisson = np.zeros((max_length + 1, self.num_classes))

        # precompute normalizations for mean length model
        self.norms = np.zeros(self.mean_lengths.shape)
        if renormalize:
            rounded = np.round(self.mean_lengths)
            self.norms = rounded * np.log(rounded) - rounded
            for c in range(len(self.mean_lengths)):
                logFak = 0
                for k in range(2, int(self.mean_lengths[c])+1):
                    logFak += np.log(k)
                self.norms[c] = self.norms[c] - logFak
        # precompute Poisson distribution
        self.poisson[0, :] = -np.inf # length zero can not happen
        logFak = 0
        for l in range(1, self.max_len + 1):
            logFak += np.log(l)
            self.poisson[l, :] = l * np.log(self.mean_lengths) - self.mean_lengths - logFak - self.norms

    def n_classes(self):
        return self.num_classes

    # @return: precomputed log score, -inf for lengths beyond max_length
    def score(self, length, label):
        if length > self.max_len:
            return -np.inf
        else:
            return self.poisson[length, label]

    # overrides LengthModel.max_length so that users of the base interface
    # see the configured limit instead of the base class' np.inf
    def max_length(self):
        return self.max_len

    # former (misspelled) name of max_length, kept for backward compatibility
    def max_lengths(self):
        return self.max_length()
# the neural network
class Net(nn.Module):
    """One shared hidden layer followed by an independent 2-way head per class.

    @input_dim: dimension of a single input feature vector
    @n_classes: number of classes, i.e. number of binary output heads
    """

    def __init__(self, input_dim, n_classes):
        super(Net, self).__init__()
        self.n_classes = n_classes
        self.fc = nn.Linear(input_dim, 256)
        # the heads are applied in parallel, not chained, so ModuleList is the
        # right container; parameter names ('out_fc.<c>.weight' / '.bias') are
        # the same as with nn.Sequential, saved models stay loadable
        self.out_fc = nn.ModuleList( [ nn.Linear(256, 2) for _ in range(n_classes) ] )

    # @x: batch of feature vectors (batch x input_dim)
    # @return: list with one (batch x 2) tensor of log-softmax outputs per class
    def forward(self, x):
        x = nn.functional.relu(self.fc(x))
        return [ nn.functional.log_softmax(head(x), dim = 1) for head in self.out_fc ]
# Viterbi decoding
class Viterbi(object):
    """Viterbi decoder over hypotheses keyed by (context..., label, length).

    A hypothesis key is the tuple of all previous labels (starting with the
    grammar's start symbol) followed by the current label and the number of
    frames spent in the current label so far.
    """

    ### helper structure ###
    class TracebackNode(object):
        # one decoded time step; predecessor links to the previous time step,
        # boundary marks the first time step of a new segment
        def __init__(self, label, predecessor, boundary = False):
            self.label = label
            self.predecessor = predecessor
            self.boundary = boundary

    ### helper structure ###
    class HypDict(dict):
        class Hypothesis(object):
            def __init__(self, score, traceback):
                self.score = score
                self.traceback = traceback
        # keeps, per key, the hypothesis with the best score (later one wins ties);
        # note that this deliberately replaces dict.update with another signature
        def update(self, key, score, traceback):
            if (key not in self) or (self[key].score <= score):
                self[key] = self.Hypothesis(score, traceback)

    # @grammar: the grammar to use, must inherit from class Grammar
    # @length_model: the length model to use, must inherit from class LengthModel
    # @frame_sampling: generate hypotheses every frame_sampling frames
    # @max_hypotheses: maximal number of hypotheses. Smaller values result in stronger pruning
    def __init__(self, grammar, length_model, frame_sampling = 1, max_hypotheses = np.inf):
        self.grammar = grammar
        self.length_model = length_model
        self.frame_sampling = frame_sampling
        self.max_hypotheses = max_hypotheses

    # Viterbi decoding of a sequence
    # @log_frame_probs: logarithmized frame probabilities
    #                   (usually log(network_output) - log(prior) - max_val, where max_val ensures negativity of all log scores)
    # @return: the score of the best sequence,
    #          the corresponding framewise labels (len(labels) = len(sequence))
    #          and the inferred segments in the form (label, length)
    # raises ValueError if the sequence is shorter than frame_sampling frames,
    # RuntimeError if no hypothesis is left at the end of the sequence
    def decode(self, log_frame_probs):
        assert log_frame_probs.shape[1] == self.grammar.n_classes()
        if log_frame_probs.shape[0] < self.frame_sampling:
            raise ValueError('sequence has %d frames, at least frame_sampling = %d are required'
                             % (log_frame_probs.shape[0], self.frame_sampling))
        frame_scores = np.cumsum(log_frame_probs, axis=0) # cumulative frame scores allow for quick lookup if frame_sampling > 1
        # create initial hypotheses
        hyps = self.init_decoding(frame_scores)
        # decode each following time step
        for t in range(2 * self.frame_sampling - 1, frame_scores.shape[0], self.frame_sampling):
            hyps = self.decode_frame(t, hyps, frame_scores)
            self.prune(hyps)
        # transition to end symbol
        final_hyp = self.finalize_decoding(hyps)
        if final_hyp.traceback is None:
            raise RuntimeError('Viterbi decoding failed: no hypothesis left at the end of the sequence')
        labels, segments = self.traceback(final_hyp, frame_scores.shape[0])
        return final_hyp.score, labels, segments


    ### helper functions ###
    # score of the frame_sampling frames ending at t, read from the cumulative scores
    def frame_score(self, frame_scores, t, label):
        if t >= self.frame_sampling:
            return frame_scores[t, label] - frame_scores[t - self.frame_sampling, label]
        else:
            return frame_scores[t, label]

    # removes the worst scoring hypotheses in place until max_hypotheses are left
    def prune(self, hyps):
        if len(hyps) > self.max_hypotheses:
            n_keep = int(self.max_hypotheses) # allows finite float values such as 5e4
            ranked = sorted( [ (hyps[key].score, key) for key in hyps ] )
            for _, key in ranked[0 : len(ranked) - n_keep]:
                del hyps[key]

    # one hypothesis per label the grammar allows directly after the start symbol
    def init_decoding(self, frame_scores):
        hyps = self.HypDict()
        context = (self.grammar.start_symbol(),)
        for label in self.grammar.possible_successors(context):
            key = context + (label, self.frame_sampling)
            score = self.grammar.score(context, label) + self.frame_score(frame_scores, self.frame_sampling - 1, label)
            hyps.update(key, score, self.TracebackNode(label, None, boundary = True))
        return hyps

    # expands every hypothesis by the frames ending at t
    def decode_frame(self, t, old_hyp, frame_scores):
        new_hyp = self.HypDict()
        for key, hyp in old_hyp.items():
            context, label, length = key[0:-2], key[-2], key[-1]
            # stay in the same label...
            if length + self.frame_sampling <= self.length_model.max_length():
                new_key = context + (label, length + self.frame_sampling)
                score = hyp.score + self.frame_score(frame_scores, t, label)
                new_hyp.update(new_key, score, self.TracebackNode(label, hyp.traceback, boundary = False))
            # ... or go to the next label
            context = context + (label,)
            for new_label in self.grammar.possible_successors(context):
                if new_label == self.grammar.end_symbol():
                    continue
                new_key = context + (new_label, self.frame_sampling)
                # the frames ending at t already belong to the new segment (its
                # length starts at frame_sampling and the traceback node carries
                # new_label), so they are scored with new_label; the finished
                # segment contributes its length score
                score = hyp.score + self.frame_score(frame_scores, t, new_label) + self.length_model.score(length, label) + self.grammar.score(context, new_label)
                new_hyp.update(new_key, score, self.TracebackNode(new_label, hyp.traceback, boundary = True))
        # return new hypotheses
        return new_hyp

    # closes the last segment of every hypothesis and picks the best one;
    # the returned hypothesis has traceback None if old_hyp is empty
    def finalize_decoding(self, old_hyp):
        final_hyp = self.HypDict.Hypothesis(-np.inf, None)
        for key, hyp in old_hyp.items():
            context, label, length = key[0:-2], key[-2], key[-1]
            context = context + (label,)
            score = hyp.score + self.length_model.score(length, label) + self.grammar.score(context, self.grammar.end_symbol())
            if score >= final_hyp.score:
                final_hyp.score, final_hyp.traceback = score, hyp.traceback
        # return final hypothesis
        return final_hyp

    # follows the traceback from the last time step to the first one
    # @return: framewise labels and segments (objects with .label and .length), both in temporal order
    def traceback(self, hyp, n_frames):
        class Segment(object):
            def __init__(self, label):
                self.label, self.length = label, 0
        traceback = hyp.traceback
        labels = []
        segments = [Segment(traceback.label)]
        while traceback is not None:
            segments[-1].length += self.frame_sampling
            labels += [traceback.label] * self.frame_sampling
            if traceback.boundary and traceback.predecessor is not None:
                segments.append( Segment(traceback.predecessor.label) )
            traceback = traceback.predecessor
        segments[0].length += n_frames - len(labels) # append length of missing frames
        labels += [hyp.traceback.label] * (n_frames - len(labels)) # append labels for missing frames
        return list(reversed(labels)), list(reversed(segments))
### monte-carlo grammar ########################################################
def monte_carlo_grammar(dataset, mean_lengths, index2label, max_paths = 1000):
    """Sample random label sequences that roughly fill the training videos.

    For each video, labels are drawn uniformly from its action set (without
    SIL, index 0) until the summed mean lengths plus two SIL segments reach the
    video length. Passes over the dataset are repeated until max_paths
    sequences exist.

    @dataset: object providing videos(), length(video) and action_set[video]
    @mean_lengths: mean length per label index, mean_lengths[0] belongs to SIL
    @index2label: mapping from label indices to label names
    @max_paths: maximal number of sequences to return
    @return: shuffled list of at most max_paths strings 'SIL <labels...> SIL';
             shorter (possibly empty) if no video can produce a sequence
    """
    paths = []
    sil_length = mean_lengths[0]
    while len(paths) < max_paths:
        n_before = len(paths)
        for video in dataset.videos():
            candidates = list(dataset.action_set[video] - set([0])) # exclude SIL
            if not candidates: # nothing to sample from (random.choice would raise)
                continue
            video_length = dataset.length(video)
            seq = []
            seq_length = 0 # running sum of the mean lengths of seq
            # NOTE: assumes positive mean lengths, otherwise this loop does not terminate
            while seq_length + 2 * sil_length < video_length:
                label = random.choice(candidates)
                seq.append(label)
                seq_length += mean_lengths[label]
            if len(seq) == 0: # omit empty sequences
                continue
            paths.append('SIL ' + ' '.join( [index2label[idx] for idx in seq] ) + ' SIL')
        if len(paths) == n_before:
            # a full pass over the dataset produced nothing, further passes
            # would not either: stop instead of looping forever
            break
    random.shuffle(paths)
    return paths[0:max_paths]
label2index) 72 | print('done') 73 | # train the network 74 | trainer = Trainer(dataset) 75 | trainer.train(batch_size = 512, n_epochs = 2, learning_rate = 0.1) 76 | trainer.save_model('results/net.model') 77 | # estimate prior, loss-based lengths, and monte-carlo grammar 78 | prior = estimate_prior(dataset) 79 | mean_lengths = loss_based_lengths(dataset) 80 | grammar = monte_carlo_grammar(dataset, mean_lengths, index2label) 81 | np.savetxt('results/prior', prior) 82 | np.savetxt('results/mean_lengths', mean_lengths, fmt='%.3f') 83 | with open('results/grammar', 'w') as f: 84 | f.write('\n'.join(grammar) + '\n') 85 | 86 | 87 | ################################################################################ 88 | ### INFERENCE ### 89 | ################################################################################ 90 | def infer(label2index, index2label, n_threads): 91 | # load models 92 | log_prior = np.log( np.loadtxt('results/prior') ) 93 | grammar = PathGrammar('results/grammar', label2index) 94 | length_model = PoissonModel('results/mean_lengths', max_length = 2000) 95 | forwarder = Forwarder('results/net.model') 96 | # Viterbi decoder (max_hypotheses = n: at each time step, prune all hypotheses worse than the top n) 97 | viterbi_decoder = Viterbi(grammar, length_model, frame_sampling = 30, max_hypotheses = 50000 ) 98 | # create list of test videos 99 | with open('data/split1.test', 'r') as f: 100 | video_list = f.read().split('\n')[0:-1] 101 | # forward each video 102 | log_probs = dict() 103 | queue = mp.Queue() 104 | for video in video_list: 105 | queue.put(video) 106 | dataset = Dataset('data', [video], label2index) 107 | log_probs[video] = forwarder.forward(dataset) - log_prior 108 | log_probs[video] = log_probs[video] - np.max(log_probs[video]) 109 | # Viterbi decoding 110 | procs = [] 111 | for i in range(n_threads): 112 | p = mp.Process(target = decode, args = (queue, log_probs, viterbi_decoder, index2label) ) 113 | procs.append(p) 114 | p.start() 115 | 
### helper function for parallelized Viterbi decoding ##########################
def decode(queue, log_probs, decoder, index2label):
    """Worker loop: decode videos taken from the queue and write the results.

    @queue: queue of video names, must provide empty() and get(timeout)
    @log_probs: mapping from video name to the frame scores passed to the decoder
    @decoder: object whose decode(log_probs) returns (score, labels, segments)
    @index2label: mapping from label indices to label names

    For every video, a file 'results/<video>' is written; the frame level
    recognition is in its 6-th line.
    """
    # the parameter 'queue' shadows the module of the same name inside this
    # function, so the exception class is imported under its own name
    from queue import Empty

    while not queue.empty():
        try:
            video = queue.get(timeout = 3)
        except Empty:
            # another worker took the last item between empty() and get()
            continue
        score, labels, segments = decoder.decode( log_probs[video] )
        # save result
        with open('results/' + video, 'w') as f:
            f.write( '### Recognized sequence: ###\n' )
            f.write( ' '.join( [index2label[s.label] for s in segments] ) + '\n' )
            f.write( '### Score: ###\n' + str(score) + '\n')
            f.write( '### Frame level recognition: ###\n')
            f.write( ' '.join( [index2label[l] for l in labels] ) + '\n' )