├── .gitattributes
├── .gitignore
├── README.md
├── combine_rgb_flow.py
├── pytorch
│   ├── .gitignore
│   ├── LICENSE
│   ├── README.md
│   ├── checkpoints.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── charadesflow.py
│   │   ├── charadesrgb.py
│   │   ├── fake.py
│   │   └── transforms.py
│   ├── exp
│   │   ├── flownet.py
│   │   ├── flownet_test.py
│   │   ├── rgbnet.py
│   │   ├── rgbnet_inception.py
│   │   ├── rgbnet_resnet.py
│   │   └── rgbnet_test.py
│   ├── get_alreadytrained.sh
│   ├── main.py
│   ├── models
│   │   ├── __init__.py
│   │   └── vgg16flow.py
│   ├── opts.py
│   ├── train.py
│   └── utils
│       ├── __init__.py
│       ├── map.py
│       └── tee.py
└── torch
    ├── INSTALL.md
    ├── LICENSE
    ├── README.md
    ├── checkpoints.lua
    ├── dataloader.lua
    ├── datasets
    │   ├── README.md
    │   ├── charades-gen.lua
    │   ├── charades.lua
    │   ├── charadesflow-gen.lua
    │   ├── charadesflow.lua
    │   ├── charadessync-gen.lua
    │   ├── charadessync.lua
    │   ├── charadessyncflow-gen.lua
    │   ├── charadessyncflow.lua
    │   ├── cifar10-gen.lua
    │   ├── cifar10.lua
    │   ├── imagenet-gen.lua
    │   ├── imagenet.lua
    │   ├── init.lua
    │   └── transforms.lua
    ├── exp
    │   ├── flownet.lua
    │   ├── flownet_localize.lua
    │   ├── flownet_resume.lua
    │   ├── flownet_test.lua
    │   ├── lstmflownet.lua
    │   ├── lstmrgbnet.lua
    │   ├── rgbnet.lua
    │   ├── rgbnet_localize.lua
    │   ├── rgbnet_resume.lua
    │   └── rgbnet_test.lua
    ├── get_alreadytrained.sh
    ├── get_alreadytrained_lstm.sh
    ├── get_models.sh
    ├── layers
    │   └── CrossEntropyCriterion.lua
    ├── main.lua
    ├── models
    │   ├── init.lua
    │   ├── preresnet.lua
    │   ├── resnet.lua
    │   ├── vgg16.lua
    │   ├── vgg16flow.lua
    │   ├── vgg16lstm.lua
    │   └── vgg16lstmflow.lua
    ├── opts.lua
    └── train.lua

/.gitattributes:
--------------------------------------------------------------------------------
*.py text
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
gen/
libnccl.so
model_best.t7
checkpoints
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Charades Starter Code for Activity Recognition in Torch and PyTorch

Contributor: Gunnar Atli Sigurdsson

**New:** extension of this framework to the deep CRF model on Charades for *Asynchronous Temporal Fields for Action Recognition*: https://github.com/gsig/temporal-fields

* **New:** This code implements a Two-Stream network in PyTorch
* This code implements a Two-Stream network in Torch
* This code implements a Two-Stream+LSTM network in Torch

See [pytorch/](pytorch/) and [torch/](torch/) for the code repositories.

The code replicates the 'Two-Stream Extended' and 'Two-Stream+LSTM' baselines found in:
```
@inproceedings{sigurdsson2017asynchronous,
author = {Gunnar A. Sigurdsson and Santosh Divvala and Ali Farhadi and Abhinav Gupta},
title = {Asynchronous Temporal Fields for Action Recognition},
booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2017},
pdf = {http://arxiv.org/pdf/1612.06371.pdf},
code = {https://github.com/gsig/temporal-fields},
}
```
which is in turn based on "Two-stream convolutional networks for action recognition in videos" by Simonyan and Zisserman, and "Beyond Short Snippets: Deep Networks for Video Classification" by Joe Yue-Hei Ng et al.

Combining the predictions (submission files) of those models using combine_rgb_flow.py
yields a final classification accuracy of 18.9% mAP (Two-Stream) and 19.8% (LSTM) on Charades (evaluated with charades_v1_classify.m)
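combine_rgb_flow.py reads an RGB and a Flow submission file and writes the fused file to stdout, e.g. `python combine_rgb_flow.py rgb.txt flow.txt > twostream.txt` (file names here are illustrative). At its core the fusion is just a weighted average of per-class scores; a minimal sketch:

```python
# Schematic of the late fusion performed by combine_rgb_flow.py: a weighted
# average of the 157 per-class scores for each video. The scores below are
# random placeholders; the script also contains commented-out score-normalized
# and geometric-mean variants.
import numpy as np

w = [0.5, 0.5]                       # RGB and Flow weights, normalized to sum to one
rgb_scores = np.random.rand(157)     # placeholder per-class scores for one video
flow_scores = np.random.rand(157)    # placeholder per-class scores for one video
fused = rgb_scores*w[0] + flow_scores*w[1]
```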
## Technical Overview:

The code is organized to train a two-stream network: two independent networks are trained, one RGB network and one Flow network.
This code parses the training data into pairs of an image (or flow), and a label for a single activity class. This forms a softmax training setup like a standard CNN. The network is a VGG-16 network. For RGB it is pretrained on ImageNet, and for Flow it is pretrained on UCF101. The pretrained networks can be downloaded with the scripts in this directory.
For testing, the network uses a batch size of 25, scores all images, and pools the output to make a classification prediction, or uses all 25 outputs for localization. The pooling step is sketched below.
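A minimal sketch of the video-level pooling, mirroring Trainer.validate_video in pytorch/train.py (`model` and `frames` are placeholders here):

```python
# Sketch of video-level test-time pooling: score 25 frames from one video
# and mean-pool their softmax outputs (keep the per-frame outputs instead
# when localization is wanted).
import torch

def video_level_prediction(model, frames):
    """frames: a (25, channels, height, width) batch sampled evenly from one video."""
    output = torch.nn.Softmax(dim=1)(model(frames))  # (25, 157) per-frame probabilities
    return output.mean(dim=0)                        # one 157-dim video-level prediction
```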
--------------------------------------------------------------------------------
/combine_rgb_flow.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Script for combining the submission files for the RGB and Flow networks
#
# Contributor: Gunnar Atli Sigurdsson

import numpy as np
import sys
import pdb
from itertools import groupby

rgbfile = sys.argv[1]
flowfile = sys.argv[2]
w = [0.5,0.5]
nclasses = 157

def loadfile(path):
    with open(path) as f:
        lines = [x.strip().split(' ') for x in f.readlines()]
    localization = len(lines[0]) == nclasses+2
    if localization:
        data = [(x[0]+' '+x[1],np.array([float(y) for y in x[2:]])) for x in lines]
    else:
        data = [(x[0],np.array([float(y) for y in x[1:]])) for x in lines]
    return data

rgb = loadfile(rgbfile)
flow = loadfile(flowfile)

rgbdict = dict(rgb)
flowdict = dict(flow)

keys = list(set(rgbdict.keys()+flowdict.keys()))
w = [x/sum(w) for x in w]

def normme(x):
    x = x-np.mean(x)
    x = x/(0.00001+np.std(x))
    return x

N = 157
def lookup(d,key):
    if key in d:
        return d[key]
    else:
        sys.stderr.write('error ' + key + '\n')
        return np.zeros((nclasses,))

for id0 in keys:
    r = lookup(rgbdict,id0)
    f = lookup(flowdict,id0)
    out = r*w[0]+f*w[1] #unnormalized combination
    #out = normme(r)*w[0]+normme(f)*w[1] #normalize first
    #out = np.exp(np.log(r)*w[0]+np.log(f)*w[1]) #weighted geometric mean
    out = [str(x) for x in out]
    print('{} {}'.format(id0,' '.join(out)))
--------------------------------------------------------------------------------
/pytorch/.gitignore:
--------------------------------------------------------------------------------
gen/
checkpoints/
*.pyc
*.swp
--------------------------------------------------------------------------------
/pytorch/README.md:
--------------------------------------------------------------------------------
## PyTorch Starter Code for Activity Classification and Localization on Charades

Contributor: Gunnar Atli Sigurdsson

Extension of this framework to the deep CRF model on Charades for *Asynchronous Temporal Fields for Action Recognition*: https://github.com/gsig/temporal-fields

* This code implements a Two-Stream network in PyTorch

The code replicates the 'Two-Stream Extended' and 'Two-Stream+LSTM' baselines found in:
```
@inproceedings{sigurdsson2017asynchronous,
author = {Gunnar A. Sigurdsson and Santosh Divvala and Ali Farhadi and Abhinav Gupta},
title = {Asynchronous Temporal Fields for Action Recognition},
booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2017},
pdf = {http://arxiv.org/pdf/1612.06371.pdf},
code = {https://github.com/gsig/temporal-fields},
}
```
which is in turn based on "Two-stream convolutional networks for action recognition in videos" by Simonyan and Zisserman, and "Beyond Short Snippets: Deep Networks for Video Classification" by Joe Yue-Hei Ng et al.

Combining the predictions (submission files) of those models using combine_rgb_flow.py
yields a final classification accuracy of 20.6% mAP (Two-Stream) on Charades (evaluated with charades_v1_classify.m)


## Technical Overview:

The code is organized to train a two-stream network: two independent networks are trained, one RGB network and one Flow network.
This code parses the training data into pairs of an image (or flow), and a label for a single activity class. This forms a softmax training setup like a standard CNN. The network is a VGG-16 network. For RGB it is pretrained on ImageNet, and for Flow it is pretrained on UCF101. The pretrained networks can be downloaded with the scripts in this directory.
For testing, the network uses a batch size of 25, scores all images, and pools the output to make a classification prediction, or uses all 25 outputs for localization.

All outputs are stored in the cache-dir. This includes epoch*.txt which is the classification output, and localize*.txt which is the localization output (note that you need to specify that you want this in the options).
Those output files can be combined after training with the python scripts in this directory.
All output files can be scored with the official MATLAB evaluation script provided with the Charades dataset.

Requirements:
* Python 2.7
* PyTorch


## Steps to train your own two-stream network on Charades:

1. Download the Charades Annotations (allenai.org/plato/charades/)
2. Download the Charades RGB and/or Flow frames (allenai.org/plato/charades/)
3. Duplicate and edit one of the experiment files under exp/ with appropriate parameters. For additional parameters, see opts.py
4. Run an experiment by calling python exp/rgbnet.py where rgbnet.py is your experiment file
5. The checkpoints/logfiles/outputs are stored in your specified cache directory.
6. Combine one RGB output file and one Flow output file with combine_rgb_flow.py to generate a submission file
7. Evaluate the submission file with the Charades_v1_classify.m or Charades_v1_localize.m evaluation scripts (a quick Python-side approximation is sketched after this list)
8. Build on the code, cite our papers, and say hi to us at CVPR.

Good luck!
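The official numbers in step 7 come from the MATLAB scripts; for a rough sanity check one can reuse this repository's own helpers along the following lines. This is a sketch: the file paths are examples, and it uses the approximate evaluation in utils/map.py rather than the official script.

```python
# Approximate classification mAP for a submission file, using this repo's
# own helpers; run from the pytorch/ directory. Paths are illustrative.
import numpy as np
from utils.map import charades_map
from datasets.charadesrgb import parse_charades_csv, cls2int

labels = parse_charades_csv('./Charades_v1_test.csv')
ids, scores = [], []
with open('./cache/rgbnet/epoch_007.txt') as f:   # a classification output file
    for line in f:
        parts = line.strip().split(' ')
        ids.append(parts[0])
        scores.append([float(x) for x in parts[1:]])

gt = np.zeros((len(ids), 157))                    # binary video-level ground truth
for i, vid in enumerate(ids):
    for action in labels[vid]:
        gt[i, cls2int(action['class'])] = 1

mAP, _, _ = charades_map(np.array(scores), gt)
print('approximate mAP: {:.3f}'.format(mAP))
```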
## Pretrained networks:

While the RGB net can be trained in a day on a modern GPU, the flow net requires nontrivial IO and time to converge. For your convenience we provide RGB and Flow models already trained on Charades using exp/rgbnet.py and exp/flownet.py

https://www.dropbox.com/s/p457h2ifi6v1qdz/twostream_rgb.pth.tar?dl=1
https://www.dropbox.com/s/m1hkeiwjtndt26z/twostream_flow.pth?dl=1

* The rgb model was obtained after 7 epochs (epochSize=0.1)
* The rgb model has a classification accuracy of 18.6% mAP (evaluated with charades_v1_classify.m)
* The flow model was converted directly from the Charades Torch codebase (../torch/)
* The flow model has a classification accuracy of 15.4% mAP (via charades_v1_classify.m)

Combining the predictions (submission files) of those models using combine_rgb_flow.py
yields a final classification accuracy of 20.6% mAP (evaluated with charades_v1_classify.m)

To fine-tune those models, or run experiments, please see exp/rgbnet_test.py and exp/flownet_test.py; a sketch of loading a checkpoint by hand follows below.


Additionally we include rgb-streams fine-tuned from resnet and inception pretrained on ImageNet:
* ResNet-152 (exp/rgbnet_resnet.py): 22.8% mAP (via charades_v1_classify.m)
* https://www.dropbox.com/s/iy9fmk0r1a3edoz/resnet_rgb.pth.tar?dl=1
* Inception_v3 (exp/rgbnet_inception.py): 22.7% mAP (via charades_v1_classify.m)
* https://www.dropbox.com/s/whxikophm7xqchb/inception_rgb.pth.tar?dl=1


Charades submission files are available for multiple baselines at https://github.com/gsig/temporal-fields
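In the experiment files this loading is handled by checkpoints.load via the --resume flag; for reference, a hand-rolled sketch of restoring the released RGB checkpoint might look as follows. The local path and CPU loading are assumptions, and the 'module.' stripping plays the same role as ordered_load_state in pytorch/checkpoints.py.

```python
# Sketch: manually loading twostream_rgb.pth.tar into a VGG-16 whose last
# layer has been replaced with a 157-way Charades classifier, as done by
# create_model in pytorch/models/__init__.py.
import torch
import torch.nn as nn
import torchvision.models as tmodels

model = tmodels.vgg16()
newcls = list(model.classifier.children())
newcls = newcls[:-1] + [nn.Linear(newcls[-1].in_features, 157)]
model.classifier = nn.Sequential(*newcls)

chkpoint = torch.load('twostream_rgb.pth.tar', map_location='cpu')  # example path
state = chkpoint['state_dict'] if 'state_dict' in chkpoint else chkpoint
state = {k.replace('module.', ''): v for k, v in state.items()}  # undo DataParallel naming
model.load_state_dict(state)
model.eval()
```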
14 | """ 15 | try: 16 | model.load_state_dict(chkpoint) 17 | except Exception: # assume order is the same, and use new labels 18 | print('keys do not match model, trying to align') 19 | modelkeys = model.state_dict().keys() 20 | fixed = OrderedDict([(z,y) 21 | for (x,y),z in zip(chkpoint.items(), modelkeys)]) 22 | model.load_state_dict(fixed) 23 | 24 | 25 | def load(args, model, optimizer): 26 | if args.resume: 27 | if os.path.isfile(args.resume): 28 | print("=> loading checkpoint '{}'".format(args.resume)) 29 | chkpoint = torch.load(args.resume) 30 | if isinstance(chkpoint, dict) and 'state_dict' in chkpoint: 31 | args.start_epoch = chkpoint['epoch'] 32 | mAP = chkpoint['mAP'] 33 | ordered_load_state(model, chkpoint['state_dict']) 34 | optimizer.load_state_dict(chkpoint['optimizer']) 35 | print("=> loaded checkpoint '{}' (epoch {})" 36 | .format(args.resume, chkpoint['epoch'])) 37 | return mAP 38 | else: 39 | ordered_load_state(model, chkpoint) 40 | print("=> loaded checkpoint '{}' (just weights)" 41 | .format(args.resume)) 42 | return 0 43 | else: 44 | raise ValueError("no checkpoint found at '{}'".format(args.resume)) 45 | return 0 46 | 47 | 48 | def score_file(scores, filename): 49 | with open(filename, 'w') as f: 50 | for key, val in sorted(scores.items()): 51 | f.write('{} {}\n'.format(key, val)) 52 | 53 | 54 | def save(epoch, args, model, optimizer, is_best, scores): 55 | state = { 56 | 'epoch': epoch + 1, 57 | 'arch': args.arch, 58 | 'state_dict': model.state_dict(), 59 | 'mAP': scores['mAP'], 60 | 'optimizer': optimizer.state_dict(), 61 | } 62 | filename = "{}/model.pth.tar".format(args.cache) 63 | score_file(scores, "{}/model_{:03d}.txt".format(args.cache, epoch+1)) 64 | torch.save(state, filename) 65 | if is_best: 66 | bestname = "{}/model_best.pth.tar".format(args.cache) 67 | score_file(scores, "{}/model_best.txt".format(args.cache, epoch+1)) 68 | shutil.copyfile(filename, bestname) 69 | -------------------------------------------------------------------------------- /pytorch/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """ Initilize the datasets module 2 | New datasets can be added with python scripts under datasets/ 3 | """ 4 | import torch 5 | import torch.utils.data 6 | import torch.utils.data.distributed 7 | import importlib 8 | 9 | 10 | def get_dataset(args): 11 | dataset = importlib.import_module('.'+args.dataset, package='datasets') 12 | train_dataset, val_dataset, valvideo_dataset = dataset.get(args) 13 | 14 | if args.distributed: 15 | train_sampler = torch.utils.data.distributed.DistributedSampler( 16 | train_dataset) 17 | else: 18 | train_sampler = None 19 | 20 | train_loader = torch.utils.data.DataLoader( 21 | train_dataset, batch_size=args.batch_size, shuffle=( 22 | train_sampler is None), 23 | num_workers=args.workers, pin_memory=True, sampler=train_sampler) 24 | 25 | val_loader = torch.utils.data.DataLoader( 26 | val_dataset, batch_size=args.batch_size, shuffle=True, 27 | num_workers=args.workers, pin_memory=True) 28 | 29 | valvideo_loader = torch.utils.data.DataLoader( 30 | valvideo_dataset, batch_size=25, shuffle=False, 31 | num_workers=args.workers, pin_memory=True) 32 | 33 | return train_loader, val_loader, valvideo_loader 34 | -------------------------------------------------------------------------------- /pytorch/datasets/charadesflow.py: -------------------------------------------------------------------------------- 1 | """ Dataset loader for the Charades dataset """ 2 | import torch 3 | 
import torchvision.transforms as transforms 4 | import transforms as arraytransforms 5 | from charadesrgb import Charades, cls2int 6 | from PIL import Image 7 | import numpy as np 8 | from glob import glob 9 | 10 | 11 | def pil_loader(path): 12 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 13 | with open(path, 'rb') as f: 14 | img = Image.open(f) 15 | return img.convert('L') 16 | 17 | 18 | def accimage_loader(path): 19 | import accimage 20 | try: 21 | return accimage.Image(path) 22 | except IOError: 23 | # Potentially a decoding problem, fall back to PIL.Image 24 | return pil_loader(path) 25 | 26 | 27 | def default_loader(path): 28 | from torchvision import get_image_backend 29 | if get_image_backend() == 'accimage': 30 | return accimage_loader(path) 31 | else: 32 | return pil_loader(path) 33 | 34 | 35 | class Charadesflow(Charades): 36 | def __init__(self, *args, **kwargs): 37 | super(Charadesflow,self).__init__(*args, **kwargs) 38 | 39 | def prepare(self, path, labels, split): 40 | FPS, GAP, testGAP = 24, 4, 25 41 | STACK=10 42 | datadir = path 43 | image_paths, targets, ids = [], [], [] 44 | 45 | for i, (vid, label) in enumerate(labels.iteritems()): 46 | iddir = datadir + '/' + vid 47 | lines = glob(iddir+'/*.jpg') 48 | n = len(lines)/2 49 | if i % 100 == 0: 50 | print("{} {}".format(i, iddir)) 51 | if n == 0: 52 | continue 53 | if split == 'val_video': 54 | target = torch.IntTensor(157).zero_() 55 | for x in label: 56 | target[cls2int(x['class'])] = 1 57 | spacing = np.linspace(0, n-1-STACK-1, testGAP) # fit 10 optical flow pairs 58 | for loc in spacing: 59 | impath = '{}/{}-{:06d}x.jpg'.format( 60 | iddir, vid, int(np.floor(loc))+1) 61 | image_paths.append(impath) 62 | targets.append(target) 63 | ids.append(vid) 64 | else: 65 | for x in label: 66 | for ii in range(0, n-1, GAP): 67 | if x['start'] < ii/float(FPS) < x['end']: 68 | if ii>n-1-STACK-1: continue # fit 10 optical flow pairs 69 | impath = '{}/{}-{:06d}x.jpg'.format( 70 | iddir, vid, ii+1) 71 | image_paths.append(impath) 72 | targets.append(cls2int(x['class'])) 73 | ids.append(vid) 74 | return {'image_paths': image_paths, 'targets': targets, 'ids': ids} 75 | 76 | def __getitem__(self, index): 77 | """ 78 | Args: 79 | index (int): Index 80 | Returns: 81 | tuple: (image, target) where target is class_index of the target class. 82 | """ 83 | path = self.data['image_paths'][index] 84 | base = path[:-5-6] 85 | framenr = int(path[-5-6:-5]) 86 | assert '{}{:06d}x.jpg'.format(base,framenr) == path 87 | STACK=10 88 | img = [] 89 | for i in range(STACK): 90 | x = '{}{:06d}x.jpg'.format(base,framenr+i) 91 | y = '{}{:06d}y.jpg'.format(base,framenr+i) 92 | imgx = default_loader(x) 93 | imgy = default_loader(y) 94 | img.append(imgx) 95 | img.append(imgy) 96 | target = self.data['targets'][index] 97 | meta = {} 98 | meta['id'] = self.data['ids'][index] 99 | if self.transform is not None: 100 | img = self.transform(img) 101 | if self.target_transform is not None: 102 | target = self.target_transform(target) 103 | return img, target, meta 104 | 105 | 106 | def get(args): 107 | """ Entry point. 
Call this function to get all Charades dataloaders """ 108 | normalize = arraytransforms.Normalize(mean=[0.502], std=[1.0]) 109 | train_file = args.train_file 110 | val_file = args.val_file 111 | train_dataset = Charadesflow( 112 | args.data, 'train', train_file, args.cache, 113 | transform=transforms.Compose([ 114 | arraytransforms.RandomResizedCrop(224), 115 | arraytransforms.ToTensor(), 116 | normalize, 117 | transforms.Lambda(lambda x: torch.cat(x)), 118 | ])) 119 | val_transforms = transforms.Compose([ 120 | arraytransforms.Resize(256), 121 | arraytransforms.CenterCrop(224), 122 | arraytransforms.ToTensor(), 123 | normalize, 124 | transforms.Lambda(lambda x: torch.cat(x)), 125 | ]) 126 | val_dataset = Charadesflow( 127 | args.data, 'val', val_file, args.cache, transform=val_transforms) 128 | valvideo_dataset = Charadesflow( 129 | args.data, 'val_video', val_file, args.cache, transform=val_transforms) 130 | return train_dataset, val_dataset, valvideo_dataset 131 | -------------------------------------------------------------------------------- /pytorch/datasets/charadesrgb.py: -------------------------------------------------------------------------------- 1 | """ Dataset loader for the Charades dataset """ 2 | import torch 3 | import torchvision.transforms as transforms 4 | import torch.utils.data as data 5 | from PIL import Image 6 | import numpy as np 7 | from glob import glob 8 | import csv 9 | import cPickle as pickle 10 | import os 11 | 12 | 13 | def parse_charades_csv(filename): 14 | labels = {} 15 | with open(filename) as f: 16 | reader = csv.DictReader(f) 17 | for row in reader: 18 | vid = row['id'] 19 | actions = row['actions'] 20 | if actions == '': 21 | actions = [] 22 | else: 23 | actions = [a.split(' ') for a in actions.split(';')] 24 | actions = [{'class': x, 'start': float( 25 | y), 'end': float(z)} for x, y, z in actions] 26 | labels[vid] = actions 27 | return labels 28 | 29 | 30 | def cls2int(x): 31 | return int(x[1:]) 32 | 33 | 34 | def pil_loader(path): 35 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 36 | with open(path, 'rb') as f: 37 | img = Image.open(f) 38 | return img.convert('RGB') 39 | 40 | 41 | def accimage_loader(path): 42 | import accimage 43 | try: 44 | return accimage.Image(path) 45 | except IOError: 46 | # Potentially a decoding problem, fall back to PIL.Image 47 | return pil_loader(path) 48 | 49 | 50 | def default_loader(path): 51 | from torchvision import get_image_backend 52 | if get_image_backend() == 'accimage': 53 | return accimage_loader(path) 54 | else: 55 | return pil_loader(path) 56 | 57 | 58 | def cache(cachefile): 59 | """ Creates a decorator that caches the result to cachefile """ 60 | def cachedecorator(fn): 61 | def newf(*args, **kwargs): 62 | print('cachefile {}'.format(cachefile)) 63 | if os.path.exists(cachefile): 64 | with open(cachefile, 'rb') as f: 65 | print("Loading cached result from '%s'" % cachefile) 66 | return pickle.load(f) 67 | res = fn(*args, **kwargs) 68 | with open(cachefile, 'wb') as f: 69 | print("Saving result to cache '%s'" % cachefile) 70 | pickle.dump(res, f) 71 | return res 72 | return newf 73 | return cachedecorator 74 | 75 | 76 | class Charades(data.Dataset): 77 | def __init__(self, root, split, labelpath, cachedir, transform=None, target_transform=None): 78 | self.num_classes = 157 79 | self.transform = transform 80 | self.target_transform = target_transform 81 | self.labels = parse_charades_csv(labelpath) 82 | self.root = root 83 | cachename = 
'{}/{}_{}.pkl'.format(cachedir, 84 | self.__class__.__name__, split) 85 | self.data = cache(cachename)(self.prepare)(root, self.labels, split) 86 | 87 | def prepare(self, path, labels, split): 88 | FPS, GAP, testGAP = 24, 4, 25 89 | datadir = path 90 | image_paths, targets, ids = [], [], [] 91 | 92 | for i, (vid, label) in enumerate(labels.iteritems()): 93 | iddir = datadir + '/' + vid 94 | lines = glob(iddir+'/*.jpg') 95 | n = len(lines) 96 | if i % 100 == 0: 97 | print("{} {}".format(i, iddir)) 98 | if n == 0: 99 | continue 100 | if split == 'val_video': 101 | target = torch.IntTensor(157).zero_() 102 | for x in label: 103 | target[cls2int(x['class'])] = 1 104 | spacing = np.linspace(0, n-1, testGAP) 105 | for loc in spacing: 106 | impath = '{}/{}-{:06d}.jpg'.format( 107 | iddir, vid, int(np.floor(loc))+1) 108 | image_paths.append(impath) 109 | targets.append(target) 110 | ids.append(vid) 111 | else: 112 | for x in label: 113 | for ii in range(0, n-1, GAP): 114 | if x['start'] < ii/float(FPS) < x['end']: 115 | impath = '{}/{}-{:06d}.jpg'.format( 116 | iddir, vid, ii+1) 117 | image_paths.append(impath) 118 | targets.append(cls2int(x['class'])) 119 | ids.append(vid) 120 | return {'image_paths': image_paths, 'targets': targets, 'ids': ids} 121 | 122 | def __getitem__(self, index): 123 | """ 124 | Args: 125 | index (int): Index 126 | Returns: 127 | tuple: (image, target) where target is class_index of the target class. 128 | """ 129 | path = self.data['image_paths'][index] 130 | target = self.data['targets'][index] 131 | meta = {} 132 | meta['id'] = self.data['ids'][index] 133 | img = default_loader(path) 134 | if self.transform is not None: 135 | img = self.transform(img) 136 | if self.target_transform is not None: 137 | target = self.target_transform(target) 138 | return img, target, meta 139 | 140 | def __len__(self): 141 | return len(self.data['image_paths']) 142 | 143 | def __repr__(self): 144 | fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' 145 | fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) 146 | fmt_str += ' Root Location: {}\n'.format(self.root) 147 | tmp = ' Transforms (if any): ' 148 | fmt_str += '{0}{1}\n'.format( 149 | tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 150 | tmp = ' Target Transforms (if any): ' 151 | fmt_str += '{0}{1}'.format( 152 | tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 153 | return fmt_str 154 | 155 | 156 | def get(args): 157 | """ Entry point. 
Call this function to get all Charades dataloaders """ 158 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 159 | std=[0.229, 0.224, 0.225]) 160 | train_file = args.train_file 161 | val_file = args.val_file 162 | train_dataset = Charades( 163 | args.data, 'train', train_file, args.cache, 164 | transform=transforms.Compose([ 165 | transforms.RandomResizedCrop(args.inputsize), 166 | transforms.ColorJitter( 167 | brightness=0.4, contrast=0.4, saturation=0.4), 168 | transforms.RandomHorizontalFlip(), 169 | transforms.ToTensor(), # missing PCA lighting jitter 170 | normalize, 171 | ])) 172 | val_dataset = Charades( 173 | args.data, 'val', val_file, args.cache, 174 | transform=transforms.Compose([ 175 | transforms.Resize(int(256./224*args.inputsize)), 176 | transforms.CenterCrop(args.inputsize), 177 | transforms.ToTensor(), 178 | normalize, 179 | ])) 180 | valvideo_dataset = Charades( 181 | args.data, 'val_video', val_file, args.cache, 182 | transform=transforms.Compose([ 183 | transforms.Resize(int(256./224*args.inputsize)), 184 | transforms.CenterCrop(args.inputsize), 185 | transforms.ToTensor(), 186 | normalize, 187 | ])) 188 | return train_dataset, val_dataset, valvideo_dataset 189 | -------------------------------------------------------------------------------- /pytorch/datasets/fake.py: -------------------------------------------------------------------------------- 1 | """ Define random data for quick debugging """ 2 | import torchvision 3 | import torchvision.transforms as transforms 4 | 5 | 6 | def get(args): 7 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 8 | std=[0.229, 0.224, 0.225]) 9 | 10 | train_dataset = torchvision.datasets.FakeData( 11 | transform=transforms.Compose([ 12 | transforms.RandomResizedCrop(224), 13 | transforms.RandomHorizontalFlip(), 14 | transforms.ToTensor(), 15 | normalize, 16 | ])) 17 | 18 | val_dataset = torchvision.datasets.FakeData( 19 | transform=transforms.Compose([ 20 | transforms.Resize(256), 21 | transforms.CenterCrop(224), 22 | transforms.ToTensor(), 23 | normalize, 24 | ])) 25 | 26 | return train_dataset, val_dataset, val_dataset 27 | -------------------------------------------------------------------------------- /pytorch/datasets/transforms.py: -------------------------------------------------------------------------------- 1 | """ Overloading Torchvision transforms to operate on a list """ 2 | 3 | import torchvision.transforms as parents 4 | 5 | class CenterCrop(parents.CenterCrop): 6 | def __init__(self, *args, **kwargs): 7 | super(CenterCrop, self).__init__(*args, **kwargs) 8 | def __call__(self, img): 9 | return [super(CenterCrop, self).__call__(im) for im in img] 10 | 11 | 12 | class RandomCrop(parents.RandomCrop): 13 | def __init__(self, *args, **kwargs): 14 | super(RandomCrop, self).__init__(*args, **kwargs) 15 | def __call__(self, img): 16 | return [super(RandomCrop, self).__call__(im) for im in img] 17 | 18 | 19 | class RandomResizedCrop(parents.RandomResizedCrop): 20 | def __init__(self, *args): 21 | super(RandomResizedCrop, self).__init__(*args) 22 | def __call__(self, img): 23 | return [super(RandomResizedCrop, self).__call__(im) for im in img] 24 | 25 | 26 | class Resize(parents.Resize): 27 | def __init__(self, *args, **kwargs): 28 | super(Resize, self).__init__(*args, **kwargs) 29 | def __call__(self, img): 30 | return [super(Resize, self).__call__(im) for im in img] 31 | 32 | 33 | class ToTensor(parents.ToTensor): 34 | def __init__(self, *args, **kwargs): 35 | super(ToTensor, self).__init__(*args, 
**kwargs) 36 | def __call__(self, img): 37 | return [super(ToTensor, self).__call__(im) for im in img] 38 | 39 | 40 | class Normalize(parents.Normalize): 41 | def __init__(self, *args, **kwargs): 42 | super(Normalize, self).__init__(*args, **kwargs) 43 | def __call__(self, img): 44 | return [super(Normalize, self).__call__(im) for im in img] 45 | 46 | -------------------------------------------------------------------------------- /pytorch/exp/flownet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | #sys.path.insert(0, '..') 4 | sys.path.insert(0, '.') 5 | from main import main 6 | 7 | args = [ 8 | '--name', __file__.split('/')[-1].split('.')[0], # name is filename 9 | '--print-freq', '1', 10 | '--dataset', 'charadesflow', 11 | '--data','/scratch/gsigurds/Charades_v1_flow/', 12 | '--arch', 'vgg16flow', 13 | '--pretrained-weights', './vgg16flow_ucf101.pth', 14 | '--lr', '5e-3', 15 | '--lr-decay-rate','15', 16 | '--epochs','40', 17 | '--batch-size', '64', 18 | '--train-size', '0.2', 19 | '--val-size', '0.1', 20 | '--cache-dir', '/nfs.yoda/gsigurds/ai2/caches/', 21 | '--pretrained', 22 | #'--evaluate', 23 | ] 24 | sys.argv.extend(args) 25 | main() 26 | -------------------------------------------------------------------------------- /pytorch/exp/flownet_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | #sys.path.insert(0, '..') 4 | sys.path.insert(0, '.') 5 | from main import main 6 | 7 | args = [ 8 | '--name', __file__.split('/')[-1].split('.')[0], # name is filename 9 | '--print-freq', '1', 10 | '--dataset', 'charadesflow', 11 | '--data','/scratch/gsigurds/Charades_v1_flow/', 12 | '--arch', 'vgg16flow', 13 | '--lr', '5e-3', 14 | '--lr-decay-rate','15', 15 | '--epochs','40', 16 | '--batch-size', '64', 17 | '--train-size', '0.2', 18 | '--val-size', '0.001', 19 | '--cache-dir', '/nfs.yoda/gsigurds/ai2/caches/', 20 | '--pretrained', 21 | '--resume', './twostream_flow.pth', 22 | '--evaluate', 23 | ] 24 | sys.argv.extend(args) 25 | main() 26 | -------------------------------------------------------------------------------- /pytorch/exp/rgbnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | #sys.path.insert(0, '..') 4 | sys.path.insert(0, '.') 5 | from main import main 6 | 7 | args = [ 8 | '--name', __file__.split('/')[-1].split('.')[0], # name is filename 9 | '--print-freq', '1', 10 | '--dataset', 'charadesrgb', 11 | '--arch', 'vgg16', 12 | '--lr', '1e-3', 13 | '--batch-size', '64', 14 | '--train-size', '0.1', 15 | '--val-size', '0.1', 16 | '--cache-dir', '/nfs.yoda/gsigurds/ai2/caches/', 17 | '--pretrained', 18 | #'--evaluate', 19 | ] 20 | sys.argv.extend(args) 21 | main() 22 | -------------------------------------------------------------------------------- /pytorch/exp/rgbnet_inception.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | #sys.path.insert(0, '..') 4 | sys.path.insert(0, '.') 5 | from main import main 6 | 7 | args = [ 8 | '--name', __file__.split('/')[-1].split('.')[0], # name is filename 9 | '--print-freq', '1', 10 | '--dataset', 'charadesrgb', 11 | '--arch', 'inception_v3', 12 | '--inputsize','299', 13 | '--lr', '1e-3', 14 | '--batch-size', '64', 15 | '--train-size', '0.1', 16 | '--val-size', '0.1', 17 | '--cache-dir', '/nfs.yoda/gsigurds/ai2/caches/', 18 | 
'--pretrained', 19 | #'--evaluate', 20 | ] 21 | sys.argv.extend(args) 22 | main() 23 | -------------------------------------------------------------------------------- /pytorch/exp/rgbnet_resnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | #sys.path.insert(0, '..') 4 | sys.path.insert(0, '.') 5 | from main import main 6 | 7 | args = [ 8 | '--name', __file__.split('/')[-1].split('.')[0], # name is filename 9 | '--print-freq', '1', 10 | '--dataset', 'charadesrgb', 11 | '--arch', 'resnet152', 12 | '--lr', '1e-3', 13 | '--batch-size', '50', 14 | '--train-size', '0.1', 15 | '--val-size', '0.1', 16 | '--cache-dir', '/nfs.yoda/gsigurds/ai2/caches/', 17 | '--pretrained', 18 | #'--evaluate', 19 | ] 20 | sys.argv.extend(args) 21 | main() 22 | -------------------------------------------------------------------------------- /pytorch/exp/rgbnet_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | #sys.path.insert(0, '..') 4 | sys.path.insert(0, '.') 5 | from main import main 6 | 7 | args = [ 8 | '--name', __file__.split('/')[-1].split('.')[0], # name is filename 9 | '--print-freq', '1', 10 | '--dataset', 'charadesrgb', 11 | '--arch', 'vgg16', 12 | '--lr', '1e-3', 13 | '--batch-size', '64', 14 | '--train-size', '0.1', 15 | '--val-size', '0.1', 16 | '--cache-dir', '/nfs.yoda/gsigurds/ai2/caches/', 17 | '--pretrained', 18 | '--resume', './twostream_rgb.pth.tar', 19 | '--evaluate', 20 | ] 21 | sys.argv.extend(args) 22 | main() 23 | -------------------------------------------------------------------------------- /pytorch/get_alreadytrained.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to download pretrained pytorch models on Charades 3 | # Approximately equivalent to models obtained by running exp/rgbnet.py 4 | # 5 | # The rgb model was obtained after 7 epochs (epoch-size 0.1) 6 | # The rgb model has a classification accuracy of 18.6% mAP (via charades_v1_classify.m) 7 | # Notice that this is an improvement over the Torch RGB model 8 | # The flow model was converted directly from the Charades Torch codebase (../torch/) 9 | # The flow model has a classification accuracy of 15.4% mAP (via charades_v1_classify.m) 10 | # 11 | # vgg16flow_ucf101.pth is a converted model from Torch that was pretrained on UCF101 12 | # and is used as an initialization for the flow model 13 | # 14 | # Combining the predictions (submission files) of those models using combine_rgb_flow.py 15 | # yields a final classification accuracy of 20.6% mAP (via charades_v1_classify.m) 16 | # 17 | # Additionally we include rgb-streams fine-tuned from resnet and inception pretrained on ImageNet 18 | # ResNet-152 (exp/rgbnet_resnet.py): 22.8% mAP (via charades_v1_classify.m) 19 | # Inception_v3 (exp/rgbnet_inception.py): 22.7% mAP (via charades_v1_classify.m) 20 | 21 | wget -O twostream_rgb.pth.tar https://www.dropbox.com/s/p457h2ifi6v1qdz/twostream_rgb.pth.tar?dl=1 22 | wget -O twostream_flow.pth https://www.dropbox.com/s/m1hkeiwjtndt26z/twostream_flow.pth?dl=1 23 | wget -O vgg16flow_ucf101.pth https://www.dropbox.com/s/qlr5aty2jz4dq5o/vgg16flow_ucf101.pth?dl=1 24 | wget -O resnet_rgb.pth.tar https://www.dropbox.com/s/iy9fmk0r1a3edoz/resnet_rgb.pth.tar?dl=1 25 | wget -O inception_rgb.pth.tar https://www.dropbox.com/s/whxikophm7xqchb/inception_rgb.pth.tar?dl=1 26 | 
--------------------------------------------------------------------------------
/pytorch/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""Charades activity recognition baseline code
Can be run directly or through config scripts under exp/

Gunnar Sigurdsson, 2018
"""
import torch
import numpy as np
import random
import train
from models import create_model
from datasets import get_dataset
import checkpoints
from opts import parse
from utils import tee


def seed(manualseed):
    random.seed(manualseed)
    np.random.seed(manualseed)
    torch.manual_seed(manualseed)
    torch.cuda.manual_seed(manualseed)


best_mAP = 0


def main():
    global opt, best_mAP
    opt = parse()
    tee.Tee(opt.cache+'/log.txt')
    print(vars(opt))
    seed(opt.manual_seed)

    model, criterion, optimizer = create_model(opt)
    if opt.resume:
        best_mAP = checkpoints.load(opt, model, optimizer)
    print(model)
    trainer = train.Trainer()
    train_loader, val_loader, valvideo_loader = get_dataset(opt)

    if opt.evaluate:
        trainer.validate(val_loader, model, criterion, -1, opt)
        trainer.validate_video(valvideo_loader, model, -1, opt)
        return

    for epoch in range(opt.start_epoch, opt.epochs):
        if opt.distributed:
            trainer.train_sampler.set_epoch(epoch)
        top1, top5 = trainer.train(train_loader, model, criterion, optimizer, epoch, opt)
        top1val, top5val = trainer.validate(val_loader, model, criterion, epoch, opt)
        mAP = trainer.validate_video(valvideo_loader, model, epoch, opt)
        is_best = mAP > best_mAP
        best_mAP = max(mAP, best_mAP)
        scores = {'top1train': top1, 'top5train': top5, 'top1val': top1val, 'top5val': top5val, 'mAP': mAP}
        checkpoints.save(epoch, opt, model, optimizer, is_best, scores)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/pytorch/models/__init__.py:
--------------------------------------------------------------------------------
"""
Initialize the model module
New models can be defined by adding scripts under models/
"""
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torchvision.models as tmodels
import importlib


def create_model(args):
    if args.arch in tmodels.__dict__:  # torchvision models
        if args.pretrained:
            print("=> using pre-trained model '{}'".format(args.arch))
            model = tmodels.__dict__[args.arch](pretrained=True)
            model = model.cuda()
        else:
            print("=> creating model '{}'".format(args.arch))
            model = tmodels.__dict__[args.arch]()
    else:  # defined as script in this directory
        model = importlib.import_module('.'+args.arch, package='models').model
        if not args.pretrained_weights == '':
            print('loading pretrained-weights from {}'.format(args.pretrained_weights))
            model.load_state_dict(torch.load(args.pretrained_weights))

    # replace the last layer with a fresh args.nclass-way classifier
    if hasattr(model, 'classifier'):
        newcls = list(model.classifier.children())
        newcls = newcls[:-1] + [nn.Linear(newcls[-1].in_features, args.nclass).cuda()]
        model.classifier = nn.Sequential(*newcls)
    elif hasattr(model, 'fc'):
        model.fc = nn.Linear(model.fc.in_features, args.nclass)
        if hasattr(model, 'AuxLogits'):
            # inception_v3 has an auxiliary classifier head that must be resized too
            model.AuxLogits.fc = nn.Linear(model.AuxLogits.fc.in_features, args.nclass)
    else:
        newcls = list(model.children())
        if hasattr(model, 'in_features'):
            in_features = model.in_features
        else:
            in_features = newcls[-1].in_features
        newcls = newcls[:-1] + [nn.Linear(in_features, args.nclass).cuda()]
        model = nn.Sequential(*newcls)

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        if hasattr(model, 'features'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    cudnn.benchmark = True
    return model, criterion, optimizer
--------------------------------------------------------------------------------
/pytorch/models/vgg16flow.py:
--------------------------------------------------------------------------------

import torch
import torch.nn as nn
from torch.autograd import Variable

class LambdaBase(nn.Sequential):
    def __init__(self, fn, *args):
        super(LambdaBase, self).__init__(*args)
        self.lambda_func = fn

    def forward_prepare(self, input):
        output = []
        for module in self._modules.values():
            output.append(module(input))
        return output if output else input

class Lambda(LambdaBase):
    def forward(self, input):
        return self.lambda_func(self.forward_prepare(input))


model = nn.Sequential(  # Sequential,
    nn.Conv2d(20,64,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.Conv2d(64,64,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.MaxPool2d((2, 2),(2, 2),(0, 0),ceil_mode=True),
    nn.Conv2d(64,128,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.Conv2d(128,128,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.MaxPool2d((2, 2),(2, 2),(0, 0),ceil_mode=True),
    nn.Conv2d(128,256,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.Conv2d(256,256,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.Conv2d(256,256,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.MaxPool2d((2, 2),(2, 2),(0, 0),ceil_mode=True),
    nn.Conv2d(256,512,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.Conv2d(512,512,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.Conv2d(512,512,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.MaxPool2d((2, 2),(2, 2),(0, 0),ceil_mode=True),
    nn.Conv2d(512,512,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.Conv2d(512,512,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.Conv2d(512,512,(3, 3),(1, 1),(1, 1)),
    nn.ReLU(),
    nn.MaxPool2d((2, 2),(2, 2),(0, 0),ceil_mode=True),
    Lambda(lambda x: x.view(x.size(0),-1)),  # View,
    nn.Sequential(Lambda(lambda x: x.view(1,-1) if 1==len(x.size()) else x ),nn.Linear(25088,4096)),  # Linear,
    nn.ReLU(),
    nn.Dropout(0.9),
    nn.Sequential(Lambda(lambda x: x.view(1,-1) if 1==len(x.size()) else x ),nn.Linear(4096,4096)),  # Linear,
    nn.ReLU(),
    nn.Dropout(0.8),
    nn.Sequential(Lambda(lambda x: x.view(1,-1) if 1==len(x.size()) else x ),nn.Linear(4096,101)),  # Linear,
)
model.in_features = 4096
--------------------------------------------------------------------------------
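A quick shape check clarifies the flow stream's input convention: charadesflow.py stacks STACK=10 consecutive flow frames, each contributing an x- and a y-channel, hence the 20 input channels above. A sketch (assumes a recent PyTorch; code of this era would wrap the tensor in a Variable):

```python
# Sketch: verify the expected input/output shapes of the flow stream.
import torch
from models.vgg16flow import model  # run from the pytorch/ directory

batch = torch.randn(2, 20, 224, 224)  # (batch, 2*STACK flow channels, height, width)
out = model(batch)
print(out.size())  # (2, 101): the UCF101 head, swapped for 157 classes by create_model
```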
/pytorch/opts.py: -------------------------------------------------------------------------------- 1 | """ Define and parse commandline arguments """ 2 | import argparse 3 | import os 4 | 5 | 6 | def parse(): 7 | print('parsing arguments') 8 | parser = argparse.ArgumentParser(description='PyTorch Charades Training') 9 | parser.add_argument('--data', metavar='DIR', default='/scratch/gsigurds/Charades_v1_rgb/', 10 | help='path to dataset') 11 | parser.add_argument('--dataset', metavar='DIR', default='fake', 12 | help='name of dataset under datasets/') 13 | parser.add_argument('--train-file', default='./Charades_v1_train.csv', type=str) 14 | parser.add_argument('--val-file', default='./Charades_v1_test.csv', type=str) 15 | parser.add_argument('--arch', '-a', metavar='ARCH', default='alexnet', 16 | help='model architecture: ') 17 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 18 | help='number of data loading workers (default: 4)') 19 | parser.add_argument('--epochs', default=20, type=int, metavar='N', 20 | help='number of total epochs to run') 21 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 22 | help='manual epoch number (useful on restarts)') 23 | parser.add_argument('-b', '--batch-size', default=256, type=int, 24 | metavar='N', help='mini-batch size (default: 256)') 25 | parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, 26 | metavar='LR', help='initial learning rate') 27 | parser.add_argument('--lr-decay-rate',default=6, type=int) 28 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 29 | help='momentum') 30 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, 31 | metavar='W', help='weight decay (default: 1e-4)') 32 | parser.add_argument('--print-freq', '-p', default=10, type=int, 33 | metavar='N', help='print frequency (default: 10)') 34 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 35 | help='path to latest checkpoint (default: none)') 36 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 37 | help='evaluate model on validation set') 38 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 39 | help='use pre-trained model') 40 | parser.add_argument('--pretrained-weights', default='', type=str) 41 | parser.add_argument('--inputsize', default=224, type=int) 42 | parser.add_argument('--world-size', default=1, type=int, 43 | help='number of distributed processes') 44 | parser.add_argument('--manual-seed', default=0, type=int) 45 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 46 | help='url used to set up distributed training') 47 | parser.add_argument('--dist-backend', default='gloo', type=str, 48 | help='distributed backend') 49 | parser.add_argument('--train-size', default=1.0, type=float) 50 | parser.add_argument('--val-size', default=1.0, type=float) 51 | parser.add_argument('--cache-dir', default='./cache/', type=str) 52 | parser.add_argument('--name', default='test', type=str) 53 | parser.add_argument('--nclass', default=157, type=int) 54 | parser.add_argument('--accum-grad', default=4, type=int) 55 | args = parser.parse_args() 56 | args.distributed = args.world_size > 1 57 | args.cache = args.cache_dir+args.name+'/' 58 | if not os.path.exists(args.cache): 59 | os.makedirs(args.cache) 60 | 61 | return args 62 | -------------------------------------------------------------------------------- /pytorch/train.py: 
-------------------------------------------------------------------------------- 1 | """ Defines the Trainer class which handles train/validation/validation_video 2 | """ 3 | import time 4 | import torch 5 | import itertools 6 | import numpy as np 7 | from utils import map 8 | 9 | 10 | class AverageMeter(object): 11 | """Computes and stores the average and current value""" 12 | 13 | def __init__(self): 14 | self.reset() 15 | 16 | def reset(self): 17 | self.val = 0 18 | self.avg = 0 19 | self.sum = 0 20 | self.count = 0 21 | 22 | def update(self, val, n=1): 23 | self.val = val 24 | self.sum += val * n 25 | self.count += n 26 | self.avg = self.sum / self.count 27 | 28 | 29 | def adjust_learning_rate(startlr, decay_rate, optimizer, epoch): 30 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 31 | lr = startlr * (0.1 ** (epoch // decay_rate)) 32 | for param_group in optimizer.param_groups: 33 | param_group['lr'] = lr 34 | 35 | 36 | def accuracy(output, target, topk=(1,)): 37 | """Computes the precision@k for the specified values of k""" 38 | maxk = max(topk) 39 | batch_size = target.size(0) 40 | 41 | _, pred = output.topk(maxk, 1, True, True) 42 | pred = pred.t() 43 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 44 | 45 | res = [] 46 | for k in topk: 47 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 48 | res.append(correct_k.mul_(100.0 / batch_size)) 49 | return res 50 | 51 | 52 | def submission_file(ids, outputs, filename): 53 | """ write list of ids and outputs to filename""" 54 | with open(filename, 'w') as f: 55 | for vid, output in zip(ids, outputs): 56 | scores = ['{:g}'.format(x) 57 | for x in output] 58 | f.write('{} {}\n'.format(vid, ' '.join(scores))) 59 | 60 | 61 | class Trainer(): 62 | def train(self, loader, model, criterion, optimizer, epoch, args): 63 | adjust_learning_rate(args.lr, args.lr_decay_rate, optimizer, epoch) 64 | batch_time = AverageMeter() 65 | data_time = AverageMeter() 66 | losses = AverageMeter() 67 | top1 = AverageMeter() 68 | top5 = AverageMeter() 69 | 70 | # switch to train mode 71 | model.train() 72 | optimizer.zero_grad() 73 | 74 | def part(x): return itertools.islice(x, int(len(x)*args.train_size)) 75 | end = time.time() 76 | for i, (input, target, meta) in enumerate(part(loader)): 77 | data_time.update(time.time() - end) 78 | 79 | target = target.long().cuda(async=True) 80 | input_var = torch.autograd.Variable(input.cuda()) 81 | target_var = torch.autograd.Variable(target) 82 | output = model(input_var) 83 | loss = None 84 | # for nets that have multiple outputs such as inception 85 | if isinstance(output, tuple): 86 | loss = sum((criterion(o,target_var) for o in output)) 87 | output = output[0] 88 | else: 89 | loss = criterion(output, target_var) 90 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 91 | losses.update(loss.data[0], input.size(0)) 92 | top1.update(prec1[0], input.size(0)) 93 | top5.update(prec5[0], input.size(0)) 94 | 95 | loss.backward() 96 | if i % args.accum_grad == args.accum_grad-1: 97 | print('updating parameters') 98 | optimizer.step() 99 | optimizer.zero_grad() 100 | 101 | batch_time.update(time.time() - end) 102 | end = time.time() 103 | 104 | if i % args.print_freq == 0: 105 | print('Epoch: [{0}][{1}/{2}({3})]\t' 106 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 107 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 108 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 109 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 110 | 'Prec@5 {top5.val:.3f} 
({top5.avg:.3f})'.format( 111 | epoch, i, int( 112 | len(loader)*args.train_size), len(loader), 113 | batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5)) 114 | return top1.avg,top5.avg 115 | 116 | def validate(self, loader, model, criterion, epoch, args): 117 | batch_time = AverageMeter() 118 | losses = AverageMeter() 119 | top1 = AverageMeter() 120 | top5 = AverageMeter() 121 | 122 | # switch to evaluate mode 123 | model.eval() 124 | 125 | def part(x): return itertools.islice(x, int(len(x)*args.val_size)) 126 | end = time.time() 127 | for i, (input, target, meta) in enumerate(part(loader)): 128 | target = target.long().cuda(async=True) 129 | input_var = torch.autograd.Variable(input.cuda(), volatile=True) 130 | target_var = torch.autograd.Variable(target, volatile=True) 131 | output = model(input_var) 132 | loss = criterion(output, target_var) 133 | 134 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 135 | losses.update(loss.data[0], input.size(0)) 136 | top1.update(prec1[0], input.size(0)) 137 | top5.update(prec5[0], input.size(0)) 138 | batch_time.update(time.time() - end) 139 | end = time.time() 140 | 141 | if i % args.print_freq == 0: 142 | print('Test: [{0}/{1} ({2})]\t' 143 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 144 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 145 | 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 146 | 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( 147 | i, int(len(loader)*args.val_size), len(loader), 148 | batch_time=batch_time, loss=losses, 149 | top1=top1, top5=top5)) 150 | 151 | print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}' 152 | .format(top1=top1, top5=top5)) 153 | 154 | return top1.avg,top5.avg 155 | 156 | def validate_video(self, loader, model, epoch, args): 157 | """ Run video-level validation on the Charades test set""" 158 | batch_time = AverageMeter() 159 | outputs = [] 160 | gts = [] 161 | ids = [] 162 | 163 | # switch to evaluate mode 164 | model.eval() 165 | 166 | end = time.time() 167 | for i, (input, target, meta) in enumerate(loader): 168 | target = target.long().cuda(async=True) 169 | assert target[0,:].eq(target[1,:]).all(), "val_video not synced" 170 | input_var = torch.autograd.Variable(input.cuda(), volatile=True) 171 | output = model(input_var) 172 | output = torch.nn.Softmax(dim=1)(output) 173 | 174 | # store predictions 175 | output_video = output.mean(dim=0) 176 | outputs.append(output_video.data.cpu().numpy()) 177 | gts.append(target[0,:]) 178 | ids.append(meta['id'][0]) 179 | batch_time.update(time.time() - end) 180 | end = time.time() 181 | 182 | if i % args.print_freq == 0: 183 | print('Test2: [{0}/{1}]\t' 184 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})'.format( 185 | i, len(loader), batch_time=batch_time)) 186 | #mAP, _, ap = map.map(np.vstack(outputs), np.vstack(gts)) 187 | mAP, _, ap = map.charades_map(np.vstack(outputs), np.vstack(gts)) 188 | print(ap) 189 | print(' * mAP {:.3f}'.format(mAP)) 190 | submission_file( 191 | ids, outputs, '{}/epoch_{:03d}.txt'.format(args.cache, epoch+1)) 192 | return mAP 193 | -------------------------------------------------------------------------------- /pytorch/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gsig/charades-algorithms/927794cd04c588f1e749e96f5c0e69d81a1576e0/pytorch/utils/__init__.py -------------------------------------------------------------------------------- /pytorch/utils/map.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def map(submission_array, gt_array): 5 | """ Returns mAP, weighted mAP, and AP array """ 6 | m_aps = [] 7 | n_classes = submission_array.shape[1] 8 | for oc_i in range(n_classes): 9 | sorted_idxs = np.argsort(-submission_array[:, oc_i]) 10 | tp = gt_array[:, oc_i][sorted_idxs] == 1 11 | fp = np.invert(tp) 12 | n_pos = tp.sum() 13 | if n_pos < 0.1: 14 | m_aps.append(float('nan')) 15 | continue 16 | fp.sum() 17 | f_pcs = np.cumsum(fp) 18 | t_pcs = np.cumsum(tp) 19 | prec = t_pcs / (f_pcs+t_pcs).astype(float) 20 | avg_prec = 0 21 | for i in range(submission_array.shape[0]): 22 | if tp[i]: 23 | avg_prec += prec[i] 24 | m_aps.append(avg_prec / n_pos.astype(float)) 25 | m_aps = np.array(m_aps) 26 | m_ap = np.mean(m_aps) 27 | w_ap = (m_aps * gt_array.sum(axis=0) / gt_array.sum().sum().astype(float)) 28 | return m_ap, w_ap, m_aps 29 | 30 | 31 | def charades_map(submission_array, gt_array): 32 | """ 33 | Approximate version of the charades evaluation function 34 | For precise numbers, use the submission file with the official matlab script 35 | """ 36 | fix = submission_array.copy() 37 | empty = np.sum(gt_array, axis=1)==0 38 | fix[empty, :] = np.NINF 39 | return map(fix, gt_array) 40 | -------------------------------------------------------------------------------- /pytorch/utils/tee.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a crude stdout-to-file redirect for keep history of experiments 3 | The following code initializes the redirect: 4 | import tee 5 | tee.Tee(filename) 6 | """ 7 | import logging 8 | import sys 9 | 10 | 11 | class StreamToLogger(object): 12 | def __init__(self, stream, logger, log_level=logging.INFO): 13 | self.logger = logger 14 | self.log_level = log_level 15 | self.linebuf = '' 16 | self.stream = stream 17 | 18 | def write(self, buf): 19 | self.stream.write(buf) 20 | for line in buf.rstrip().splitlines(): 21 | self.logger.log(self.log_level, line.rstrip()) 22 | 23 | def flush(self): 24 | self.stream.flush() 25 | 26 | 27 | class Tee(object): 28 | def __init__(self, filename): 29 | self.filename = filename 30 | logging.basicConfig( 31 | level=logging.DEBUG, 32 | format='%(asctime)s:%(message)s', 33 | filename=filename, 34 | filemode='a' 35 | ) 36 | stdout_logger = logging.getLogger('STDOUT') 37 | sl = StreamToLogger(sys.stdout, stdout_logger, logging.INFO) 38 | sys.stdout = sl 39 | 40 | stderr_logger = logging.getLogger('STDERR') 41 | sl = StreamToLogger(sys.stderr, stderr_logger, logging.ERROR) 42 | sys.stderr = sl 43 | print "Logging to file {}".format(filename) 44 | -------------------------------------------------------------------------------- /torch/INSTALL.md: -------------------------------------------------------------------------------- 1 | Torch ResNet Installation 2 | ========================= 3 | 4 | This is the suggested way to install the Torch ResNet dependencies on [Ubuntu 14.04+](http://www.ubuntu.com/): 5 | * NVIDIA CUDA 7.0+ 6 | * NVIDIA cuDNN v4 7 | * Torch 8 | * ImageNet dataset 9 | 10 | ## Requirements 11 | * NVIDIA GPU with compute capability 3.5 or above 12 | 13 | ## Install CUDA 14 | 1. Install the `build-essential` package: 15 | ```bash 16 | sudo apt-get install build-essential 17 | ``` 18 | 19 | 2. If you are using a Virtual Machine (like Amazon EC2 instances), install: 20 | ```bash 21 | sudo apt-get update 22 | sudo apt-get install linux-generic 23 | ``` 24 | 25 | 3. 
Download the CUDA .deb file for Linux Ubuntu 14.04 64-bit from: https://developer.nvidia.com/cuda-downloads. 26 | The file will be named something like `cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb` 27 | 28 | 4. Install CUDA from the .deb file: 29 | ```bash 30 | sudo dpkg -i cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb 31 | sudo apt-get update 32 | sudo apt-get install cuda 33 | echo "export PATH=/usr/local/cuda/bin/:\$PATH; export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:\$LD_LIBRARY_PATH; " >>~/.bashrc && source ~/.bashrc 34 | ``` 35 | 36 | 4. Restart your computer 37 | 38 | ## Install cuDNN v4 39 | 1. Download cuDNN v4 from https://developer.nvidia.com/cuDNN (requires registration). 40 | The file will be named something like `cudnn-7.0-linux-x64-v4.0-rc.tgz`. 41 | 42 | 2. Extract the file to `/usr/local/cuda`: 43 | ```bash 44 | tar -xvf cudnn-7.0-linux-x64-v4.0-rc.tgz 45 | sudo cp cuda/include/*.h /usr/local/cuda/include 46 | sudo cp cuda/lib64/*.so* /usr/local/cuda/lib64 47 | ``` 48 | 49 | ## Install Torch 50 | 1. Install the Torch dependencies: 51 | ```bash 52 | curl -sk https://raw.githubusercontent.com/torch/ezinstall/master/install-deps | bash -e 53 | ``` 54 | 55 | 2. Install Torch in a local folder: 56 | ```bash 57 | git clone https://github.com/torch/distro.git ~/torch --recursive 58 | cd ~/torch; ./install.sh 59 | ``` 60 | 61 | If you want to uninstall torch, you can use the command: `rm -rf ~/torch` 62 | 63 | ## Install the Torch cuDNN v4 bindings 64 | ```bash 65 | git clone -b R4 https://github.com/soumith/cudnn.torch.git 66 | cd cudnn.torch; luarocks make 67 | ``` 68 | 69 | -------------------------------------------------------------------------------- /torch/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For fb.resnet.torch software 4 | 5 | Copyright (c) 2016, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | 
-------------------------------------------------------------------------------- /torch/README.md: --------------------------------------------------------------------------------
 1 | ## Charades Starter Code for Activity Classification and Localization
 2 | 
 3 | Contributor: Gunnar Atli Sigurdsson
 4 | 
 5 | **New:** extension of this framework to the deep CRF model on Charades for *Asynchronous Temporal Fields for Action Recognition*: https://github.com/gsig/temporal-fields
 6 | 
 7 | * This code implements a Two-Stream network in Torch
 8 | * This code implements a Two-Stream+LSTM network in Torch
 9 | * This code is built on the Res-Net Torch source code: github.com/facebook/fb.resnet.torch
10 | * This code awkwardly hacks said code to work as Two-Stream/LSTM
11 | * Some functionality from the original code may work (optnet)
12 | * Some functionality from the original code may not work (resnet)
13 | 
14 | The code replicates the 'Two-Stream Extended' and 'Two-Stream+LSTM' baselines found in:
15 | ```
16 | @inproceedings{sigurdsson2017asynchronous,
17 | author = {Gunnar A. Sigurdsson and Santosh Divvala and Ali Farhadi and Abhinav Gupta},
18 | title = {Asynchronous Temporal Fields for Action Recognition},
19 | booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
20 | year={2017},
21 | pdf = {http://arxiv.org/pdf/1612.06371.pdf},
22 | code = {https://github.com/gsig/temporal-fields},
23 | }
24 | ```
25 | which is in turn based on "Two-stream convolutional networks for action recognition in videos" by Simonyan and Zisserman, and "Beyond Short Snippets: Deep Networks for Video Classification" by Joe Yue-Hei Ng et al.
26 | 
27 | Combining the predictions (submission files) of those models using combine_rgb_flow.py
28 | yields a final classification accuracy of 18.9% mAP (Two-Stream) and 19.8% (LSTM) on Charades (evaluated with charades_v1_classify.m)
29 | 
30 | 
31 | ## Technical Overview:
32 | 
33 | The code is organized to train a two-stream network. Two independent networks are trained: one RGB network and one Flow network.
34 | This code parses the training data into pairs of an image (or a stack of flow images) and a label for a single activity class. This forms a softmax training setup like a standard CNN. The network is a VGG-16 network. For RGB it is pretrained on ImageNet, and for Flow it is pretrained on UCF101. The pretrained networks can be downloaded with the scripts in this directory.
35 | For testing, the network uses a batch size of 25, scores all images, and pools the output to make a classification prediction, or uses all 25 outputs for localization.
36 | 
37 | All outputs are stored in the cacheDir under checkpoints/. This includes epoch*.txt, which is the classification output, and localize*.txt, which is the localization output (note that you need to enable this in the options).
38 | Those output files can be combined after training with the Python scripts in this directory.
39 | All output files can be scored with the official MATLAB evaluation script provided with the Charades dataset.
40 | 
41 | Requirements:
42 | * csvigo: luarocks install csvigo
43 | * loadcaffe: luarocks install loadcaffe
44 | * optnet: luarocks install optnet
45 | (The flow net requires optnet to converge with the current default parameter settings)
46 | 
47 | Optional requirements:
48 | * Facebook Lua Libraries, for speedups and fb.debugger, a great debugger
49 | Please refer to the original res-net codebase for more information.
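For orientation before the steps below, here is a sketch of what an experiment file under exp/ might look like. The option names correspond to fields the code actually reads (opt.dataset, opt.data, opt.trainfile, opt.testfile, opt.setup, opt.batchSize, opt.epochSize; see opts.lua for the authoritative list), but the file itself and all values are illustrative rather than a copy of exp/rgbnet.lua:

```lua
-- Hypothetical experiment file (sketch only; see exp/rgbnet.lua for a real one).
-- It fills in command-line style options and hands control to main.lua.
arg = {
   '-dataset', 'charades',                          -- selects datasets/charades.lua
   '-data', '/path/to/Charades_v1_rgb/',            -- directory of extracted frames
   '-trainfile', '/path/to/Charades_v1_train.csv',  -- annotation csv files
   '-testfile', '/path/to/Charades_v1_test.csv',
   '-setup', 'softmax',                             -- train on (frame,label) pairs
   '-batchSize', '64',
   '-epochSize', '0.1',                             -- fraction of training data per epoch
   '-cacheDir', '/path/to/checkpoints/',
}
dofile 'main.lua'
```

If this sketch and the checked-in experiment files disagree, trust the files under exp/.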
50 | 
51 | 
52 | ## Steps to train your own two-stream network on Charades:
53 | 
54 | 1. Download the Charades Annotations (allenai.org/plato/charades/)
55 | 2. Download the Charades RGB and/or Flow frames (allenai.org/plato/charades/)
56 | 3. Download the ImageNet/UCF101 pre-trained Image and Flow models using ./get_models.sh
57 | 4. Duplicate and edit one of the experiment files under exp/ with appropriate parameters. For additional parameters, see opts.lua
58 | 5. Run an experiment by calling dofile 'exp/rgbnet.lua' where rgbnet.lua is your experiment file
59 | 6. The checkpoints/logfiles/outputs are stored in your specified cache directory.
60 | 7. Combine one RGB output file and one Flow output file with combine_rgb_flow.py to generate a submission file
61 | 8. Evaluate the submission file with the Charades_v1_classify.m or Charades_v1_localize.m evaluation scripts
62 | 9. Build off the code, cite our papers, and say hi to us at CVPR.
63 | 
64 | Good luck!
65 | 
66 | 
67 | ## Pretrained networks:
68 | 
69 | While the RGB net can be trained in a day on a modern GPU, the flow net requires nontrivial IO and time to converge. For your convenience, we provide RGB and Flow models already trained on Charades using exp/rgbnet.lua and exp/flownet.lua:
70 | 
71 | https://www.dropbox.com/s/o7afkhw52rqr48g/twostream_flow.t7?dl=1
72 | https://www.dropbox.com/s/bo9rv32zaxojsmz/twostream_rgb.t7?dl=1
73 | 
74 | * The flow model was obtained after 31 epochs (epochSize=0.2)
75 | * The flow model has a classification accuracy of 15.4% mAP (evaluated with charades_v1_classify.m)
76 | * The rgb model was obtained after 6 epochs (epochSize=0.1)
77 | * The rgb model has a classification accuracy of 15.6% mAP (evaluated with charades_v1_classify.m)
78 | 
79 | Combining the predictions (submission files) of those models using combine_rgb_flow.py
80 | yields a final classification accuracy of 18.9% mAP (evaluated with charades_v1_classify.m)
81 | 
82 | To fine-tune those models, or run experiments, please see exp/rgbnet_resume.lua, exp/rgbnet_test.lua, exp/flownet_resume.lua, and exp/flownet_test.lua
83 | 
84 | Charades submission files are available for multiple baselines at https://github.com/gsig/temporal-fields
85 | 
86 | ## Two-Stream+LSTM details
87 | 
88 | We also provide pre-trained LSTM models using exp/lstmrgbnet.lua and exp/lstmflownet.lua; please see get_alreadytrained_lstm.sh for details.
89 | 
90 | This baseline fine-tunes the previous Two-Stream models with an LSTM on top of fc7. It uses a special loader for Charades (charadessync) that feeds a full video into each batch, to train an LSTM. To accommodate the softmax loss, (frame,label) pairs are randomly sampled for the training set. exp/lstmrgbnet.lua, models/vgg16lstm.lua, and datasets/charadessync-gen.lua contain more details.
91 | 
-------------------------------------------------------------------------------- /torch/checkpoints.lua: --------------------------------------------------------------------------------
 1 | --
 2 | -- Copyright (c) 2016, Facebook, Inc.
 3 | -- All rights reserved.
 4 | --
 5 | -- This source code is licensed under the BSD-style license found in the
 6 | -- LICENSE file in the root directory of this source tree. An additional grant
 7 | -- of patent rights can be found in the PATENTS file in the same directory.
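-- Usage sketch (an illustrative note, not part of the original header):
-- main.lua is expected to drive this module roughly as follows:
--
--   local checkpoints = require 'checkpoints'
--   local latest, optimState = checkpoints.latest(opt)  -- nil unless resuming
--   -- ... train and test for an epoch ...
--   checkpoints.save(epoch, model, optimState, isBestModel, opt, score)
--
-- where score is the five-element table {train top1, train top5, test top1,
-- test top5, mAP} that modelscore() below writes out as a text file.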
8 | -- 9 | local checkpoint = {} 10 | 11 | local function deepCopy(tbl) 12 | -- creates a copy of a network with new modules and the same tensors 13 | local copy = {} 14 | for k, v in pairs(tbl) do 15 | if type(v) == 'table' then 16 | copy[k] = deepCopy(v) 17 | else 18 | copy[k] = v 19 | end 20 | end 21 | if torch.typename(tbl) then 22 | torch.setmetatable(copy, torch.typename(tbl)) 23 | end 24 | return copy 25 | end 26 | 27 | function checkpoint.latest(opt) 28 | if opt.resume == 'none' then 29 | return nil 30 | end 31 | 32 | local latestPath = paths.concat(opt.resume, 'latest.t7') 33 | if not paths.filep(latestPath) then 34 | return nil 35 | end 36 | 37 | print('=> Loading checkpoint ' .. latestPath) 38 | local latest = torch.load(latestPath) 39 | local optimState = torch.load(paths.concat(opt.resume, latest.optimFile)) 40 | 41 | return latest, optimState 42 | end 43 | 44 | local function modelscore(name,score) 45 | print('dumping score to file') 46 | local out = assert(io.open(name, "w")) 47 | out:write("train top1: " .. score[1] .. "\n") 48 | out:write("train top5: " .. score[2] .. "\n") 49 | out:write("test top1: " .. score[3] .. "\n") 50 | out:write("test top5: " .. score[4] .. "\n") 51 | out:write("mAP: " .. score[5] .. "\n") 52 | out:close() 53 | end 54 | 55 | function checkpoint.save(epoch, model, optimState, isBestModel, opt, score) 56 | -- don't save the DataParallelTable for easier loading on other machines 57 | if torch.type(model) == 'nn.DataParallelTable' then 58 | model = model:get(1) 59 | end 60 | 61 | -- create a clean copy on the CPU without modifying the original network 62 | model = deepCopy(model):float():clearState() 63 | 64 | local modelFile = 'model_' .. epoch .. '.t7' 65 | local optimFile = 'optimState_' .. epoch .. '.t7' 66 | 67 | modelscore(paths.concat(opt.save, string.format("model_%03d.txt",epoch)), score) 68 | torch.save(paths.concat(opt.save, modelFile), model) 69 | torch.save(paths.concat(opt.save, optimFile), optimState) 70 | torch.save(paths.concat(opt.save, 'latest.t7'), { 71 | epoch = epoch, 72 | modelFile = modelFile, 73 | optimFile = optimFile, 74 | }) 75 | modelscore(paths.concat(opt.save, 'latest.txt'), score) 76 | 77 | if isBestModel then 78 | torch.save(paths.concat(opt.save, 'model_best.t7'), model) 79 | modelscore(paths.concat(opt.save, 'model_best.txt'), score) 80 | end 81 | end 82 | 83 | return checkpoint 84 | -------------------------------------------------------------------------------- /torch/dataloader.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
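-- Usage sketch (an illustrative note, not part of the original header):
-- DataLoader.create(opt) returns one loader per split ('train', 'val',
-- 'val2'), and each loader is consumed as an iterator:
--
--   local trainLoader, valLoader, val2Loader = DataLoader.create(opt)
--   for n, sample in trainLoader:run() do
--      -- sample.input:  batch of preprocessed images (nCrops folded into dim 1)
--      -- sample.target: class indices, or an sz x 157 label matrix for 'val2'
--      -- sample.ids:    video ids, used to pool per-video predictions
--   end
--
-- The 'val2' split uses a fixed batch size of 25 frames per video, matching
-- the video-level evaluation described in the README.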
8 | -- 9 | -- Multi-threaded data loader 10 | -- 11 | 12 | local datasets = require 'datasets/init' 13 | local Threads = require 'threads' 14 | Threads.serialization('threads.sharedserialize') 15 | 16 | local M = {} 17 | local DataLoader = torch.class('resnet.DataLoader', M) 18 | 19 | function DataLoader.create(opt) 20 | -- The train and val loader 21 | local loaders = {} 22 | 23 | for i, split in ipairs{'train', 'val', 'val2'} do 24 | local dataset = datasets.create(opt, split) 25 | loaders[i] = M.DataLoader(dataset, opt, split) 26 | end 27 | 28 | return table.unpack(loaders) 29 | end 30 | 31 | function DataLoader:__init(dataset, opt, split) 32 | local manualSeed = opt.manualSeed 33 | local function init() 34 | require('datasets/' .. opt.dataset) 35 | end 36 | local function main(idx) 37 | if manualSeed ~= 0 then 38 | torch.manualSeed(manualSeed + idx) 39 | end 40 | torch.setnumthreads(1) 41 | _G.dataset = dataset 42 | _G.preprocess = dataset:preprocess() 43 | return dataset:size() 44 | end 45 | 46 | local threads, sizes = Threads(opt.nThreads, init, main) 47 | self.nCrops = (split == 'val' and opt.tenCrop) and 10 or 1 48 | self.threads = threads 49 | self.__size = sizes[1][1] 50 | self.split = split 51 | self.synchronous = (opt.dataset=='charadessync') or (opt.dataset=='charadessyncflow') 52 | self.epochSize = tonumber(opt.epochSize) 53 | if self.epochSize and (self.epochSize < 1) then 54 | self.epochSize = torch.floor(self.epochSize * self.__size / opt.batchSize) * opt.batchSize 55 | end 56 | self.testSize = tonumber(opt.testSize) 57 | if self.testSize and (self.testSize < 1) then 58 | self.testSize = torch.floor(self.testSize * self.__size / opt.batchSize) * opt.batchSize 59 | end 60 | if split=='val2' then 61 | self.batchSize = 25 62 | else 63 | self.batchSize = math.floor(opt.batchSize / self.nCrops) 64 | end 65 | end 66 | 67 | function DataLoader:size() 68 | if self.split=='train' and self.epochSize and not (self.epochSize==1) then 69 | return math.ceil(self.epochSize / self.batchSize) 70 | elseif self.split=='val' and self.testSize and not (self.testSize==1) then 71 | return math.ceil(self.testSize / self.batchSize) 72 | else 73 | return math.ceil(self.__size / self.batchSize) 74 | end 75 | end 76 | 77 | function DataLoader:run() 78 | local threads = self.threads 79 | local split = self.split 80 | local size, batchSize = self.__size, self.batchSize 81 | local perm = torch.randperm(size) 82 | if self.split=='train' then 83 | if self.epochSize and not (self.epochSize==1) then 84 | -- Ensure each sample is seen equally often 85 | -- but reduce the epochSize 86 | if not self.perm then 87 | self.perm = torch.randperm(size) 88 | if self.synchronous then self.perm = torch.range(1,size) end 89 | end 90 | if self.perm:size(1) <= self.epochSize then 91 | if self.synchronous then 92 | self.perm = self.perm:cat(torch.range(1,size),1) 93 | else 94 | self.perm = self.perm:cat(torch.randperm(size),1) 95 | end 96 | end 97 | perm = self.perm[{{1,self.epochSize}}] 98 | self.perm = self.perm[{{self.epochSize+1,-1}}] 99 | size = self.epochSize 100 | else 101 | perm = torch.randperm(size) 102 | if self.synchronous then perm = torch.range(1,size) end 103 | end 104 | elseif self.split=='val' then 105 | perm = torch.range(1,size) 106 | if self.testSize and not (self.testSize==1) then 107 | perm = perm[{{1,self.testSize}}] 108 | size = self.testSize 109 | end 110 | elseif self.split=='val2' then 111 | perm = torch.range(1,size) 112 | else 113 | assert(false,'split undefined') 114 | end 115 | 116 | local 
idx, sample = 1, nil 117 | local function enqueue() 118 | while idx <= size and threads:acceptsjob() do 119 | local indices = perm:narrow(1, idx, math.min(batchSize, size - idx + 1)) 120 | threads:addjob( 121 | function(indices, nCrops) 122 | local sz = indices:size(1) 123 | local batch, imageSize 124 | local target 125 | 126 | if split=="val2" then 127 | target = torch.IntTensor(sz,157) 128 | else 129 | target = torch.IntTensor(sz) 130 | end 131 | local names = {} 132 | local ids = {} 133 | local obj = torch.IntTensor(sz) 134 | local verb = torch.IntTensor(sz) 135 | local scene = torch.IntTensor(sz) 136 | for i, idx in ipairs(indices:totable()) do 137 | local sample = _G.dataset:get(idx) 138 | local input = _G.preprocess(sample.input) 139 | if not batch then 140 | imageSize = input:size():totable() 141 | if nCrops > 1 then table.remove(imageSize, 1) end 142 | batch = torch.FloatTensor(sz, nCrops, table.unpack(imageSize)) 143 | end 144 | batch[i]:copy(input) 145 | 146 | if split=="val2" then 147 | target[i]:copy(sample.target) 148 | else 149 | target[i] = sample.target 150 | end 151 | names[i] = sample.name 152 | ids[i] = sample.id 153 | obj[i] = sample.obj and sample.obj or 0 154 | verb[i] = sample.verb and sample.verb or 0 155 | scene[i] = sample.scene and sample.scene or 0 156 | end 157 | collectgarbage() 158 | return { 159 | input = batch:view(sz * nCrops, table.unpack(imageSize)), 160 | target = target, 161 | names = names, 162 | ids = ids, 163 | obj = obj, 164 | verb = verb, 165 | scene = scene, 166 | } 167 | end, 168 | function(_sample_) 169 | sample = _sample_ 170 | end, 171 | indices, 172 | self.nCrops 173 | ) 174 | idx = idx + batchSize 175 | end 176 | end 177 | 178 | local n = 0 179 | local function loop() 180 | enqueue() 181 | if not threads:hasjob() then 182 | return nil 183 | end 184 | threads:dojob() 185 | if threads:haserror() then 186 | threads:synchronize() 187 | end 188 | enqueue() 189 | n = n + 1 190 | return n, sample 191 | end 192 | 193 | return loop 194 | end 195 | 196 | return M.DataLoader 197 | -------------------------------------------------------------------------------- /torch/datasets/README.md: -------------------------------------------------------------------------------- 1 | ## Datasets 2 | 3 | Each dataset consist of two files: `dataset-gen.lua` and `dataset.lua`. The `dataset-gen.lua` is responsible for one-time setup, while 4 | the `dataset.lua` handles the actual data loading. 5 | 6 | ### `dataset-gen.lua` 7 | 8 | The `dataset-gen.lua` performs any necessary one-time setup. For example, the [`cifar10-gen.lua`](cifar10-gen.lua) file downloads the CIFAR-10 dataset, and the [`imagenet-gen.lua`](imagenet-gen.lua) file indexes all the training and validation data. 9 | 10 | The module should have a single function `exec(opt, cacheFile)`. 
11 | - `opt`: the command line options
12 | - `cacheFile`: path to the output cache file
13 | 
14 | ```lua
15 | local M = {}
16 | function M.exec(opt, cacheFile)
17 |    local imageInfo = {}
18 |    -- preprocess dataset, store results in imageInfo, save to cacheFile
19 |    torch.save(cacheFile, imageInfo)
20 | end
21 | return M
22 | ```
23 | 
24 | ### `dataset.lua`
25 | 
26 | The `dataset.lua` should return a class that implements three functions:
27 | - `get(i)`: returns a table containing two entries, `input` and `target`
28 |    - `input`: the training or validation image as a Torch tensor
29 |    - `target`: the image category as a number 1-N
30 | - `size()`: returns the number of entries in the dataset
31 | - `preprocess()`: returns a function that transforms the `input` for data augmentation or input normalization
32 | 
33 | ```lua
34 | local t = require 'datasets/transforms' -- transform helpers used by preprocess() below
35 | local M = {}
36 | local FakeDataset = torch.class('resnet.FakeDataset', M)
37 | 
38 | function FakeDataset:__init(imageInfo, opt, split)
39 |    -- imageInfo: result from dataset-gen.lua
40 |    -- opt: command-line arguments
41 |    -- split: "train" or "val"
42 | end
43 | 
44 | function FakeDataset:get(i)
45 |    return {
46 |       input = torch.Tensor(3, 800, 600):uniform(),
47 |       target = 42,
48 |    }
49 | end
50 | 
51 | function FakeDataset:size()
52 |    -- size of dataset
53 |    return 2000
54 | end
55 | 
56 | function FakeDataset:preprocess()
57 |    -- Scale smaller side to 256 and take 224x224 center-crop
58 |    return t.Compose{
59 |       t.Scale(256),
60 |       t.CenterCrop(224),
61 |    }
62 | end
63 | 
64 | return M.FakeDataset
65 | ```
66 | 
-------------------------------------------------------------------------------- /torch/datasets/charades-gen.lua: --------------------------------------------------------------------------------
 1 | --
 2 | -- Copyright (c) 2016, Facebook, Inc.
 3 | -- All rights reserved.
 4 | --
 5 | -- This source code is licensed under the BSD-style license found in the
 6 | -- LICENSE file in the root directory of this source tree. An additional grant
 7 | -- of patent rights can be found in the PATENTS file in the same directory.
 8 | --
 9 | -- Script to compute the list of Charades filenames and classes for RGB images
10 | --
11 | -- This generates a file gen/charades.t7 which contains the list of all
12 | -- Charades training and validation images and their classes. This script also
13 | -- works for other datasets arranged with the same layout.
14 | --
15 | -- Contributor: Gunnar Atli Sigurdsson
16 | 
17 | local sys = require 'sys'
18 | local ffi = require 'ffi'
19 | 
20 | local M = {}
21 | 
22 | local function parseCSV(filename)
23 |    require 'csvigo'
24 |    print(('Loading csv: %s'):format(filename))
25 |    local all = csvigo.load{path=filename, mode='tidy'}
26 |    local ids = all['id']
27 |    local actionss = all['actions']
28 |    local N = #ids
29 |    local labels = {}
30 |    for i = 1,#ids do
31 |       local id = ids[i]
32 |       local actions = actionss[i]
33 |       local label = {}
34 |       for a in string.gmatch(actions, '([^;]+)') do -- split on ';'
35 |          local a = string.gmatch(a, '([^ ]+)') -- split on ' '
36 |          table.insert(label,{c=a(), s=tonumber(a()), e=tonumber(a())})
37 |       end
38 |       labels[id] = label
39 |    end
40 |    return labels
41 | end
42 | 
43 | 
44 | local function prepare(opt,labels,split)
45 |    require 'sys'
46 |    require 'string'
47 |    local imagePath = torch.CharTensor()
48 |    local imageClass = torch.LongTensor()
49 |    local dir = opt.data
50 |    assert(paths.dirp(dir), 'directory not found: ' .. 
dir) 51 | local imagePaths, imageClasses, ids = {}, {}, {} 52 | local FPS, GAP, testGAP = 24, 4, 25 53 | local e,count = 0, 0 54 | 55 | -- For each video annotation, prepare test files 56 | local imageClasses2 57 | if split=='val_video' then 58 | imageClasses2 = torch.IntTensor(4000000, opt.nClasses):zero() --allocating memory 59 | end 60 | for id,label in pairs(labels) do 61 | e = e+1 62 | if e % 100 == 1 then print(e) end 63 | iddir = dir .. '/' .. id 64 | local f = io.popen(('find -L %s -iname "*.jpg" '):format(iddir)) 65 | if not f then 66 | print('class not found: ' .. id) 67 | print(('find -L %s -iname "*.jpg" '):format(iddir)) 68 | else 69 | local lines = {} 70 | while true do 71 | local line = f:read('*line') 72 | if not line then break end 73 | table.insert(lines,line) 74 | end 75 | local N = #lines 76 | if split=='val_video' then 77 | local target = torch.IntTensor(157,1):zero() 78 | for _,anno in pairs(label) do 79 | target[1+tonumber(string.sub(anno.c,2,-1))] = 1 -- 1-index 80 | end 81 | local tmp = torch.linspace(1,N,testGAP) 82 | for ii = 1,testGAP do 83 | local i = torch.floor(tmp[ii]) 84 | local impath = iddir .. '/' .. id .. '-' .. string.format('%06d',i) .. '.jpg' 85 | count = count + 1 86 | table.insert(imagePaths,impath) 87 | imageClasses2[count]:copy(target) 88 | table.insert(ids,id) 89 | end 90 | elseif opt.setup == 'softmax' then 91 | if #label>0 then 92 | for _,anno in pairs(label) do 93 | for i = 1,N,GAP do 94 | if (anno.s<(i-1)/FPS) and ((i-1)/FPS Generating list of images") 161 | local classList, classToIdx = findClasses(trainDir) 162 | 163 | print(" | finding all validation2 images") 164 | local val2ImagePath, val2ImageClass, val2ids = prepare(opt,labelstest,'val_video') 165 | 166 | print(" | finding all validation images") 167 | local valImagePath, valImageClass, valids = prepare(opt,labelstest,'val') 168 | 169 | print(" | finding all training images") 170 | local trainImagePath, trainImageClass, ids = prepare(opt,labels,'train') 171 | 172 | local info = { 173 | basedir = opt.data, 174 | classList = classList, 175 | train = { 176 | imagePath = trainImagePath, 177 | imageClass = trainImageClass, 178 | ids = ids, 179 | }, 180 | val = { 181 | imagePath = valImagePath, 182 | imageClass = valImageClass, 183 | ids = valids, 184 | }, 185 | val2 = { 186 | imagePath = val2ImagePath, 187 | imageClass = val2ImageClass, 188 | ids = val2ids, 189 | }, 190 | } 191 | 192 | print(" | saving list of images to " .. cacheFile) 193 | torch.save(cacheFile, info) 194 | return info 195 | end 196 | 197 | return M 198 | -------------------------------------------------------------------------------- /torch/datasets/charades.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
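-- Orientation note (added commentary, not part of the original file): get(i)
-- below returns { input = 3xHxW float RGB image, target = class index in
-- 1..157, id = video id }. The (image, label) list is read from the cache
-- that datasets/charades-gen.lua builds out of annotation rows whose
-- 'actions' field looks like "c092 11.90 21.20;c147 0.00 12.60"
-- (class, start seconds, end seconds), and preprocess() supplies the
-- transforms the loader threads apply to every sample.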
8 | -- 9 | -- Charades dataset loader 10 | -- Contributor: Gunnar Atli Sigurdsson 11 | 12 | local image = require 'image' 13 | local paths = require 'paths' 14 | local t = require 'datasets/transforms' 15 | local ffi = require 'ffi' 16 | 17 | local M = {} 18 | local CharadesDataset = torch.class('resnet.CharadesDataset', M) 19 | 20 | function CharadesDataset:__init(imageInfo, opt, split) 21 | self.imageInfo = imageInfo[split] 22 | self.opt = opt 23 | self.split = split 24 | self.dir = opt.data 25 | assert(paths.dirp(self.dir), 'directory does not exist: ' .. self.dir) 26 | end 27 | 28 | function CharadesDataset:get(i) 29 | local path = ffi.string(self.imageInfo.imagePath[i]:data()) 30 | local image = self:_loadImage(paths.concat(self.dir, path)) 31 | local class = self.imageInfo.imageClass[i] 32 | local id = ffi.string(self.imageInfo.ids[i]:data()) 33 | 34 | return { 35 | input = image, 36 | target = class, 37 | id = id 38 | } 39 | end 40 | 41 | function CharadesDataset:_loadImage(path) 42 | local ok, input = pcall(function() 43 | return image.load(path, 3, 'float') 44 | end) 45 | 46 | -- Sometimes image.load fails because the file extension does not match the 47 | -- image format. In that case, use image.decompress on a ByteTensor. 48 | if not ok then 49 | local f = io.open(path, 'r') 50 | assert(f, 'Error reading: ' .. tostring(path)) 51 | local data = f:read('*a') 52 | f:close() 53 | 54 | local b = torch.ByteTensor(string.len(data)) 55 | ffi.copy(b:data(), data, b:size(1)) 56 | 57 | input = image.decompress(b, 3, 'float') 58 | end 59 | 60 | return input 61 | end 62 | 63 | function CharadesDataset:size() 64 | return self.imageInfo.imageClass:size(1) 65 | end 66 | 67 | -- Computed from random subset of ImageNet training images 68 | local meanstd = { 69 | mean = { 103.939/255, 116.779/255, 123.68/255 }, --vgg16 70 | std = { 1.0, 1.0, 1.0 }, -- I don't think caffe normalizes 71 | } 72 | local pca = { 73 | eigval = torch.Tensor{ 0.2175, 0.0188, 0.0045 }, 74 | eigvec = torch.Tensor{ 75 | { -0.5675, 0.7192, 0.4009 }, 76 | { -0.5808, -0.0045, -0.8140 }, 77 | { -0.5836, -0.6948, 0.4203 }, 78 | }, 79 | } 80 | 81 | function CharadesDataset:preprocess() 82 | if self.split == 'train' then 83 | return t.Compose{ 84 | t.RandomSizedCrop(224), 85 | t.ColorJitter({ 86 | brightness = 0.4, 87 | contrast = 0.4, 88 | saturation = 0.4, 89 | }), 90 | t.Lighting(0.1, pca.eigval, pca.eigvec), 91 | t.ColorNormalize(meanstd), 92 | t.HorizontalFlip(0.5), 93 | } 94 | elseif self.split == 'val' then 95 | local Crop = self.opt.tenCrop and t.TenCrop or t.CenterCrop 96 | return t.Compose{ 97 | t.Scale(256), 98 | t.ColorNormalize(meanstd), 99 | Crop(224), 100 | } 101 | elseif self.split == 'val2' then 102 | local Crop = self.opt.tenCrop and t.TenCrop or t.CenterCrop 103 | return t.Compose{ 104 | t.Scale(256), 105 | t.ColorNormalize(meanstd), 106 | Crop(224), 107 | } 108 | else 109 | error('invalid split: ' .. self.split) 110 | end 111 | end 112 | 113 | return M.CharadesDataset 114 | -------------------------------------------------------------------------------- /torch/datasets/charadesflow-gen.lua: -------------------------------------------------------------------------------- 1 | -- This source code is licensed under the BSD-style license found in the 2 | -- LICENSE file in the root directory of this source tree. An additional grant 3 | -- of patent rights can be found in the PATENTS file in the same directory. 
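-- Layout note (added commentary): each video directory is expected to hold
-- paired horizontal/vertical flow frames named <id>-NNNNNNx.jpg and
-- <id>-NNNNNNy.jpg. prepare() below therefore halves the jpg count ("to
-- account for x and y") and keeps test frames flowframes+1 short of the last
-- frame, so that charadesflow.lua can always load a full stack of 10
-- consecutive (x,y) pairs, i.e. 20 input channels, for every selected frame.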
4 | -- 5 | -- Script to compute list of Charades filenames and classes 6 | -- 7 | -- This generates a file gen/charadesflow.t7 which contains the list of all 8 | -- Charades training and validation images and their classes. This script also 9 | -- works for other datasets arragned with the same layout. 10 | -- 11 | -- Contributor: Gunnar Atli Sigurdsson 12 | 13 | local sys = require 'sys' 14 | local ffi = require 'ffi' 15 | 16 | local M = {} 17 | 18 | local function parseCSV(filename) 19 | require 'csvigo' 20 | print(('Loading csv: %s'):format(filename)) 21 | local all = csvigo.load{path=filename, mode='tidy'} 22 | local ids = all['id'] 23 | local actionss = all['actions'] 24 | local N = #ids 25 | local labels = {} 26 | for i = 1,#ids do 27 | local id = ids[i] 28 | local actions = actionss[i] 29 | local label = {} 30 | for a in string.gmatch(actions, '([^;]+)') do -- split on ';' 31 | local a = string.gmatch(a, '([^ ]+)') -- split on ' ' 32 | table.insert(label,{c=a(), s=tonumber(a()), e=tonumber(a())}) 33 | end 34 | labels[id] = label 35 | end 36 | return labels 37 | end 38 | 39 | 40 | local function prepare(opt,labels,split) 41 | require 'sys' 42 | require 'string' 43 | local imagePath = torch.CharTensor() 44 | local imageClass = torch.LongTensor() 45 | local dir = opt.data 46 | assert(paths.dirp(dir), 'directory not found: ' .. dir) 47 | local imagePaths = {} 48 | local imageClasses = {} 49 | local ids = {} 50 | local FPS = 24 51 | local GAP = 4 52 | local testGAP = 25 53 | local flowframes = 10 54 | 55 | local e = 0 56 | local count = 0 57 | -- For each video annotation, prepare test files 58 | local imageClasses2 59 | if split=='val_video' then 60 | imageClasses2 = torch.IntTensor(4000000, opt.nClasses):zero() 61 | end 62 | for id,label in pairs(labels) do 63 | e = e+1 64 | if e % 100 == 1 then print(e) end 65 | iddir = dir .. '/' .. id 66 | local f = io.popen(('find -L %s -iname "*.jpg" '):format(iddir)) 67 | if not f then 68 | print('class not found: ' .. id) 69 | print(('find -L %s -iname "*.jpg" '):format(iddir)) 70 | else 71 | local lines = {} 72 | while true do 73 | local line = f:read('*line') 74 | if not line then break end 75 | table.insert(lines,line) 76 | end 77 | local N = torch.floor(#lines/2) -- to account for x and y 78 | if split=='val_video' then 79 | local target = torch.IntTensor(157,1):zero() 80 | for _,anno in pairs(label) do 81 | target[1+tonumber(string.sub(anno.c,2,-1))] = 1 -- 1-index 82 | end 83 | local tmp = torch.linspace(1,N-flowframes-1,testGAP) -- -1 so we don't get bad flow 84 | for ii = 1,testGAP do 85 | local i = torch.floor(tmp[ii]) 86 | local impath = iddir .. '/' .. id .. '-' .. string.format('%06d',i) .. 'x' .. 
'.jpg' 87 | count = count + 1 88 | table.insert(imagePaths,impath) 89 | imageClasses2[count]:copy(target) 90 | table.insert(ids,id) 91 | end 92 | elseif opt.setup == 'softmax' then 93 | if #label>0 then 94 | for _,anno in pairs(label) do 95 | for i = 1,N,GAP do 96 | if (anno.s<(i-1)/FPS) and ((i-1)/FPS Generating list of images") 163 | local classList, classToIdx = findClasses(trainDir) 164 | 165 | print(" | finding all validation2 images") 166 | local val2ImagePath, val2ImageClass, val2ids = prepare(opt,labelstest,'val_video') 167 | 168 | print(" | finding all validation images") 169 | local valImagePath, valImageClass, valids = prepare(opt,labelstest,'val') 170 | 171 | print(" | finding all training images") 172 | local trainImagePath, trainImageClass, ids = prepare(opt,labels,'train') 173 | 174 | local info = { 175 | basedir = opt.data, 176 | classList = classList, 177 | train = { 178 | imagePath = trainImagePath, 179 | imageClass = trainImageClass, 180 | ids = ids, 181 | }, 182 | val = { 183 | imagePath = valImagePath, 184 | imageClass = valImageClass, 185 | ids = valids, 186 | }, 187 | val2 = { 188 | imagePath = val2ImagePath, 189 | imageClass = val2ImageClass, 190 | ids = val2ids, 191 | }, 192 | } 193 | 194 | print(" | saving list of images to " .. cacheFile) 195 | torch.save(cacheFile, info) 196 | return info 197 | end 198 | 199 | return M 200 | -------------------------------------------------------------------------------- /torch/datasets/charadesflow.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- ImageNet dataset loader 10 | -- 11 | 12 | local image = require 'image' 13 | local paths = require 'paths' 14 | local t = require 'datasets/transforms' 15 | local ffi = require 'ffi' 16 | 17 | local M = {} 18 | local CharadesDataset = torch.class('resnet.CharadesDataset', M) 19 | 20 | function CharadesDataset:__init(imageInfo, opt, split) 21 | self.imageInfo = imageInfo[split] 22 | self.opt = opt 23 | self.split = split 24 | self.dir = opt.data 25 | assert(paths.dirp(self.dir), 'directory does not exist: ' .. self.dir) 26 | end 27 | 28 | function CharadesDataset:get(i) 29 | -- This function loads in 20 consecutive optical flow images (10 x and 10 y images) 30 | -- Follwing the two-stream architecture 31 | local path = ffi.string(self.imageInfo.imagePath[i]:data()) 32 | local image1 = self:_loadImage(paths.concat(self.dir, path)) 33 | local finalimage = torch.Tensor(20,image1:size(2),image1:size(3)) 34 | -- the path is of the format */?????-000000x.jpg 35 | local prefix = string.sub(path,1,#path-6-5) 36 | local number = string.sub(path,#path-6-5+1,#path-5) 37 | for j = 1,10 do 38 | local thispath1 = prefix .. string.format('%06d',number-1+j) .. 'x' .. '.jpg' 39 | local thispath2 = prefix .. string.format('%06d',number-1+j) .. 'y' .. 
'.jpg' 40 | local image1 = self:_loadImage(paths.concat(self.dir, thispath1)) 41 | local image2 = self:_loadImage(paths.concat(self.dir, thispath2)) 42 | finalimage[{(j-1)*2+1,{},{}}] = image1 43 | finalimage[{(j-1)*2+1+1,{},{}}] = image2 44 | end 45 | 46 | local class = self.imageInfo.imageClass[i] 47 | local id = ffi.string(self.imageInfo.ids[i]:data()) 48 | 49 | return { 50 | input = finalimage, 51 | target = class, 52 | id = id 53 | } 54 | end 55 | 56 | function CharadesDataset:_loadImage(path) 57 | local ok, input = pcall(function() 58 | --return image.load(path, 3, 'float') 59 | return image.load(path, 1, 'float') 60 | end) 61 | 62 | -- Sometimes image.load fails because the file extension does not match the 63 | -- image format. In that case, use image.decompress on a ByteTensor. 64 | if not ok then 65 | local f = io.open(path, 'r') 66 | assert(f, 'Error reading: ' .. tostring(path)) 67 | local data = f:read('*a') 68 | f:close() 69 | 70 | local b = torch.ByteTensor(string.len(data)) 71 | ffi.copy(b:data(), data, b:size(1)) 72 | 73 | --input = image.decompress(b, 3, 'float') 74 | input = image.decompress(b, 1, 'float') 75 | end 76 | 77 | return input 78 | end 79 | 80 | function CharadesDataset:size() 81 | return self.imageInfo.imageClass:size(1) 82 | end 83 | 84 | -- Computed from random subset of ImageNet training images 85 | local meanstd = { 86 | mean = { 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255 }, --flow vgg16 87 | std = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }, -- I don't think caffe normalizes 88 | } 89 | local pca = { 90 | eigval = torch.Tensor{ 0.2175, 0.0188, 0.0045 }, 91 | eigvec = torch.Tensor{ 92 | { -0.5675, 0.7192, 0.4009 }, 93 | { -0.5808, -0.0045, -0.8140 }, 94 | { -0.5836, -0.6948, 0.4203 }, 95 | }, 96 | } 97 | 98 | function CharadesDataset:preprocess() 99 | if self.split == 'train' then 100 | return t.Compose{ 101 | t.RandomSizedCrop(224), 102 | --t.ColorJitter({ 103 | -- brightness = 0.4, 104 | -- contrast = 0.4, 105 | -- saturation = 0.4, 106 | --}), 107 | --t.Lighting(0.1, pca.eigval, pca.eigvec), 108 | t.ColorNormalize(meanstd), 109 | t.HorizontalFlip(0.5), 110 | } 111 | elseif self.split == 'val' then 112 | local Crop = self.opt.tenCrop and t.TenCrop or t.CenterCrop 113 | return t.Compose{ 114 | t.Scale(256), 115 | t.ColorNormalize(meanstd), 116 | Crop(224), 117 | } 118 | elseif self.split == 'val2' then 119 | local Crop = self.opt.tenCrop and t.TenCrop or t.CenterCrop 120 | return t.Compose{ 121 | t.Scale(256), 122 | t.ColorNormalize(meanstd), 123 | Crop(224), 124 | } 125 | else 126 | error('invalid split: ' .. self.split) 127 | end 128 | end 129 | 130 | return M.CharadesDataset 131 | -------------------------------------------------------------------------------- /torch/datasets/charadessync-gen.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
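-- Sampling note (added commentary): unlike charades-gen.lua, this generator
-- pools all (frame,label) pairs of a video and then draws exactly
-- opt.batchSize of them, sorted by frame index (videos with fewer pairs are
-- skipped). Combined with the sequential (non-shuffled) permutation that
-- dataloader.lua applies when opt.dataset=='charadessync', every batch
-- therefore holds temporally ordered frames from a single video, which is
-- what the LSTM models require.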
8 | -- 9 | -- Script to compute list of Charades filenames and classes rgb images 10 | -- 11 | -- This version is different from charades-gen.lua as this loads videos one by one 12 | -- To train models that require sequential data, such as LSTM 13 | -- 14 | -- This generates a file gen/charades.t7 which contains the list of all 15 | -- Charades training and validation images and their classes. This script also 16 | -- works for other datasets arragned with the same layout. 17 | -- 18 | -- Contributor: Gunnar Atli Sigurdsson 19 | 20 | local sys = require 'sys' 21 | local ffi = require 'ffi' 22 | 23 | local M = {} 24 | 25 | local function parseCSV(filename) 26 | require 'csvigo' 27 | print(('Loading csv: %s'):format(filename)) 28 | local all = csvigo.load{path=filename, mode='tidy'} 29 | local ids = all['id'] 30 | local actionss = all['actions'] 31 | local N = #ids 32 | local labels = {} 33 | for i = 1,#ids do 34 | local id = ids[i] 35 | local actions = actionss[i] 36 | local label = {} 37 | for a in string.gmatch(actions, '([^;]+)') do -- split on ';' 38 | local a = string.gmatch(a, '([^ ]+)') -- split on ' ' 39 | table.insert(label,{c=a(), s=tonumber(a()), e=tonumber(a())}) 40 | end 41 | labels[id] = label 42 | end 43 | return labels 44 | end 45 | 46 | 47 | local function prepare(opt,labels,split) 48 | require 'sys' 49 | require 'string' 50 | local imagePath = torch.CharTensor() 51 | local imageClass = torch.LongTensor() 52 | local dir = opt.data 53 | assert(paths.dirp(dir), 'directory not found: ' .. dir) 54 | local imagePaths, imageClasses, ids = {}, {}, {} 55 | local FPS, GAP, testGAP = 24, 4, 25 56 | local e,count = 0, 0 57 | 58 | -- For each video annotation, prepare test files 59 | local imageClasses2 60 | if split=='val_video' then 61 | imageClasses2 = torch.IntTensor(4000000, opt.nClasses):zero() --allocating memory 62 | end 63 | for id,label in pairs(labels) do 64 | e = e+1 65 | if e % 100 == 1 then print(e) end 66 | iddir = dir .. '/' .. id 67 | local f = io.popen(('find -L %s -iname "*.jpg" '):format(iddir)) 68 | if not f then 69 | print('class not found: ' .. id) 70 | print(('find -L %s -iname "*.jpg" '):format(iddir)) 71 | else 72 | local lines = {} 73 | while true do 74 | local line = f:read('*line') 75 | if not line then break end 76 | table.insert(lines,line) 77 | end 78 | local N = #lines 79 | if split=='val_video' then 80 | local target = torch.IntTensor(157,1):zero() 81 | for _,anno in pairs(label) do 82 | target[1+tonumber(string.sub(anno.c,2,-1))] = 1 -- 1-index 83 | end 84 | local tmp = torch.linspace(1,N,testGAP) 85 | for ii = 1,testGAP do 86 | local i = torch.floor(tmp[ii]) 87 | local impath = iddir .. '/' .. id .. '-' .. string.format('%06d',i) .. 
'.jpg' 88 | count = count + 1 89 | table.insert(imagePaths,impath) 90 | imageClasses2[count]:copy(target) 91 | table.insert(ids,id) 92 | end 93 | elseif opt.setup == 'softmax' then 94 | local localimagePaths = {} 95 | local localimageClasses = {} 96 | local localids = {} 97 | if #label>0 then 98 | -- To generate training data with softmax loss (only one label) 99 | -- We create a sorted pool with all pairs of (frames,label) 100 | -- and then randomly select a subset of those according to our batch size 101 | -- Someone should really figure out how to properly use sigmoid loss for this 102 | for i = 1,N,GAP do 103 | for _,anno in pairs(label) do 104 | if (anno.s<(i-1)/FPS) and ((i-1)/FPS=opt.batchSize then 116 | local inds = torch.multinomial(torch.Tensor(1,Nex):fill(1),opt.batchSize)[1] 117 | inds = inds:sort() 118 | assert(inds:size(1)==opt.batchSize) 119 | for aa = 1,opt.batchSize do 120 | a = inds[aa] 121 | table.insert(imagePaths,localimagePaths[a]) 122 | table.insert(imageClasses, localimageClasses[a]) -- 1-index 123 | table.insert(ids,localids[a]) 124 | end 125 | end 126 | elseif opt.setup == 'sigmoid' then 127 | -- TODO 128 | assert(false,'Invalid opt.setup') 129 | else 130 | assert(false,'Invalid opt.setup') 131 | end 132 | f:close() 133 | end 134 | end 135 | 136 | -- Convert the generated list to a tensor for faster loading 137 | local nImages = #imagePaths 138 | local maxLength = -1 139 | for _,p in pairs(imagePaths) do 140 | maxLength = math.max(maxLength, #p + 1) 141 | end 142 | local imagePath = torch.CharTensor(nImages, maxLength):zero() 143 | for i, path in ipairs(imagePaths) do 144 | ffi.copy(imagePath[i]:data(), path) 145 | end 146 | 147 | local maxLength2 = -1 148 | for _,p in pairs(ids) do 149 | maxLength2 = math.max(maxLength2, #p + 1) 150 | end 151 | local ids_tensor = torch.CharTensor(nImages, maxLength2):zero() 152 | for i, path in ipairs(ids) do 153 | ffi.copy(ids_tensor[i]:data(), path) 154 | end 155 | 156 | local imageClass = torch.LongTensor(imageClasses) 157 | if split=='val_video' then 158 | imageClass = imageClasses2[{{1,count},{}}] 159 | end 160 | assert(imagePath:size(1)==imageClass:size(1),"Sizes do not match") 161 | 162 | return imagePath, imageClass, ids_tensor 163 | end 164 | 165 | 166 | local function findClasses(dir) 167 | return Nil, Nil 168 | end 169 | 170 | 171 | function M.exec(opt, cacheFile) 172 | -- find the image path names 173 | local imagePath = torch.CharTensor() -- path to each image in dataset 174 | local imageClass = torch.LongTensor() -- class index of each image (class index in self.classes) 175 | 176 | local filename = opt.trainfile 177 | local filenametest = opt.testfile 178 | local labels = parseCSV(filename) 179 | print('done parsing train csv') 180 | local labelstest = parseCSV(filenametest) 181 | print('done parsing test csv') 182 | 183 | print("=> Generating list of images") 184 | local classList, classToIdx = findClasses(trainDir) 185 | 186 | print(" | finding all validation2 images") 187 | local val2ImagePath, val2ImageClass, val2ids = prepare(opt,labelstest,'val_video') 188 | 189 | print(" | finding all validation images") 190 | local valImagePath, valImageClass, valids = prepare(opt,labelstest,'val') 191 | 192 | print(" | finding all training images") 193 | local trainImagePath, trainImageClass, ids = prepare(opt,labels,'train') 194 | 195 | local info = { 196 | basedir = opt.data, 197 | classList = classList, 198 | train = { 199 | imagePath = trainImagePath, 200 | imageClass = trainImageClass, 201 | ids = ids, 202 | }, 203 | 
val = { 204 | imagePath = valImagePath, 205 | imageClass = valImageClass, 206 | ids = valids, 207 | }, 208 | val2 = { 209 | imagePath = val2ImagePath, 210 | imageClass = val2ImageClass, 211 | ids = val2ids, 212 | }, 213 | } 214 | 215 | print(" | saving list of images to " .. cacheFile) 216 | torch.save(cacheFile, info) 217 | return info 218 | end 219 | 220 | return M 221 | -------------------------------------------------------------------------------- /torch/datasets/charadessync.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Charades dataset loader 10 | -- Contributor: Gunnar Atli Sigurdsson 11 | 12 | local image = require 'image' 13 | local paths = require 'paths' 14 | local t = require 'datasets/transforms' 15 | local ffi = require 'ffi' 16 | 17 | local M = {} 18 | local CharadesDataset = torch.class('resnet.CharadesDataset', M) 19 | 20 | function CharadesDataset:__init(imageInfo, opt, split) 21 | self.imageInfo = imageInfo[split] 22 | self.opt = opt 23 | self.split = split 24 | self.dir = opt.data 25 | assert(paths.dirp(self.dir), 'directory does not exist: ' .. self.dir) 26 | end 27 | 28 | function CharadesDataset:get(i) 29 | local path = ffi.string(self.imageInfo.imagePath[i]:data()) 30 | local image = self:_loadImage(paths.concat(self.dir, path)) 31 | local class = self.imageInfo.imageClass[i] 32 | local id = ffi.string(self.imageInfo.ids[i]:data()) 33 | 34 | return { 35 | input = image, 36 | target = class, 37 | id = id 38 | } 39 | end 40 | 41 | function CharadesDataset:_loadImage(path) 42 | local ok, input = pcall(function() 43 | return image.load(path, 3, 'float') 44 | end) 45 | 46 | -- Sometimes image.load fails because the file extension does not match the 47 | -- image format. In that case, use image.decompress on a ByteTensor. 48 | if not ok then 49 | local f = io.open(path, 'r') 50 | assert(f, 'Error reading: ' .. 
tostring(path)) 51 | local data = f:read('*a') 52 | f:close() 53 | 54 | local b = torch.ByteTensor(string.len(data)) 55 | ffi.copy(b:data(), data, b:size(1)) 56 | 57 | input = image.decompress(b, 3, 'float') 58 | end 59 | 60 | return input 61 | end 62 | 63 | function CharadesDataset:size() 64 | return self.imageInfo.imageClass:size(1) 65 | end 66 | 67 | -- Computed from random subset of ImageNet training images 68 | local meanstd = { 69 | mean = { 103.939/255, 116.779/255, 123.68/255 }, --vgg16 70 | std = { 1.0, 1.0, 1.0 }, -- I don't think caffe normalizes 71 | } 72 | local pca = { 73 | eigval = torch.Tensor{ 0.2175, 0.0188, 0.0045 }, 74 | eigvec = torch.Tensor{ 75 | { -0.5675, 0.7192, 0.4009 }, 76 | { -0.5808, -0.0045, -0.8140 }, 77 | { -0.5836, -0.6948, 0.4203 }, 78 | }, 79 | } 80 | 81 | function CharadesDataset:preprocess() 82 | if self.split == 'train' then 83 | return t.Compose{ 84 | t.RandomSizedCrop(224), 85 | t.ColorJitter({ 86 | brightness = 0.4, 87 | contrast = 0.4, 88 | saturation = 0.4, 89 | }), 90 | t.Lighting(0.1, pca.eigval, pca.eigvec), 91 | t.ColorNormalize(meanstd), 92 | t.HorizontalFlip(0.5), 93 | } 94 | elseif self.split == 'val' then 95 | local Crop = self.opt.tenCrop and t.TenCrop or t.CenterCrop 96 | return t.Compose{ 97 | t.Scale(256), 98 | t.ColorNormalize(meanstd), 99 | Crop(224), 100 | } 101 | elseif self.split == 'val2' then 102 | local Crop = self.opt.tenCrop and t.TenCrop or t.CenterCrop 103 | return t.Compose{ 104 | t.Scale(256), 105 | t.ColorNormalize(meanstd), 106 | Crop(224), 107 | } 108 | else 109 | error('invalid split: ' .. self.split) 110 | end 111 | end 112 | 113 | return M.CharadesDataset 114 | -------------------------------------------------------------------------------- /torch/datasets/charadessyncflow-gen.lua: -------------------------------------------------------------------------------- 1 | -- This source code is licensed under the BSD-style license found in the 2 | -- LICENSE file in the root directory of this source tree. An additional grant 3 | -- of patent rights can be found in the PATENTS file in the same directory. 4 | -- 5 | -- Script to compute list of Charades filenames and classes 6 | -- 7 | -- This generates a file gen/charadesflow.t7 which contains the list of all 8 | -- Charades training and validation images and their classes. This script also 9 | -- works for other datasets arragned with the same layout. 10 | -- 11 | -- Contributor: Gunnar Atli Sigurdsson 12 | 13 | local sys = require 'sys' 14 | local ffi = require 'ffi' 15 | 16 | local M = {} 17 | 18 | local function parseCSV(filename) 19 | require 'csvigo' 20 | print(('Loading csv: %s'):format(filename)) 21 | local all = csvigo.load{path=filename, mode='tidy'} 22 | local ids = all['id'] 23 | local actionss = all['actions'] 24 | local N = #ids 25 | local labels = {} 26 | for i = 1,#ids do 27 | local id = ids[i] 28 | local actions = actionss[i] 29 | local label = {} 30 | for a in string.gmatch(actions, '([^;]+)') do -- split on ';' 31 | local a = string.gmatch(a, '([^ ]+)') -- split on ' ' 32 | table.insert(label,{c=a(), s=tonumber(a()), e=tonumber(a())}) 33 | end 34 | labels[id] = label 35 | end 36 | return labels 37 | end 38 | 39 | 40 | local function prepare(opt,labels,split) 41 | require 'sys' 42 | require 'string' 43 | local imagePath = torch.CharTensor() 44 | local imageClass = torch.LongTensor() 45 | local dir = opt.data 46 | assert(paths.dirp(dir), 'directory not found: ' .. 
dir) 47 | local imagePaths = {} 48 | local imageClasses = {} 49 | local ids = {} 50 | local FPS = 24 51 | local GAP = 4 52 | local testGAP = 25 53 | local flowframes = 10 54 | 55 | local e = 0 56 | local count = 0 57 | -- For each video annotation, prepare test files 58 | local imageClasses2 59 | if split=='val_video' then 60 | imageClasses2 = torch.IntTensor(4000000, opt.nClasses):zero() 61 | end 62 | for id,label in pairs(labels) do 63 | e = e+1 64 | if e % 100 == 1 then print(e) end 65 | iddir = dir .. '/' .. id 66 | local f = io.popen(('find -L %s -iname "*.jpg" '):format(iddir)) 67 | if not f then 68 | print('class not found: ' .. id) 69 | print(('find -L %s -iname "*.jpg" '):format(iddir)) 70 | else 71 | local lines = {} 72 | while true do 73 | local line = f:read('*line') 74 | if not line then break end 75 | table.insert(lines,line) 76 | end 77 | local N = torch.floor(#lines/2) -- to account for x and y 78 | if split=='val_video' then 79 | local target = torch.IntTensor(157,1):zero() 80 | for _,anno in pairs(label) do 81 | target[1+tonumber(string.sub(anno.c,2,-1))] = 1 -- 1-index 82 | end 83 | local tmp = torch.linspace(1,N-flowframes-1,testGAP) -- -1 so we don't get bad flow 84 | for ii = 1,testGAP do 85 | local i = torch.floor(tmp[ii]) 86 | local impath = iddir .. '/' .. id .. '-' .. string.format('%06d',i) .. 'x' .. '.jpg' 87 | count = count + 1 88 | table.insert(imagePaths,impath) 89 | imageClasses2[count]:copy(target) 90 | table.insert(ids,id) 91 | end 92 | elseif opt.setup == 'softmax' then 93 | local localimagePaths = {} 94 | local localimageClasses = {} 95 | local localids = {} 96 | if #label>0 then 97 | for i = 1,N,GAP do 98 | for _,anno in pairs(label) do 99 | if (anno.s<(i-1)/FPS) and ((i-1)/FPS=opt.batchSize then 113 | local inds = torch.multinomial(torch.Tensor(1,Nex):fill(1),opt.batchSize)[1] 114 | inds = inds:sort() 115 | assert(inds:size(1)==opt.batchSize) 116 | for aa = 1,opt.batchSize do 117 | a = inds[aa] 118 | table.insert(imagePaths,localimagePaths[a]) 119 | table.insert(imageClasses, localimageClasses[a]) -- 1-index 120 | table.insert(ids,localids[a]) 121 | end 122 | end 123 | elseif opt.setup == 'sigmoid' then 124 | -- TODO 125 | assert(false,'Invalid opt.setup') 126 | else 127 | assert(false,'Invalid opt.setup') 128 | end 129 | f:close() 130 | end 131 | end 132 | 133 | -- Convert the generated list to a tensor for faster loading 134 | local nImages = #imagePaths 135 | local maxLength = -1 136 | for _,p in pairs(imagePaths) do 137 | maxLength = math.max(maxLength, #p + 1) 138 | end 139 | local imagePath = torch.CharTensor(nImages, maxLength):zero() 140 | for i, path in ipairs(imagePaths) do 141 | ffi.copy(imagePath[i]:data(), path) 142 | end 143 | local maxLength2 = -1 144 | for _,p in pairs(ids) do 145 | maxLength2 = math.max(maxLength2, #p + 1) 146 | end 147 | local ids_tensor = torch.CharTensor(nImages, maxLength2):zero() 148 | for i, path in ipairs(ids) do 149 | ffi.copy(ids_tensor[i]:data(), path) 150 | end 151 | local imageClass = torch.LongTensor(imageClasses) 152 | if split=='val_video' then 153 | imageClass = imageClasses2[{{1,count},{}}] 154 | end 155 | assert(imagePath:size(1)==imageClass:size(1),"Sizes do not match") 156 | 157 | return imagePath, imageClass, ids_tensor 158 | end 159 | 160 | 161 | local function findClasses(dir) 162 | return Nil, Nil 163 | end 164 | 165 | 166 | function M.exec(opt, cacheFile) 167 | -- find the image path names 168 | local imagePath = torch.CharTensor() -- path to each image in dataset 169 | local imageClass = 
torch.LongTensor() -- class index of each image (class index in self.classes) 170 | 171 | local filename = opt.trainfile 172 | local filenametest = opt.testfile 173 | local labels = parseCSV(filename) 174 | print('done parsing train csv') 175 | local labelstest = parseCSV(filenametest) 176 | print('done parsing test csv') 177 | 178 | print("=> Generating list of images") 179 | local classList, classToIdx = findClasses(trainDir) 180 | 181 | print(" | finding all validation2 images") 182 | local val2ImagePath, val2ImageClass, val2ids = prepare(opt,labelstest,'val_video') 183 | 184 | print(" | finding all validation images") 185 | local valImagePath, valImageClass, valids = prepare(opt,labelstest,'val') 186 | 187 | print(" | finding all training images") 188 | local trainImagePath, trainImageClass, ids = prepare(opt,labels,'train') 189 | 190 | local info = { 191 | basedir = opt.data, 192 | classList = classList, 193 | train = { 194 | imagePath = trainImagePath, 195 | imageClass = trainImageClass, 196 | ids = ids, 197 | }, 198 | val = { 199 | imagePath = valImagePath, 200 | imageClass = valImageClass, 201 | ids = valids, 202 | }, 203 | val2 = { 204 | imagePath = val2ImagePath, 205 | imageClass = val2ImageClass, 206 | ids = val2ids, 207 | }, 208 | } 209 | 210 | print(" | saving list of images to " .. cacheFile) 211 | torch.save(cacheFile, info) 212 | return info 213 | end 214 | 215 | return M 216 | -------------------------------------------------------------------------------- /torch/datasets/charadessyncflow.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- ImageNet dataset loader 10 | -- 11 | 12 | local image = require 'image' 13 | local paths = require 'paths' 14 | local t = require 'datasets/transforms' 15 | local ffi = require 'ffi' 16 | 17 | local M = {} 18 | local CharadesDataset = torch.class('resnet.CharadesDataset', M) 19 | 20 | function CharadesDataset:__init(imageInfo, opt, split) 21 | self.imageInfo = imageInfo[split] 22 | self.opt = opt 23 | self.split = split 24 | self.dir = opt.data 25 | assert(paths.dirp(self.dir), 'directory does not exist: ' .. self.dir) 26 | end 27 | 28 | function CharadesDataset:get(i) 29 | -- This function loads in 20 consecutive optical flow images (10 x and 10 y images) 30 | -- Follwing the two-stream architecture 31 | local path = ffi.string(self.imageInfo.imagePath[i]:data()) 32 | local image1 = self:_loadImage(paths.concat(self.dir, path)) 33 | local finalimage = torch.Tensor(20,image1:size(2),image1:size(3)) 34 | -- the path is of the format */?????-000000x.jpg 35 | local prefix = string.sub(path,1,#path-6-5) 36 | local number = string.sub(path,#path-6-5+1,#path-5) 37 | for j = 1,10 do 38 | local thispath1 = prefix .. string.format('%06d',number-1+j) .. 'x' .. '.jpg' 39 | local thispath2 = prefix .. string.format('%06d',number-1+j) .. 'y' .. 
'.jpg' 40 | local image1 = self:_loadImage(paths.concat(self.dir, thispath1)) 41 | local image2 = self:_loadImage(paths.concat(self.dir, thispath2)) 42 | finalimage[{(j-1)*2+1,{},{}}] = image1 43 | finalimage[{(j-1)*2+1+1,{},{}}] = image2 44 | end 45 | 46 | local class = self.imageInfo.imageClass[i] 47 | local id = ffi.string(self.imageInfo.ids[i]:data()) 48 | 49 | return { 50 | input = finalimage, 51 | target = class, 52 | id = id 53 | } 54 | end 55 | 56 | function CharadesDataset:_loadImage(path) 57 | local ok, input = pcall(function() 58 | --return image.load(path, 3, 'float') 59 | return image.load(path, 1, 'float') 60 | end) 61 | 62 | -- Sometimes image.load fails because the file extension does not match the 63 | -- image format. In that case, use image.decompress on a ByteTensor. 64 | if not ok then 65 | local f = io.open(path, 'r') 66 | assert(f, 'Error reading: ' .. tostring(path)) 67 | local data = f:read('*a') 68 | f:close() 69 | 70 | local b = torch.ByteTensor(string.len(data)) 71 | ffi.copy(b:data(), data, b:size(1)) 72 | 73 | --input = image.decompress(b, 3, 'float') 74 | input = image.decompress(b, 1, 'float') 75 | end 76 | 77 | return input 78 | end 79 | 80 | function CharadesDataset:size() 81 | return self.imageInfo.imageClass:size(1) 82 | end 83 | 84 | -- Computed from random subset of ImageNet training images 85 | local meanstd = { 86 | mean = { 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255, 128.0/255 }, --flow vgg16 87 | std = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }, -- I don't think caffe normalizes 88 | } 89 | local pca = { 90 | eigval = torch.Tensor{ 0.2175, 0.0188, 0.0045 }, 91 | eigvec = torch.Tensor{ 92 | { -0.5675, 0.7192, 0.4009 }, 93 | { -0.5808, -0.0045, -0.8140 }, 94 | { -0.5836, -0.6948, 0.4203 }, 95 | }, 96 | } 97 | 98 | function CharadesDataset:preprocess() 99 | if self.split == 'train' then 100 | return t.Compose{ 101 | t.RandomSizedCrop(224), 102 | --t.ColorJitter({ 103 | -- brightness = 0.4, 104 | -- contrast = 0.4, 105 | -- saturation = 0.4, 106 | --}), 107 | --t.Lighting(0.1, pca.eigval, pca.eigvec), 108 | t.ColorNormalize(meanstd), 109 | t.HorizontalFlip(0.5), 110 | } 111 | elseif self.split == 'val' then 112 | local Crop = self.opt.tenCrop and t.TenCrop or t.CenterCrop 113 | return t.Compose{ 114 | t.Scale(256), 115 | t.ColorNormalize(meanstd), 116 | Crop(224), 117 | } 118 | elseif self.split == 'val2' then 119 | local Crop = self.opt.tenCrop and t.TenCrop or t.CenterCrop 120 | return t.Compose{ 121 | t.Scale(256), 122 | t.ColorNormalize(meanstd), 123 | Crop(224), 124 | } 125 | else 126 | error('invalid split: ' .. self.split) 127 | end 128 | end 129 | 130 | return M.CharadesDataset 131 | -------------------------------------------------------------------------------- /torch/datasets/cifar10-gen.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
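-- Usage note (added commentary): exec(opt, cacheFile) below saves a table
-- with 'train' and 'val' entries to cacheFile; datasets/cifar10.lua then
-- indexes that table by split name in CifarDataset:__init, so these keys
-- must match the split names requested by the data loader.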
8 | -- 9 | -- Script to download the CIFAR-10 dataset and cache it as a single tensor file 10 | -- 11 | -- This automatically downloads the CIFAR-10 dataset from 12 | -- http://torch7.s3-website-us-east-1.amazonaws.com/data/cifar-10-torch.tar.gz 13 | -- 14 | 15 | local URL = 'http://torch7.s3-website-us-east-1.amazonaws.com/data/cifar-10-torch.tar.gz' 16 | 17 | local M = {} 18 | 19 | local function convertToTensor(files) 20 | local data, labels 21 | 22 | for _, file in ipairs(files) do 23 | local m = torch.load(file, 'ascii') 24 | if not data then 25 | data = m.data:t() 26 | labels = m.labels:squeeze() 27 | else 28 | data = torch.cat(data, m.data:t(), 1) 29 | labels = torch.cat(labels, m.labels:squeeze()) 30 | end 31 | end 32 | 33 | -- This is *very* important. The downloaded files have labels 0-9, which do 34 | -- not work with CrossEntropyCriterion 35 | labels:add(1) 36 | 37 | return { 38 | data = data:contiguous():view(-1, 3, 32, 32), 39 | labels = labels, 40 | } 41 | end 42 | 43 | function M.exec(opt, cacheFile) 44 | print("=> Downloading CIFAR-10 dataset from " .. URL) 45 | local ok = os.execute('curl ' .. URL .. ' | tar xz -C gen/') 46 | assert(ok == true or ok == 0, 'error downloading CIFAR-10') 47 | 48 | print(" | combining dataset into a single file") 49 | local trainData = convertToTensor({ 50 | 'gen/cifar-10-batches-t7/data_batch_1.t7', 51 | 'gen/cifar-10-batches-t7/data_batch_2.t7', 52 | 'gen/cifar-10-batches-t7/data_batch_3.t7', 53 | 'gen/cifar-10-batches-t7/data_batch_4.t7', 54 | 'gen/cifar-10-batches-t7/data_batch_5.t7', 55 | }) 56 | local testData = convertToTensor({ 57 | 'gen/cifar-10-batches-t7/test_batch.t7', 58 | }) 59 | 60 | print(" | saving CIFAR-10 dataset to " .. cacheFile) 61 | torch.save(cacheFile, { 62 | train = trainData, 63 | val = testData, 64 | }) 65 | end 66 | 67 | return M 68 | -------------------------------------------------------------------------------- /torch/datasets/cifar10.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- CIFAR-10 dataset loader 10 | -- 11 | 12 | local t = require 'datasets/transforms' 13 | 14 | local M = {} 15 | local CifarDataset = torch.class('resnet.CifarDataset', M) 16 | 17 | function CifarDataset:__init(imageInfo, opt, split) 18 | assert(imageInfo[split], split) 19 | self.imageInfo = imageInfo[split] 20 | self.split = split 21 | end 22 | 23 | function CifarDataset:get(i) 24 | local image = self.imageInfo.data[i]:float() 25 | local label = self.imageInfo.labels[i] 26 | 27 | return { 28 | input = image, 29 | target = label, 30 | } 31 | end 32 | 33 | function CifarDataset:size() 34 | return self.imageInfo.data:size(1) 35 | end 36 | 37 | -- Computed from entire CIFAR-10 training set 38 | local meanstd = { 39 | mean = {125.3, 123.0, 113.9}, 40 | std = {63.0, 62.1, 66.7}, 41 | } 42 | 43 | function CifarDataset:preprocess() 44 | if self.split == 'train' then 45 | return t.Compose{ 46 | t.ColorNormalize(meanstd), 47 | t.HorizontalFlip(0.5), 48 | t.RandomCrop(32, 4), 49 | } 50 | elseif self.split == 'val' then 51 | return t.ColorNormalize(meanstd) 52 | else 53 | error('invalid split: ' .. 
self.split) 54 | end 55 | end 56 | 57 | return M.CifarDataset 58 | -------------------------------------------------------------------------------- /torch/datasets/imagenet-gen.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Script to compute list of ImageNet filenames and classes 10 | -- 11 | -- This generates a file gen/imagenet.t7 which contains the list of all 12 | -- ImageNet training and validation images and their classes. This script also 13 | -- works for other datasets arranged with the same layout. 14 | -- 15 | 16 | local sys = require 'sys' 17 | local ffi = require 'ffi' 18 | 19 | local M = {} 20 | 21 | local function findClasses(dir) 22 | local dirs = paths.dir(dir) 23 | table.sort(dirs) 24 | 25 | local classList = {} 26 | local classToIdx = {} 27 | for _ ,class in ipairs(dirs) do 28 | if not classToIdx[class] and class ~= '.' and class ~= '..' then 29 | table.insert(classList, class) 30 | classToIdx[class] = #classList 31 | end 32 | end 33 | 34 | -- assert(#classList == 1000, 'expected 1000 ImageNet classes') 35 | return classList, classToIdx 36 | end 37 | 38 | local function findImages(dir, classToIdx) 39 | local imagePath = torch.CharTensor() 40 | local imageClass = torch.LongTensor() 41 | 42 | ---------------------------------------------------------------------- 43 | -- Options for the GNU and BSD find command 44 | local extensionList = {'jpg', 'png', 'jpeg', 'JPG', 'PNG', 'JPEG', 'ppm', 'PPM', 'bmp', 'BMP'} 45 | local findOptions = ' -iname "*.' .. extensionList[1] .. '"' 46 | for i=2,#extensionList do 47 | findOptions = findOptions .. ' -o -iname "*.' .. extensionList[i] .. '"' 48 | end 49 | 50 | -- Find all the images using the find command 51 | local f = io.popen('find -L ' .. dir .. findOptions) 52 | 53 | local maxLength = -1 54 | local imagePaths = {} 55 | local imageClasses = {} 56 | 57 | -- Generate a list of all the images and their class 58 | while true do 59 | local line = f:read('*line') 60 | if not line then break end 61 | 62 | local className = paths.basename(paths.dirname(line)) 63 | local filename = paths.basename(line) 64 | local path = className .. '/' .. filename 65 | 66 | local classId = classToIdx[className] 67 | assert(classId, 'class not found: ' .. 
className) 68 | 69 | table.insert(imagePaths, path) 70 | table.insert(imageClasses, classId) 71 | 72 | maxLength = math.max(maxLength, #path + 1) 73 | end 74 | 75 | f:close() 76 | 77 | -- Convert the generated list to a tensor for faster loading 78 | local nImages = #imagePaths 79 | local imagePath = torch.CharTensor(nImages, maxLength):zero() 80 | for i, path in ipairs(imagePaths) do 81 | ffi.copy(imagePath[i]:data(), path) 82 | end 83 | 84 | local imageClass = torch.LongTensor(imageClasses) 85 | return imagePath, imageClass 86 | end 87 | 88 | function M.exec(opt, cacheFile) 89 | -- find the image path names 90 | local imagePath = torch.CharTensor() -- path to each image in dataset 91 | local imageClass = torch.LongTensor() -- class index of each image (class index in self.classes) 92 | 93 | local trainDir = paths.concat(opt.data, 'train') 94 | local valDir = paths.concat(opt.data, 'val') 95 | assert(paths.dirp(trainDir), 'train directory not found: ' .. trainDir) 96 | assert(paths.dirp(valDir), 'val directory not found: ' .. valDir) 97 | 98 | print("=> Generating list of images") 99 | local classList, classToIdx = findClasses(trainDir) 100 | 101 | print(" | finding all validation images") 102 | local valImagePath, valImageClass = findImages(valDir, classToIdx) 103 | 104 | print(" | finding all training images") 105 | local trainImagePath, trainImageClass = findImages(trainDir, classToIdx) 106 | 107 | local info = { 108 | basedir = opt.data, 109 | classList = classList, 110 | train = { 111 | imagePath = trainImagePath, 112 | imageClass = trainImageClass, 113 | }, 114 | val = { 115 | imagePath = valImagePath, 116 | imageClass = valImageClass, 117 | }, 118 | } 119 | 120 | print(" | saving list of images to " .. cacheFile) 121 | torch.save(cacheFile, info) 122 | return info 123 | end 124 | 125 | return M 126 | -------------------------------------------------------------------------------- /torch/datasets/imagenet.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- ImageNet dataset loader 10 | -- 11 | 12 | local image = require 'image' 13 | local paths = require 'paths' 14 | local t = require 'datasets/transforms' 15 | local ffi = require 'ffi' 16 | 17 | local M = {} 18 | local ImagenetDataset = torch.class('resnet.ImagenetDataset', M) 19 | 20 | function ImagenetDataset:__init(imageInfo, opt, split) 21 | self.imageInfo = imageInfo[split] 22 | self.opt = opt 23 | self.split = split 24 | self.dir = paths.concat(opt.data, split) 25 | assert(paths.dirp(self.dir), 'directory does not exist: ' .. self.dir) 26 | end 27 | 28 | function ImagenetDataset:get(i) 29 | local path = ffi.string(self.imageInfo.imagePath[i]:data()) 30 | 31 | local image = self:_loadImage(paths.concat(self.dir, path)) 32 | local class = self.imageInfo.imageClass[i] 33 | 34 | return { 35 | input = image, 36 | target = class, 37 | } 38 | end 39 | 40 | function ImagenetDataset:_loadImage(path) 41 | local ok, input = pcall(function() 42 | return image.load(path, 3, 'float') 43 | end) 44 | 45 | -- Sometimes image.load fails because the file extension does not match the 46 | -- image format. In that case, use image.decompress on a ByteTensor. 
47 | if not ok then 48 | local f = io.open(path, 'r') 49 | assert(f, 'Error reading: ' .. tostring(path)) 50 | local data = f:read('*a') 51 | f:close() 52 | 53 | local b = torch.ByteTensor(string.len(data)) 54 | ffi.copy(b:data(), data, b:size(1)) 55 | 56 | input = image.decompress(b, 3, 'float') 57 | end 58 | 59 | return input 60 | end 61 | 62 | function ImagenetDataset:size() 63 | return self.imageInfo.imageClass:size(1) 64 | end 65 | 66 | -- Computed from random subset of ImageNet training images 67 | local meanstd = { 68 | mean = { 0.485, 0.456, 0.406 }, 69 | std = { 0.229, 0.224, 0.225 }, 70 | } 71 | local pca = { 72 | eigval = torch.Tensor{ 0.2175, 0.0188, 0.0045 }, 73 | eigvec = torch.Tensor{ 74 | { -0.5675, 0.7192, 0.4009 }, 75 | { -0.5808, -0.0045, -0.8140 }, 76 | { -0.5836, -0.6948, 0.4203 }, 77 | }, 78 | } 79 | 80 | function ImagenetDataset:preprocess() 81 | if self.split == 'train' then 82 | return t.Compose{ 83 | t.RandomSizedCrop(224), 84 | t.ColorJitter({ 85 | brightness = 0.4, 86 | contrast = 0.4, 87 | saturation = 0.4, 88 | }), 89 | t.Lighting(0.1, pca.eigval, pca.eigvec), 90 | t.ColorNormalize(meanstd), 91 | t.HorizontalFlip(0.5), 92 | } 93 | elseif self.split == 'val' then 94 | local Crop = self.opt.tenCrop and t.TenCrop or t.CenterCrop 95 | return t.Compose{ 96 | t.Scale(256), 97 | t.ColorNormalize(meanstd), 98 | Crop(224), 99 | } 100 | else 101 | error('invalid split: ' .. self.split) 102 | end 103 | end 104 | 105 | return M.ImagenetDataset 106 | -------------------------------------------------------------------------------- /torch/datasets/init.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Dataset loader factory (Charades, ImageNet, and CIFAR-10) 10 | -- 11 | 12 | local M = {} 13 | 14 | local function isvalid(opt, cachePath) 15 | local imageInfo = torch.load(cachePath) 16 | if imageInfo.basedir and imageInfo.basedir ~= opt.data then 17 | return false 18 | end 19 | return true 20 | end 21 | 22 | function M.create(opt, split) 23 | local cachePath = paths.concat(opt.gen, opt.dataset .. '.t7') 24 | if not paths.filep(cachePath) or not isvalid(opt, cachePath) then 25 | paths.mkdir('gen') 26 | 27 | local script = paths.dofile(opt.dataset .. '-gen.lua') 28 | script.exec(opt, cachePath) 29 | end 30 | local imageInfo = torch.load(cachePath) 31 | 32 | local Dataset = require('datasets/' .. opt.dataset) 33 | opt.dataopt = imageInfo.opt 34 | return Dataset(imageInfo, opt, split) 35 | end 36 | 37 | return M 38 | -------------------------------------------------------------------------------- /torch/datasets/transforms.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
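--
-- Usage sketch for this module: the dataset loaders compose these transforms
-- in their :preprocess() methods. A minimal val-style pipeline (the mean/std
-- values and the file name here are illustrative only):
--
--   local t = require 'datasets/transforms'
--   local preprocess = t.Compose{
--       t.Scale(256),
--       t.ColorNormalize({mean = {0.485, 0.456, 0.406}, std = {0.229, 0.224, 0.225}}),
--       t.CenterCrop(224),
--   }
--   local img = preprocess(image.load('frame.jpg', 3, 'float'))  -- 3x224x224 float tensor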
8 | -- 9 | -- Image transforms for data augmentation and input normalization 10 | -- 11 | 12 | require 'image' 13 | 14 | local M = {} 15 | 16 | function M.Compose(transforms) 17 | return function(input) 18 | for _, transform in ipairs(transforms) do 19 | input = transform(input) 20 | end 21 | return input 22 | end 23 | end 24 | 25 | function M.ColorNormalize(meanstd) 26 | return function(img) 27 | img = img:clone() 28 | for i=1,img:size(1) do 29 | img[i]:add(-meanstd.mean[i]) 30 | img[i]:div(meanstd.std[i]) 31 | end 32 | return img 33 | end 34 | end 35 | 36 | -- Scales the smaller edge to size 37 | function M.Scale(size, interpolation) 38 | interpolation = interpolation or 'bicubic' 39 | return function(input) 40 | local w, h = input:size(3), input:size(2) 41 | if (w <= h and w == size) or (h <= w and h == size) then 42 | return input 43 | end 44 | if w < h then 45 | return image.scale(input, size, h/w * size, interpolation) 46 | else 47 | return image.scale(input, w/h * size, size, interpolation) 48 | end 49 | end 50 | end 51 | 52 | -- Crop to centered rectangle 53 | function M.CenterCrop(size) 54 | return function(input) 55 | local w1 = math.ceil((input:size(3) - size)/2) 56 | local h1 = math.ceil((input:size(2) - size)/2) 57 | return image.crop(input, w1, h1, w1 + size, h1 + size) -- center patch 58 | end 59 | end 60 | 61 | -- Random crop from larger image with optional zero padding 62 | function M.RandomCrop(size, padding) 63 | padding = padding or 0 64 | 65 | return function(input) 66 | if padding > 0 then 67 | local temp = input.new(3, input:size(2) + 2*padding, input:size(3) + 2*padding) 68 | temp:zero() 69 | :narrow(2, padding+1, input:size(2)) 70 | :narrow(3, padding+1, input:size(3)) 71 | :copy(input) 72 | input = temp 73 | end 74 | 75 | local w, h = input:size(3), input:size(2) 76 | if w == size and h == size then 77 | return input 78 | end 79 | 80 | local x1, y1 = torch.random(0, w - size), torch.random(0, h - size) 81 | local out = image.crop(input, x1, y1, x1 + size, y1 + size) 82 | assert(out:size(2) == size and out:size(3) == size, 'wrong crop size') 83 | return out 84 | end 85 | end 86 | 87 | -- Four corner patches and center crop from image and its horizontal reflection 88 | function M.TenCrop(size) 89 | local centerCrop = M.CenterCrop(size) 90 | 91 | return function(input) 92 | local w, h = input:size(3), input:size(2) 93 | 94 | local output = {} 95 | for _, img in ipairs{input, image.hflip(input)} do 96 | table.insert(output, centerCrop(img)) 97 | table.insert(output, image.crop(img, 0, 0, size, size)) 98 | table.insert(output, image.crop(img, w-size, 0, w, size)) 99 | table.insert(output, image.crop(img, 0, h-size, size, h)) 100 | table.insert(output, image.crop(img, w-size, h-size, w, h)) 101 | end 102 | 103 | -- View as mini-batch 104 | for i, img in ipairs(output) do 105 | output[i] = img:view(1, img:size(1), img:size(2), img:size(3)) 106 | end 107 | 108 | return input.cat(output, 1) 109 | end 110 | end 111 | 112 | -- Resized with shorter side randomly sampled from [minSize, maxSize] (ResNet-style) 113 | function M.RandomScale(minSize, maxSize) 114 | return function(input) 115 | local w, h = input:size(3), input:size(2) 116 | 117 | local targetSz = torch.random(minSize, maxSize) 118 | local targetW, targetH = targetSz, targetSz 119 | if w < h then 120 | targetH = torch.round(h / w * targetW) 121 | else 122 | targetW = torch.round(w / h * targetH) 123 | end 124 | 125 | return image.scale(input, targetW, targetH, 'bicubic') 126 | end 127 | end 128 | 129 | -- Random 
crop with size 8%-100% and aspect ratio 3/4 - 4/3 (Inception-style) 130 | function M.RandomSizedCrop(size) 131 | local scale = M.Scale(size) 132 | local crop = M.CenterCrop(size) 133 | 134 | return function(input) 135 | local attempt = 0 136 | repeat 137 | local area = input:size(2) * input:size(3) 138 | local targetArea = torch.uniform(0.08, 1.0) * area 139 | 140 | local aspectRatio = torch.uniform(3/4, 4/3) 141 | local w = torch.round(math.sqrt(targetArea * aspectRatio)) 142 | local h = torch.round(math.sqrt(targetArea / aspectRatio)) 143 | 144 | if torch.uniform() < 0.5 then 145 | w, h = h, w 146 | end 147 | 148 | if h <= input:size(2) and w <= input:size(3) then 149 | local y1 = torch.random(0, input:size(2) - h) 150 | local x1 = torch.random(0, input:size(3) - w) 151 | 152 | local out = image.crop(input, x1, y1, x1 + w, y1 + h) 153 | assert(out:size(2) == h and out:size(3) == w, 'wrong crop size') 154 | 155 | return image.scale(out, size, size, 'bicubic') 156 | end 157 | attempt = attempt + 1 158 | until attempt >= 10 159 | 160 | -- fallback 161 | return crop(scale(input)) 162 | end 163 | end 164 | 165 | function M.HorizontalFlip(prob) 166 | return function(input) 167 | if torch.uniform() < prob then 168 | input = image.hflip(input) 169 | end 170 | return input 171 | end 172 | end 173 | 174 | function M.Rotation(deg) 175 | return function(input) 176 | if deg ~= 0 then 177 | input = image.rotate(input, (torch.uniform() - 0.5) * deg * math.pi / 180, 'bilinear') 178 | end 179 | return input 180 | end 181 | end 182 | 183 | -- Lighting noise (AlexNet-style PCA-based noise) 184 | function M.Lighting(alphastd, eigval, eigvec) 185 | return function(input) 186 | if alphastd == 0 then 187 | return input 188 | end 189 | 190 | local alpha = torch.Tensor(3):normal(0, alphastd) 191 | local rgb = eigvec:clone() 192 | :cmul(alpha:view(1, 3):expand(3, 3)) 193 | :cmul(eigval:view(1, 3):expand(3, 3)) 194 | :sum(2) 195 | :squeeze() 196 | 197 | input = input:clone() 198 | for i=1,3 do 199 | input[i]:add(rgb[i]) 200 | end 201 | return input 202 | end 203 | end 204 | 205 | local function blend(img1, img2, alpha) 206 | return img1:mul(alpha):add(1 - alpha, img2) 207 | end 208 | 209 | local function grayscale(dst, img) 210 | dst:resizeAs(img) 211 | dst[1]:zero() 212 | dst[1]:add(0.299, img[1]):add(0.587, img[2]):add(0.114, img[3]) 213 | dst[2]:copy(dst[1]) 214 | dst[3]:copy(dst[1]) 215 | return dst 216 | end 217 | 218 | function M.Saturation(var) 219 | local gs 220 | 221 | return function(input) 222 | gs = gs or input.new() 223 | grayscale(gs, input) 224 | 225 | local alpha = 1.0 + torch.uniform(-var, var) 226 | blend(input, gs, alpha) 227 | return input 228 | end 229 | end 230 | 231 | function M.Brightness(var) 232 | local gs 233 | 234 | return function(input) 235 | gs = gs or input.new() 236 | gs:resizeAs(input):zero() 237 | 238 | local alpha = 1.0 + torch.uniform(-var, var) 239 | blend(input, gs, alpha) 240 | return input 241 | end 242 | end 243 | 244 | function M.Contrast(var) 245 | local gs 246 | 247 | return function(input) 248 | gs = gs or input.new() 249 | grayscale(gs, input) 250 | gs:fill(gs[1]:mean()) 251 | 252 | local alpha = 1.0 + torch.uniform(-var, var) 253 | blend(input, gs, alpha) 254 | return input 255 | end 256 | end 257 | 258 | function M.RandomOrder(ts) 259 | return function(input) 260 | local img = input.img or input 261 | local order = torch.randperm(#ts) 262 | for i=1,#ts do 263 | img = ts[order[i]](img) 264 | end 265 | return input 266 | end 267 | end 268 | 269 | function 
M.ColorJitter(opt) 270 | local brightness = opt.brightness or 0 271 | local contrast = opt.contrast or 0 272 | local saturation = opt.saturation or 0 273 | 274 | local ts = {} 275 | if brightness ~= 0 then 276 | table.insert(ts, M.Brightness(brightness)) 277 | end 278 | if contrast ~= 0 then 279 | table.insert(ts, M.Contrast(contrast)) 280 | end 281 | if saturation ~= 0 then 282 | table.insert(ts, M.Saturation(saturation)) 283 | end 284 | 285 | if #ts == 0 then 286 | return function(input) return input end 287 | end 288 | 289 | return M.RandomOrder(ts) 290 | end 291 | 292 | return M 293 | -------------------------------------------------------------------------------- /torch/exp/flownet.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using flow 2 | -- 3 | -- Purpose: train the flow stream of the two-stream network on Charades 4 | -- 5 | -- start Torch 6 | -- Usage: dofile 'exp/flownet.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16flow', 16 | '-dataset','charadesflow', 17 | '-LR','0.005', 18 | '-LR_decay_freq','15', 19 | '-epochSize','0.2', 20 | '-nThreads','4', 21 | '-testSize','0.1', 22 | '-nEpochs','40', 23 | '-conv1LR','1', 24 | '-conv2LR','1', 25 | '-conv3LR','1', 26 | '-conv4LR','1', 27 | '-conv5LR','1', 28 | '-batchSize','64', 29 | '-accumGrad','4', 30 | '-trainfile','../Charades_v1_train.csv', 31 | '-testfile','../Charades_v1_test.csv', 32 | '-optnet','true', 33 | '-cacheDir','/mnt/raid00/gunnars/cache/', 34 | '-data','/mnt/raid00/gunnars/Charades_v1_flow/', 35 | } 36 | for _,v in pairs(morearg) do 37 | table.insert(arg,v) 38 | end 39 | dofile 'main.lua' 40 | -------------------------------------------------------------------------------- /torch/exp/flownet_localize.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using flow 2 | -- 3 | -- Purpose: dump frame-level localization predictions from the flow model 4 | -- 5 | -- start Torch 6 | -- Usage: dofile 'exp/flownet_localize.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16flow', 16 | '-dataset','charadesflow', 17 | '-LR','0.005', 18 | '-LR_decay_freq','15', 19 | '-epochSize','0.2', 20 | '-nThreads','4', 21 | '-testSize','0.1', 22 | '-nEpochs','40', 23 | '-conv1LR','1', 24 | '-conv2LR','1', 25 | '-conv3LR','1', 26 | '-conv4LR','1', 27 | '-conv5LR','1', 28 | '-batchSize','64', 29 | '-accumGrad','4', 30 | '-dumpLocalize','true', 31 | '-cacheDir','/mnt/raid00/gunnars/cache/', 32 | '-data','/mnt/raid00/gunnars/Charades_v1_flow/', 33 | '-trainfile','../Charades_v1_train.csv', 34 | '-testfile','../Charades_v1_test.csv', 35 | '-optnet','true', 36 | } 37 | for _,v in pairs(morearg) do 38 | table.insert(arg,v) 39 | end 40 | dofile 'main.lua' 41 | -------------------------------------------------------------------------------- /torch/exp/flownet_resume.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using flow 2 | -- 3 | -- Purpose: resume flow-stream training from a saved checkpoint
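--
-- The resume variants add only a few flags on top of the base experiment; a
-- sketch of the additions (the '-retrain'/'-optimState' values below are
-- example paths from the author's setup, note they reuse the rgbnet cache,
-- and should be pointed at your own checkpoints; <cacheDir> and N are
-- placeholders):
--
--   '-retrain',     '<cacheDir>/flownet/checkpoints/model_N.t7',      -- weights to load
--   '-epochNumber', 'N',                                              -- epoch to resume from
--   '-optimState',  '<cacheDir>/flownet/checkpoints/optimstate_N.t7', -- optimizer state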
4 | -- 5 | -- start Torch 6 | -- Usage: dofile 'exp/flownet_resume.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16flow', 16 | '-dataset','charadesflow', 17 | '-LR','0.005', 18 | '-LR_decay_freq','15', 19 | '-epochSize','0.2', 20 | '-nThreads','4', 21 | '-testSize','0.1', 22 | '-nEpochs','40', 23 | '-conv1LR','1', 24 | '-conv2LR','1', 25 | '-conv3LR','1', 26 | '-conv4LR','1', 27 | '-conv5LR','1', 28 | '-batchSize','64', 29 | '-accumGrad','4', 30 | '-retrain','/mnt/raid00/gunnars/cache/rgbnet/checkpoints/model_9.t7', -- path to the trained model to use 31 | '-epochNumber','9', -- what epoch to resume from 32 | '-optimState','/mnt/raid00/gunnars/cache/rgbnet/checkpoints/optimstate_9.t7', -- path to the optimizer state 33 | '-gen','/mnt/raid00/gunnars/cache/rgbnet/gen/', -- what cached data to use 34 | '-cacheDir','/mnt/raid00/gunnars/cache/', 35 | '-data','/mnt/raid00/gunnars/Charades_v1_flow/', 36 | '-trainfile','../Charades_v1_train.csv', 37 | '-testfile','../Charades_v1_test.csv', 38 | '-optnet','true', 39 | } 40 | for _,v in pairs(morearg) do 41 | table.insert(arg,v) 42 | end 43 | dofile 'main.lua' 44 | -------------------------------------------------------------------------------- /torch/exp/flownet_test.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using flow 2 | -- 3 | -- Purpose: evaluate a trained flow model on the test set 4 | -- 5 | -- start Torch 6 | -- Usage: dofile 'exp/flownet_test.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16flow', 16 | '-dataset','charadesflow', 17 | '-LR','0.005', 18 | '-LR_decay_freq','15', 19 | '-epochSize','0.2', 20 | '-nThreads','4', 21 | '-testSize','0.1', 22 | '-nEpochs','40', 23 | '-conv1LR','1', 24 | '-conv2LR','1', 25 | '-conv3LR','1', 26 | '-conv4LR','1', 27 | '-conv5LR','1', 28 | '-batchSize','64', 29 | '-accumGrad','4', 30 | '-testOnly','true', 31 | '-retrain','/mnt/raid00/gunnars/cache/rgbnet/checkpoints/model_9.t7', -- path to the trained model to use 32 | '-gen','/mnt/raid00/gunnars/cache/rgbnet/gen/', -- what cached data to use 33 | '-cacheDir','/mnt/raid00/gunnars/cache/', 34 | '-data','/mnt/raid00/gunnars/Charades_v1_flow/', 35 | '-trainfile','../Charades_v1_train.csv', 36 | '-testfile','../Charades_v1_test.csv', 37 | '-optnet','true', 38 | } 39 | for _,v in pairs(morearg) do 40 | table.insert(arg,v) 41 | end 42 | dofile 'main.lua' 43 | -------------------------------------------------------------------------------- /torch/exp/lstmflownet.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using flow 2 | -- 3 | -- Purpose: train the LSTM flow stream (Two-Stream+LSTM)
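--
-- Relative to exp/flownet.lua, this LSTM variant essentially swaps in the
-- recurrent model and the temporally synchronized dataset:
--
--   '-netType', 'vgg16lstmflow',    -- VGG-16+LSTM flow model (models/vgg16lstmflow.lua)
--   '-dataset', 'charadessyncflow', -- synchronized flow loader (datasets/charadessyncflow.lua)
--
-- along with a smaller learning rate and a larger epochSize, as set below.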
4 | -- 5 | -- start Torch 6 | -- Usage: dofile 'exp/lstmflownet.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16lstmflow', 16 | '-dataset','charadessyncflow', 17 | '-LR','5e-4', 18 | '-LR_decay_freq','15', 19 | '-epochSize','0.6', 20 | '-nThreads','4', 21 | '-testSize','0.1', 22 | '-nEpochs','35', 23 | '-conv1LR','1', 24 | '-conv2LR','1', 25 | '-conv3LR','1', 26 | '-conv4LR','1', 27 | '-conv5LR','1', 28 | '-batchSize','64', 29 | '-accumGrad','4', 30 | '-trainfile','../Charades_v1_train.csv', 31 | '-testfile','../Charades_v1_test.csv', 32 | '-optnet','true', 33 | '-cacheDir','/mnt/raid00/gunnars/cache/', 34 | '-data','/mnt/raid00/gunnars/Charades_v1_flow/', 35 | } 36 | for _,v in pairs(morearg) do 37 | table.insert(arg,v) 38 | end 39 | dofile 'main.lua' 40 | -------------------------------------------------------------------------------- /torch/exp/lstmrgbnet.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using rgb 2 | -- 3 | -- Purpose: train the LSTM RGB stream (Two-Stream+LSTM) 4 | -- 5 | -- start torch 6 | -- Usage: dofile 'exp/lstmrgbnet.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16lstm', 16 | '-dataset','charadessync', 17 | '-LR_decay_freq','10', 18 | '-LR','0.0015', 19 | '-epochSize','0.3', 20 | '-testSize','0.1', 21 | '-nEpochs','30', 22 | '-conv1LR','1', 23 | '-conv2LR','1', 24 | '-conv3LR','1', 25 | '-conv4LR','1', 26 | '-conv5LR','1', 27 | '-batchSize','64', 28 | '-accumGrad','4', 29 | '-cacheDir','/mnt/raid00/gunnars/cache/', 30 | '-data','/mnt/raid00/gunnars/Charades_v1_rgb/', 31 | '-trainfile','../Charades_v1_train.csv', 32 | '-testfile','../Charades_v1_test.csv', 33 | '-optnet','true', 34 | } 35 | for _,v in pairs(morearg) do 36 | table.insert(arg,v) 37 | end 38 | dofile 'main.lua' 39 | -------------------------------------------------------------------------------- /torch/exp/rgbnet.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using rgb 2 | -- 3 | -- Purpose: train the RGB stream of the two-stream network on Charades
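--
-- Like every script in exp/, this wrapper appends option strings to the
-- global `arg` table and then runs main.lua, so it is equivalent to passing
-- the same flags on the command line; the mechanism in miniature:
--
--   arg = arg or {}
--   for _, v in pairs({'-netType','vgg16', '-dataset','charades'}) do
--       table.insert(arg, v)
--   end
--   dofile 'main.lua'  -- opts.parse(arg) then sees the injected flags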
4 | -- 5 | -- start torch 6 | -- Usage: dofile 'exp/rgbnet.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16', 16 | '-dataset','charades', 17 | '-LR_decay_freq','30', 18 | '-LR','0.001', 19 | '-epochSize','0.1', 20 | '-testSize','0.1', 21 | '-nEpochs','10', 22 | '-conv1LR','1', 23 | '-conv2LR','1', 24 | '-conv3LR','1', 25 | '-conv4LR','1', 26 | '-conv5LR','1', 27 | '-batchSize','64', 28 | '-accumGrad','4', 29 | '-cacheDir','/mnt/raid00/gunnars/cache/', 30 | '-data','/mnt/raid00/gunnars/Charades_v1_rgb/', 31 | '-trainfile','../Charades_v1_train.csv', 32 | '-testfile','../Charades_v1_test.csv', 33 | '-optnet','true', 34 | } 35 | for _,v in pairs(morearg) do 36 | table.insert(arg,v) 37 | end 38 | dofile 'main.lua' 39 | -------------------------------------------------------------------------------- /torch/exp/rgbnet_localize.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using rgb 2 | -- 3 | -- Purpose: dump frame-level localization predictions from the RGB model 4 | -- 5 | -- start torch 6 | -- Usage: dofile 'exp/rgbnet_localize.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16', 16 | '-dataset','charades', 17 | '-LR_decay_freq','30', 18 | '-LR','0.001', 19 | '-epochSize','0.1', 20 | '-testSize','0.1', 21 | '-nEpochs','10', 22 | '-conv1LR','1', 23 | '-conv2LR','1', 24 | '-conv3LR','1', 25 | '-conv4LR','1', 26 | '-conv5LR','1', 27 | '-batchSize','64', 28 | '-accumGrad','4', 29 | '-dumpLocalize','true', 30 | '-cacheDir','/mnt/raid00/gunnars/cache/', 31 | '-data','/mnt/raid00/gunnars/Charades_v1_rgb/', 32 | '-trainfile','../Charades_v1_train.csv', 33 | '-testfile','../Charades_v1_test.csv', 34 | '-optnet','true', 35 | } 36 | for _,v in pairs(morearg) do 37 | table.insert(arg,v) 38 | end 39 | dofile 'main.lua' 40 | -------------------------------------------------------------------------------- /torch/exp/rgbnet_resume.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using rgb 2 | -- 3 | -- Purpose: resume RGB-stream training from a saved checkpoint
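--
-- As with exp/flownet_resume.lua, only the checkpoint flags differ from the
-- base experiment. The example '-retrain' and '-optimState' paths below point
-- at the author's flownet and flowrgbnet caches; to resume an actual rgbnet
-- run they would presumably look like (<cacheDir> is a placeholder):
--
--   '-retrain',    '<cacheDir>/rgbnet/checkpoints/model_9.t7',
--   '-optimState', '<cacheDir>/rgbnet/checkpoints/optimstate_9.t7',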
4 | -- 5 | -- start torch 6 | -- Usage: dofile 'exp/rgbnet_resume.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16', 16 | '-dataset','charades', 17 | '-LR_decay_freq','30', 18 | '-LR','0.001', 19 | '-epochSize','0.1', 20 | '-testSize','0.1', 21 | '-nEpochs','10', 22 | '-conv1LR','1', 23 | '-conv2LR','1', 24 | '-conv3LR','1', 25 | '-conv4LR','1', 26 | '-conv5LR','1', 27 | '-batchSize','64', 28 | '-accumGrad','4', 29 | '-retrain','/mnt/raid00/gunnars/cache/flownet/checkpoints/model_9.t7', -- path to the trained model to use 30 | '-epochNumber','9', -- what epoch to resume from 31 | '-optimState','/mnt/raid00/gunnars/cache/flowrgbnet/checkpoints/optimstate_9.t7', -- path to the optimizer state 32 | '-cacheDir','/mnt/raid00/gunnars/cache/', 33 | '-data','/mnt/raid00/gunnars/Charades_v1_rgb/', 34 | '-trainfile','../Charades_v1_train.csv', 35 | '-testfile','../Charades_v1_test.csv', 36 | '-optnet','true', 37 | } 38 | for _,v in pairs(morearg) do 39 | table.insert(arg,v) 40 | end 41 | dofile 'main.lua' 42 | -------------------------------------------------------------------------------- /torch/exp/rgbnet_test.lua: -------------------------------------------------------------------------------- 1 | -- Action recognition experiment using rgb 2 | -- 3 | -- Purpose: evaluate a trained RGB model on the test set 4 | -- 5 | -- start torch 6 | -- Usage: dofile 'exp/rgbnet_test.lua' 7 | 8 | local info = debug.getinfo(1,'S'); 9 | name = info.source 10 | name = string.sub(name,1,#name-4) --remove ext 11 | local name = name:match( "([^/]+)$" ) --remove folders 12 | arg = arg or {} 13 | morearg = { 14 | '-name',name, 15 | '-netType','vgg16', 16 | '-dataset','charades', 17 | '-LR_decay_freq','30', 18 | '-LR','0.001', 19 | '-epochSize','0.1', 20 | '-testSize','0.1', 21 | '-nEpochs','10', 22 | '-conv1LR','1', 23 | '-conv2LR','1', 24 | '-conv3LR','1', 25 | '-conv4LR','1', 26 | '-conv5LR','1', 27 | '-batchSize','64', 28 | '-accumGrad','4', 29 | '-testOnly','true', 30 | '-retrain','/mnt/raid00/gunnars/cache/flownet/checkpoints/model_9.t7', -- path to the trained model to use 31 | '-cacheDir','/mnt/raid00/gunnars/cache/', 32 | '-data','/mnt/raid00/gunnars/Charades_v1_rgb/', 33 | '-trainfile','../Charades_v1_train.csv', 34 | '-testfile','../Charades_v1_test.csv', 35 | '-optnet','true', 36 | } 37 | for _,v in pairs(morearg) do 38 | table.insert(arg,v) 39 | end 40 | dofile 'main.lua' 41 | -------------------------------------------------------------------------------- /torch/get_alreadytrained.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to download pretrained models on Charades 3 | # Approximately equivalent to models obtained by running exp/rgbnet.lua and exp/flownet.lua 4 | # 5 | # The flow model was obtained after 31 epochs (epochSize=0.2) 6 | # The flow model has a classification accuracy of 15.4% mAP (via charades_v1_classify.m) 7 | # The rgb model was obtained after 6 epochs (epochSize=0.1) 8 | # The rgb model has a classification accuracy of 15.6% mAP (via charades_v1_classify.m) 9 | # 10 | # Combining the predictions (submission files) of those models using combine_rgb_flow.py 11 | # yields a final classification accuracy of 18.9% mAP (via charades_v1_classify.m) 12 | 13 | wget -O twostream_flow.t7 https://www.dropbox.com/s/o7afkhw52rqr48g/twostream_flow.t7?dl=1 14 | wget -O 
twostream_rgb.t7 https://www.dropbox.com/s/bo9rv32zaxojsmz/twostream_rgb.t7?dl=1 15 | -------------------------------------------------------------------------------- /torch/get_alreadytrained_lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to download pretrained lstm models on Charades 3 | # Approximately equivalent to models obtained by running exp/lstmrgbnet.lua and exp/lstmflownet.lua 4 | # 5 | # The flow model was obtained after 30 epochs (epochSize=0.6) 6 | # The flow model has a classification accuracy of 15.4% mAP (via charades_v1_classify.m) 7 | # The rgb model was obtained after 25 epochs (epochSize=0.3) 8 | # The rgb model has a classification accuracy of 16.6% mAP (via charades_v1_classify.m) 9 | # 10 | # Combining the predictions (submission files) of those models using combine_rgb_flow.py 11 | # yields a final classification accuracy of 19.8% mAP (via charades_v1_classify.m) 12 | 13 | wget -O lstm_flow.t7 https://www.dropbox.com/s/gj808t2izq2el4e/lstm_flow.t7?dl=1 14 | wget -O lstm_rgb.t7 https://www.dropbox.com/s/t7n0ivjj15v75kt/lstm_rgb.t7?dl=1 15 | -------------------------------------------------------------------------------- /torch/get_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to download models pretrained on ImageNet and UCF101 3 | # Those are used as the starting point for training on Charades 4 | 5 | wget -O VGG_ILSVRC_16_layers_deploy.prototxt https://www.dropbox.com/s/iycrzeruaf75soc/VGG_ILSVRC_16_layers_deploy.prototxt?dl=1 6 | wget -O VGG_UCF101_16_layers_deploy.prototxt https://www.dropbox.com/s/4ktxsdiiqm429j2/VGG_UCF101_16_layers_deploy.prototxt?dl=1 7 | wget -O VGG_ILSVRC_16_layers.caffemodel https://www.dropbox.com/s/rwo3iim5z2w07aa/VGG_ILSVRC_16_layers.caffemodel?dl=1 8 | wget -O VGG_UCF101_16_layers.caffemodel https://www.dropbox.com/s/d1n9emy0awzlwlr/VGG_UCF101_16_layers.caffemodel?dl=1 9 | -------------------------------------------------------------------------------- /torch/layers/CrossEntropyCriterion.lua: -------------------------------------------------------------------------------- 1 | local CrossEntropyCriterion, Criterion = torch.class('nn.CrossEntropyCriterion', 'nn.Criterion') 2 | 3 | function CrossEntropyCriterion:__init(weights) 4 | Criterion.__init(self) 5 | self.lsm = nn.LogSoftMax() 6 | self.nll = nn.ClassNLLCriterion(weights) 7 | end 8 | 9 | function CrossEntropyCriterion:updateOutput(input, target) 10 | input = input:squeeze() 11 | target = type(target) == 'number' and target or target:squeeze() 12 | self.lsm:updateOutput(input) 13 | self.nll:updateOutput(self.lsm.output, target) 14 | self.output = self.nll.output 15 | return self.output 16 | end 17 | 18 | function CrossEntropyCriterion:updateGradInput(input, target) 19 | local size = input:size() 20 | input = input:squeeze() 21 | target = type(target) == 'number' and target or target:squeeze() 22 | self.nll:updateGradInput(self.lsm.output, target) 23 | self.lsm:updateGradInput(input, self.nll.gradInput) 24 | self.gradInput:view(self.lsm.gradInput, size) 25 | return self.gradInput 26 | end 27 | 28 | return nn.CrossEntropyCriterion 29 | -------------------------------------------------------------------------------- /torch/main.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 
4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | require 'torch' 10 | require 'paths' 11 | require 'optim' 12 | require 'nn' 13 | local DataLoader = require 'dataloader' 14 | local models = require 'models/init' 15 | local Trainer 16 | local opts = require 'opts' 17 | local checkpoints = require 'checkpoints' 18 | 19 | torch.setdefaulttensortype('torch.FloatTensor') 20 | torch.setnumthreads(1) 21 | 22 | local opt = opts.parse(arg) 23 | torch.manualSeed(opt.manualSeed) 24 | cutorch.manualSeedAll(opt.manualSeed) 25 | 26 | Trainer = require 'train' 27 | 28 | -- Load previous checkpoint, if it exists 29 | local checkpoint, optimState = checkpoints.latest(opt) 30 | 31 | -- Data loading 32 | print('Creating Data Loader') 33 | local trainLoader, valLoader, val2Loader = DataLoader.create(opt) 34 | 35 | -- Create model 36 | local model, criterion = models.setup(opt, checkpoint) 37 | 38 | -- The trainer handles the training loop and evaluation on validation set 39 | print('Creating Trainer') 40 | local trainer = Trainer(model, criterion, opt, optimState) 41 | 42 | if opt.testOnly then 43 | --local top1Err, top5Err = trainer:test(opt, 0, valLoader) 44 | --print(string.format(' * Results top1: %6.3f top5: %6.3f', top1Err, top5Err)) 45 | 46 | local AP = trainer:test2(opt, 0, val2Loader) 47 | local mAP = AP:mean() 48 | print(string.format(' * Results mAP: %6.3f', mAP)) 49 | 50 | return 51 | end 52 | 53 | local startEpoch = checkpoint and checkpoint.epoch + 1 or opt.epochNumber 54 | local bestTop1 = math.huge 55 | local bestTop5 = math.huge 56 | local bestmAP = 0 57 | for epoch = startEpoch, opt.nEpochs do 58 | -- Train for a single epoch 59 | local trainTop1, trainTop5, trainLoss = trainer:train(opt, epoch, trainLoader) 60 | 61 | -- Run model on validation set evaluating on the whole video 62 | local AP = trainer:test2(opt, epoch, val2Loader) 63 | local mAP = AP:mean() 64 | 65 | -- Run model on validation set 66 | local testTop1, testTop5 = trainer:test(opt, epoch, valLoader) 67 | 68 | local bestModel = false 69 | if testTop1 < bestTop1 then 70 | bestModel = true 71 | bestTop1 = testTop1 72 | bestTop5 = testTop5 73 | bestmAP = mAP 74 | print(' * Best model ', testTop1, testTop5, mAP) 75 | end 76 | 77 | local score = {trainTop1, trainTop5, testTop1, testTop5, mAP} 78 | checkpoints.save(epoch, model, trainer.optimState, bestModel, opt, score) 79 | end 80 | 81 | print(string.format(' * Finished top1: %6.3f top5: %6.3f mAP: %6.3f', bestTop1, bestTop5, bestmAP)) 82 | -------------------------------------------------------------------------------- /torch/models/init.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Generic model creating code. 
For the specific ResNet model see 10 | -- models/resnet.lua 11 | -- 12 | 13 | require 'nn' 14 | require 'cunn' 15 | require 'cudnn' 16 | 17 | local M = {} 18 | 19 | function M.setup(opt, checkpoint) 20 | local model 21 | local criterion 22 | if checkpoint then 23 | local modelPath = paths.concat(opt.resume, checkpoint.modelFile) 24 | assert(paths.filep(modelPath), 'Saved model not found: ' .. modelPath) 25 | print('=> Resuming model from ' .. modelPath) 26 | model,criterion = torch.load(modelPath):cuda() 27 | elseif opt.retrain ~= 'none' then 28 | assert(paths.filep(opt.retrain), 'File not found: ' .. opt.retrain) 29 | print('Loading model from file: ' .. opt.retrain) 30 | model,criterion = torch.load(opt.retrain):cuda() 31 | if not criterion then 32 | local _ 33 | _,criterion = require('models/' .. opt.netType)(opt) 34 | end 35 | else 36 | print('=> Creating model from file: models/' .. opt.netType .. '.lua') 37 | model,criterion = require('models/' .. opt.netType)(opt) 38 | end 39 | 40 | -- First remove any DataParallelTable 41 | if torch.type(model) == 'nn.DataParallelTable' then 42 | model = model:get(1) 43 | end 44 | 45 | -- optnet is a general library for reducing memory usage in neural networks 46 | if opt.optnet then 47 | local optnet = require 'optnet' 48 | local imsize = opt.dataset == 'cifar10' and 32 or 224 49 | local sampleInput = (opt.dataset == 'charadesflow') and torch.zeros(4,20,imsize,imsize):cuda() or torch.zeros(4,3,imsize,imsize):cuda() 50 | optnet.optimizeMemory(model, sampleInput, {inplace = false, mode = 'training'}) 51 | end 52 | 53 | -- This is useful for fitting ResNet-50 on 4 GPUs, but requires that all 54 | -- containers override backwards to call backwards recursively on submodules 55 | if opt.shareGradInput then 56 | M.shareGradInput(model) 57 | end 58 | 59 | -- For resetting the classifier when fine-tuning on a different dataset 60 | if opt.resetClassifier and not checkpoint then 61 | print(' => Replacing classifier with ' .. opt.nClasses .. '-way classifier') 62 | 63 | local orig = model:get(#model.modules) 64 | assert(torch.type(orig) == 'nn.Linear', 65 | 'expected last layer to be fully connected') 66 | 67 | local linear = nn.Linear(orig.weight:size(2), opt.nClasses) 68 | linear.bias:zero() 69 | 70 | model:remove(#model.modules) 71 | model:add(linear:cuda()) 72 | end 73 | 74 | -- Set the CUDNN flags 75 | if opt.cudnn == 'fastest' then 76 | cudnn.fastest = true 77 | cudnn.benchmark = true 78 | elseif opt.cudnn == 'deterministic' then 79 | -- Use a deterministic convolution implementation 80 | model:apply(function(m) 81 | if m.setMode then m:setMode(1, 1, 1) end 82 | end) 83 | end 84 | 85 | -- Wrap the model with DataParallelTable, if using more than one GPU 86 | if opt.nGPU > 1 then 87 | local gpus = torch.range(1, opt.nGPU):totable() 88 | local fastest, benchmark = cudnn.fastest, cudnn.benchmark 89 | 90 | local dpt = nn.DataParallelTable(1, true, true) 91 | :add(model, gpus) 92 | :threads(function() 93 | local cudnn = require 'cudnn' 94 | cudnn.fastest, cudnn.benchmark = fastest, benchmark 95 | end) 96 | dpt.gradInput = nil 97 | 98 | model = dpt:cuda() 99 | end 100 | 101 | if not criterion then 102 | criterion = nn.CrossEntropyCriterion():cuda() 103 | end 104 | return model, criterion 105 | end 106 | 107 | function M.shareGradInput(model) 108 | local function sharingKey(m) 109 | local key = torch.type(m) 110 | if m.__shareGradInputKey then 111 | key = key .. ':' .. 
m.__shareGradInputKey 112 | end 113 | return key 114 | end 115 | 116 | -- Share gradInput for memory efficient backprop 117 | local cache = {} 118 | model:apply(function(m) 119 | local moduleType = torch.type(m) 120 | if torch.isTensor(m.gradInput) and moduleType ~= 'nn.ConcatTable' then 121 | local key = sharingKey(m) 122 | if cache[key] == nil then 123 | cache[key] = torch.CudaStorage(1) 124 | end 125 | m.gradInput = torch.CudaTensor(cache[key], 1, 0) 126 | end 127 | end) 128 | for i, m in ipairs(model:findModules('nn.ConcatTable')) do 129 | if cache[i % 2] == nil then 130 | cache[i % 2] = torch.CudaStorage(1) 131 | end 132 | m.gradInput = torch.CudaTensor(cache[i % 2], 1, 0) 133 | end 134 | end 135 | 136 | return M 137 | -------------------------------------------------------------------------------- /torch/models/preresnet.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- The full pre-activation ResNet variation from the technical report 10 | -- "Identity Mappings in Deep Residual Networks" (http://arxiv.org/abs/1603.05027) 11 | -- 12 | 13 | local nn = require 'nn' 14 | require 'cunn' 15 | 16 | local Convolution = cudnn.SpatialConvolution 17 | local Avg = cudnn.SpatialAveragePooling 18 | local ReLU = cudnn.ReLU 19 | local Max = nn.SpatialMaxPooling 20 | local SBatchNorm = nn.SpatialBatchNormalization 21 | 22 | local function createModel(opt) 23 | local depth = opt.depth 24 | local shortcutType = opt.shortcutType or 'B' 25 | local iChannels 26 | 27 | -- The shortcut layer is either identity or 1x1 convolution 28 | local function shortcut(nInputPlane, nOutputPlane, stride) 29 | local useConv = shortcutType == 'C' or 30 | (shortcutType == 'B' and nInputPlane ~= nOutputPlane) 31 | if useConv then 32 | -- 1x1 convolution 33 | return nn.Sequential() 34 | :add(Convolution(nInputPlane, nOutputPlane, 1, 1, stride, stride)) 35 | elseif nInputPlane ~= nOutputPlane then 36 | -- Strided, zero-padded identity shortcut 37 | return nn.Sequential() 38 | :add(nn.SpatialAveragePooling(1, 1, stride, stride)) 39 | :add(nn.Concat(2) 40 | :add(nn.Identity()) 41 | :add(nn.MulConstant(0))) 42 | else 43 | return nn.Identity() 44 | end 45 | end 46 | 47 | -- Typically shareGradInput uses the same gradInput storage for all modules 48 | -- of the same type. This is incorrect for some SpatialBatchNormalization 49 | -- modules in this network b/c of the in-place CAddTable. 
This marks the 50 | -- module so that it's shared only with other modules with the same key 51 | local function ShareGradInput(module, key) 52 | assert(key) 53 | module.__shareGradInputKey = key 54 | return module 55 | end 56 | 57 | -- The basic residual layer block for 18 and 34 layer network, and the 58 | -- CIFAR networks 59 | local function basicblock(n, stride, type) 60 | local nInputPlane = iChannels 61 | iChannels = n 62 | 63 | local block = nn.Sequential() 64 | local s = nn.Sequential() 65 | if type == 'both_preact' then 66 | block:add(ShareGradInput(SBatchNorm(nInputPlane), 'preact')) 67 | block:add(ReLU(true)) 68 | elseif type ~= 'no_preact' then 69 | s:add(SBatchNorm(nInputPlane)) 70 | s:add(ReLU(true)) 71 | end 72 | s:add(Convolution(nInputPlane,n,3,3,stride,stride,1,1)) 73 | s:add(SBatchNorm(n)) 74 | s:add(ReLU(true)) 75 | s:add(Convolution(n,n,3,3,1,1,1,1)) 76 | 77 | return block 78 | :add(nn.ConcatTable() 79 | :add(s) 80 | :add(shortcut(nInputPlane, n, stride))) 81 | :add(nn.CAddTable(true)) 82 | end 83 | 84 | -- The bottleneck residual layer for 50, 101, and 152 layer networks 85 | local function bottleneck(n, stride, type) 86 | local nInputPlane = iChannels 87 | iChannels = n * 4 88 | 89 | local block = nn.Sequential() 90 | local s = nn.Sequential() 91 | if type == 'both_preact' then 92 | block:add(ShareGradInput(SBatchNorm(nInputPlane), 'preact')) 93 | block:add(ReLU(true)) 94 | elseif type ~= 'no_preact' then 95 | s:add(SBatchNorm(nInputPlane)) 96 | s:add(ReLU(true)) 97 | end 98 | s:add(Convolution(nInputPlane,n,1,1,1,1,0,0)) 99 | s:add(SBatchNorm(n)) 100 | s:add(ReLU(true)) 101 | s:add(Convolution(n,n,3,3,stride,stride,1,1)) 102 | s:add(SBatchNorm(n)) 103 | s:add(ReLU(true)) 104 | s:add(Convolution(n,n*4,1,1,1,1,0,0)) 105 | 106 | return block 107 | :add(nn.ConcatTable() 108 | :add(s) 109 | :add(shortcut(nInputPlane, n * 4, stride))) 110 | :add(nn.CAddTable(true)) 111 | end 112 | 113 | -- Creates count residual blocks with specified number of features 114 | local function layer(block, features, count, stride, type) 115 | local s = nn.Sequential() 116 | if count < 1 then 117 | return s 118 | end 119 | s:add(block(features, stride, 120 | type == 'first' and 'no_preact' or 'both_preact')) 121 | for i=2,count do 122 | s:add(block(features, 1)) 123 | end 124 | return s 125 | end 126 | 127 | local model = nn.Sequential() 128 | if opt.dataset == 'imagenet' then 129 | -- Configurations for ResNet: 130 | -- num. residual blocks, num features, residual block function 131 | local cfg = { 132 | [18] = {{2, 2, 2, 2}, 512, basicblock}, 133 | [34] = {{3, 4, 6, 3}, 512, basicblock}, 134 | [50] = {{3, 4, 6, 3}, 2048, bottleneck}, 135 | [101] = {{3, 4, 23, 3}, 2048, bottleneck}, 136 | [152] = {{3, 8, 36, 3}, 2048, bottleneck}, 137 | [200] = {{3, 24, 36, 3}, 2048, bottleneck}, 138 | } 139 | 140 | assert(cfg[depth], 'Invalid depth: ' .. tostring(depth)) 141 | local def, nFeatures, block = table.unpack(cfg[depth]) 142 | iChannels = 64 143 | print(' | ResNet-' .. depth .. 
' ImageNet') 144 | 145 | -- The ResNet ImageNet model 146 | model:add(Convolution(3,64,7,7,2,2,3,3)) 147 | model:add(SBatchNorm(64)) 148 | model:add(ReLU(true)) 149 | model:add(Max(3,3,2,2,1,1)) 150 | model:add(layer(block, 64, def[1], 1, 'first')) 151 | model:add(layer(block, 128, def[2], 2)) 152 | model:add(layer(block, 256, def[3], 2)) 153 | model:add(layer(block, 512, def[4], 2)) 154 | model:add(ShareGradInput(SBatchNorm(iChannels), 'last')) 155 | model:add(ReLU(true)) 156 | model:add(Avg(7, 7, 1, 1)) 157 | model:add(nn.View(nFeatures):setNumInputDims(3)) 158 | model:add(nn.Linear(nFeatures, 1000)) 159 | elseif opt.dataset == 'cifar10' then 160 | -- Model type specifies number of layers for CIFAR-10 model 161 | assert((depth - 2) % 6 == 0, 'depth should be one of 20, 32, 44, 56, 110, 1202') 162 | local n = (depth - 2) / 6 163 | iChannels = 16 164 | print(' | ResNet-' .. depth .. ' CIFAR-10') 165 | 166 | -- The ResNet CIFAR-10 model 167 | model:add(Convolution(3,16,3,3,1,1,1,1)) 168 | model:add(layer(basicblock, 16, n, 1)) 169 | model:add(layer(basicblock, 32, n, 2)) 170 | model:add(layer(basicblock, 64, n, 2)) 171 | model:add(ShareGradInput(SBatchNorm(iChannels), 'last')) 172 | model:add(ReLU(true)) 173 | model:add(Avg(8, 8, 1, 1)) 174 | model:add(nn.View(64):setNumInputDims(3)) 175 | model:add(nn.Linear(64, 10)) 176 | else 177 | error('invalid dataset: ' .. opt.dataset) 178 | end 179 | 180 | local function ConvInit(name) 181 | for k,v in pairs(model:findModules(name)) do 182 | local n = v.kW*v.kH*v.nOutputPlane 183 | v.weight:normal(0,math.sqrt(2/n)) 184 | if cudnn.version >= 4000 then 185 | v.bias = nil 186 | v.gradBias = nil 187 | else 188 | v.bias:zero() 189 | end 190 | end 191 | end 192 | local function BNInit(name) 193 | for k,v in pairs(model:findModules(name)) do 194 | v.weight:fill(1) 195 | v.bias:zero() 196 | end 197 | end 198 | 199 | ConvInit('cudnn.SpatialConvolution') 200 | ConvInit('nn.SpatialConvolution') 201 | BNInit('fbnn.SpatialBatchNormalization') 202 | BNInit('cudnn.SpatialBatchNormalization') 203 | BNInit('nn.SpatialBatchNormalization') 204 | for k,v in pairs(model:findModules('nn.Linear')) do 205 | v.bias:zero() 206 | end 207 | model:cuda() 208 | 209 | if opt.cudnn == 'deterministic' then 210 | model:apply(function(m) 211 | if m.setMode then m:setMode(1,1,1) end 212 | end) 213 | end 214 | 215 | model:get(1).gradInput = nil 216 | 217 | return model 218 | end 219 | 220 | return createModel 221 | -------------------------------------------------------------------------------- /torch/models/resnet.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
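--
-- A note on the shortcut options used here and in preresnet.lua above,
-- following the A/B/C naming of the original ResNet paper: with the default
-- shortcutType 'B' a 1x1 convolution is used only where the plane count
-- changes, 'C' always uses the convolution, and otherwise the shortcut is a
-- plain identity. Illustrative calls to the local shortcut() helper below:
--
--   shortcut(64, 64, 1)   -- 'B', same width: nn.Identity()
--   shortcut(64, 128, 2)  -- 'B', widening: 1x1 Convolution with stride 2 (plus SBatchNorm in this file)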
8 | -- 9 | -- The ResNet model definition 10 | -- 11 | 12 | local nn = require 'nn' 13 | require 'cunn' 14 | 15 | local Convolution = cudnn.SpatialConvolution 16 | local Avg = cudnn.SpatialAveragePooling 17 | local ReLU = cudnn.ReLU 18 | local Max = nn.SpatialMaxPooling 19 | local SBatchNorm = nn.SpatialBatchNormalization 20 | 21 | local function createModel(opt) 22 | local depth = opt.depth 23 | local shortcutType = opt.shortcutType or 'B' 24 | local iChannels 25 | 26 | -- The shortcut layer is either identity or 1x1 convolution 27 | local function shortcut(nInputPlane, nOutputPlane, stride) 28 | local useConv = shortcutType == 'C' or 29 | (shortcutType == 'B' and nInputPlane ~= nOutputPlane) 30 | if useConv then 31 | -- 1x1 convolution 32 | return nn.Sequential() 33 | :add(Convolution(nInputPlane, nOutputPlane, 1, 1, stride, stride)) 34 | :add(SBatchNorm(nOutputPlane)) 35 | elseif nInputPlane ~= nOutputPlane then 36 | -- Strided, zero-padded identity shortcut 37 | return nn.Sequential() 38 | :add(nn.SpatialAveragePooling(1, 1, stride, stride)) 39 | :add(nn.Concat(2) 40 | :add(nn.Identity()) 41 | :add(nn.MulConstant(0))) 42 | else 43 | return nn.Identity() 44 | end 45 | end 46 | 47 | -- The basic residual layer block for 18 and 34 layer network, and the 48 | -- CIFAR networks 49 | local function basicblock(n, stride) 50 | local nInputPlane = iChannels 51 | iChannels = n 52 | 53 | local s = nn.Sequential() 54 | s:add(Convolution(nInputPlane,n,3,3,stride,stride,1,1)) 55 | s:add(SBatchNorm(n)) 56 | s:add(ReLU(true)) 57 | s:add(Convolution(n,n,3,3,1,1,1,1)) 58 | s:add(SBatchNorm(n)) 59 | 60 | return nn.Sequential() 61 | :add(nn.ConcatTable() 62 | :add(s) 63 | :add(shortcut(nInputPlane, n, stride))) 64 | :add(nn.CAddTable(true)) 65 | :add(ReLU(true)) 66 | end 67 | 68 | -- The bottleneck residual layer for 50, 101, and 152 layer networks 69 | local function bottleneck(n, stride) 70 | local nInputPlane = iChannels 71 | iChannels = n * 4 72 | 73 | local s = nn.Sequential() 74 | s:add(Convolution(nInputPlane,n,1,1,1,1,0,0)) 75 | s:add(SBatchNorm(n)) 76 | s:add(ReLU(true)) 77 | s:add(Convolution(n,n,3,3,stride,stride,1,1)) 78 | s:add(SBatchNorm(n)) 79 | s:add(ReLU(true)) 80 | s:add(Convolution(n,n*4,1,1,1,1,0,0)) 81 | s:add(SBatchNorm(n * 4)) 82 | 83 | return nn.Sequential() 84 | :add(nn.ConcatTable() 85 | :add(s) 86 | :add(shortcut(nInputPlane, n * 4, stride))) 87 | :add(nn.CAddTable(true)) 88 | :add(ReLU(true)) 89 | end 90 | 91 | -- Creates count residual blocks with specified number of features 92 | local function layer(block, features, count, stride) 93 | local s = nn.Sequential() 94 | for i=1,count do 95 | s:add(block(features, i == 1 and stride or 1)) 96 | end 97 | return s 98 | end 99 | 100 | local model = nn.Sequential() 101 | if opt.dataset == 'imagenet' then 102 | -- Configurations for ResNet: 103 | -- num. residual blocks, num features, residual block function 104 | local cfg = { 105 | [18] = {{2, 2, 2, 2}, 512, basicblock}, 106 | [34] = {{3, 4, 6, 3}, 512, basicblock}, 107 | [50] = {{3, 4, 6, 3}, 2048, bottleneck}, 108 | [101] = {{3, 4, 23, 3}, 2048, bottleneck}, 109 | [152] = {{3, 8, 36, 3}, 2048, bottleneck}, 110 | } 111 | 112 | assert(cfg[depth], 'Invalid depth: ' .. tostring(depth)) 113 | local def, nFeatures, block = table.unpack(cfg[depth]) 114 | iChannels = 64 115 | print(' | ResNet-' .. depth .. 
' ImageNet') 116 | 117 | -- The ResNet ImageNet model 118 | model:add(Convolution(3,64,7,7,2,2,3,3)) 119 | model:add(SBatchNorm(64)) 120 | model:add(ReLU(true)) 121 | model:add(Max(3,3,2,2,1,1)) 122 | model:add(layer(block, 64, def[1])) 123 | model:add(layer(block, 128, def[2], 2)) 124 | model:add(layer(block, 256, def[3], 2)) 125 | model:add(layer(block, 512, def[4], 2)) 126 | model:add(Avg(7, 7, 1, 1)) 127 | model:add(nn.View(nFeatures):setNumInputDims(3)) 128 | model:add(nn.Linear(nFeatures, 1000)) 129 | elseif opt.dataset == 'cifar10' then 130 | -- Model type specifies number of layers for CIFAR-10 model 131 | assert((depth - 2) % 6 == 0, 'depth should be one of 20, 32, 44, 56, 110, 1202') 132 | local n = (depth - 2) / 6 133 | iChannels = 16 134 | print(' | ResNet-' .. depth .. ' CIFAR-10') 135 | 136 | -- The ResNet CIFAR-10 model 137 | model:add(Convolution(3,16,3,3,1,1,1,1)) 138 | model:add(SBatchNorm(16)) 139 | model:add(ReLU(true)) 140 | model:add(layer(basicblock, 16, n)) 141 | model:add(layer(basicblock, 32, n, 2)) 142 | model:add(layer(basicblock, 64, n, 2)) 143 | model:add(Avg(8, 8, 1, 1)) 144 | model:add(nn.View(64):setNumInputDims(3)) 145 | model:add(nn.Linear(64, 10)) 146 | else 147 | error('invalid dataset: ' .. opt.dataset) 148 | end 149 | 150 | local function ConvInit(name) 151 | for k,v in pairs(model:findModules(name)) do 152 | local n = v.kW*v.kH*v.nOutputPlane 153 | v.weight:normal(0,math.sqrt(2/n)) 154 | if cudnn.version >= 4000 then 155 | v.bias = nil 156 | v.gradBias = nil 157 | else 158 | v.bias:zero() 159 | end 160 | end 161 | end 162 | local function BNInit(name) 163 | for k,v in pairs(model:findModules(name)) do 164 | v.weight:fill(1) 165 | v.bias:zero() 166 | end 167 | end 168 | 169 | ConvInit('cudnn.SpatialConvolution') 170 | ConvInit('nn.SpatialConvolution') 171 | BNInit('fbnn.SpatialBatchNormalization') 172 | BNInit('cudnn.SpatialBatchNormalization') 173 | BNInit('nn.SpatialBatchNormalization') 174 | for k,v in pairs(model:findModules('nn.Linear')) do 175 | v.bias:zero() 176 | end 177 | model:cuda() 178 | 179 | if opt.cudnn == 'deterministic' then 180 | model:apply(function(m) 181 | if m.setMode then m:setMode(1,1,1) end 182 | end) 183 | end 184 | 185 | model:get(1).gradInput = nil 186 | 187 | return model 188 | end 189 | 190 | return createModel 191 | -------------------------------------------------------------------------------- /torch/models/vgg16.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- The RGB model definition 10 | -- 11 | -- Contributor: Gunnar Atli Sigurdsson 12 | 13 | local nn = require 'nn' 14 | require 'cunn' 15 | require 'loadcaffe' 16 | 17 | local function createModel(opt) 18 | local model = loadcaffe.load(opt.pretrainpath .. 'VGG_ILSVRC_16_layers_deploy.prototxt', opt.pretrainpath .. 'VGG_ILSVRC_16_layers.caffemodel','cudnn') 19 | 20 | print(' => Replacing classifier with ' .. opt.nClasses .. 
'-way classifier') 21 | 22 | model:remove(#model.modules) --remove softmax 23 | local orig = model:get(#model.modules) 24 | assert(torch.type(orig) == 'nn.Linear', 25 | 'expected last layer to be fully connected') 26 | 27 | local linear = nn.Linear(orig.weight:size(2), opt.nClasses) 28 | linear.name = "fc8" 29 | linear.bias:zero() 30 | 31 | model:remove(#model.modules) 32 | model:add(linear:cuda()) 33 | model:cuda() 34 | 35 | print(tostring(model)) 36 | if opt.cudnn == 'deterministic' then 37 | model:apply(function(m) 38 | if m.setMode then m:setMode(1,1,1) end 39 | end) 40 | end 41 | 42 | return model 43 | end 44 | 45 | return createModel 46 | -------------------------------------------------------------------------------- /torch/models/vgg16flow.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- The Flow model definition 10 | -- 11 | -- Contributor: Gunnar Atli Sigurdsson 12 | 13 | local nn = require 'nn' 14 | require 'cunn' 15 | require 'loadcaffe' 16 | 17 | local function createModel(opt) 18 | local model = loadcaffe.load(opt.pretrainpath .. 'VGG_UCF101_16_layers_deploy.prototxt', opt.pretrainpath .. 'VGG_UCF101_16_layers.caffemodel','cudnn') 19 | 20 | print(' => Replacing classifier with ' .. opt.nClasses .. '-way classifier') 21 | 22 | --model:remove(#model.modules) --remove softmax 23 | local orig = model:get(#model.modules) 24 | assert(torch.type(orig) == 'nn.Linear', 25 | 'expected last layer to be fully connected') 26 | 27 | local linear = nn.Linear(orig.weight:size(2), opt.nClasses) 28 | linear.name = "fc8" 29 | linear.bias:zero() 30 | 31 | model:remove(#model.modules) 32 | model:add(linear:cuda()) 33 | model:cuda() 34 | 35 | print(tostring(model)) 36 | if opt.cudnn == 'deterministic' then 37 | model:apply(function(m) 38 | if m.setMode then m:setMode(1,1,1) end 39 | end) 40 | end 41 | 42 | return model 43 | end 44 | 45 | return createModel 46 | -------------------------------------------------------------------------------- /torch/models/vgg16lstm.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- The LSTM RGB model definition 10 | -- 11 | -- Contributor: Gunnar Atli Sigurdsson 12 | 13 | local nn = require 'nn' 14 | require 'cunn' 15 | 16 | local function createModel(opt) 17 | local model = torch.load(opt.pretrainpath .. 'twostream_rgb.t7'):cuda() -- Load pretrained Two-Stream model 18 | 19 | print(' => Replacing classifier with ' .. opt.nClasses .. 
'-way classifier') 20 | 21 | local orig = model:get(#model.modules) 22 | assert(torch.type(orig) == 'nn.Linear', 23 | 'expected last layer to be fully connected') 24 | 25 | local lstm = cudnn.LSTM(4096,512,1,false) 26 | lstm.name = "fc8" 27 | local linear = nn.Linear(512, opt.nClasses) 28 | linear.name = "fc8" 29 | linear.bias:zero() 30 | 31 | model:remove(#model.modules) 32 | model:add(nn.View(1,4096)) 33 | model:add(lstm) 34 | model:add(nn.View(512)) 35 | model:add(linear:cuda()) 36 | model:cuda() 37 | 38 | print(tostring(model)) 39 | if opt.cudnn == 'deterministic' then 40 | model:apply(function(m) 41 | if m.setMode then m:setMode(1,1,1) end 42 | end) 43 | end 44 | 45 | return model 46 | end 47 | 48 | return createModel 49 | -------------------------------------------------------------------------------- /torch/models/vgg16lstmflow.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- The LSTM Flow model definition 10 | -- 11 | -- Contributor: Gunnar Atli Sigurdsson 12 | 13 | local nn = require 'nn' 14 | require 'cunn' 15 | 16 | local function createModel(opt) 17 | local model = torch.load(opt.pretrainpath .. 'twostream_flow.t7'):cuda() -- Load pretrained Two-Stream model 18 | 19 | print(' => Replacing classifier with ' .. opt.nClasses .. '-way classifier') 20 | 21 | local orig = model:get(#model.modules) 22 | assert(torch.type(orig) == 'nn.Linear', 23 | 'expected last layer to be fully connected') 24 | 25 | local lstm = cudnn.LSTM(4096,512,1,false) 26 | lstm.name = "fc8" 27 | local linear = nn.Linear(512, opt.nClasses) 28 | linear.name = "fc8" 29 | linear.bias:zero() 30 | 31 | model:remove(#model.modules) 32 | model:add(nn.View(1,4096)) 33 | model:add(lstm) 34 | model:add(nn.View(512)) 35 | model:add(linear:cuda()) 36 | model:cuda() 37 | 38 | print(tostring(model)) 39 | if opt.cudnn == 'deterministic' then 40 | model:apply(function(m) 41 | if m.setMode then m:setMode(1,1,1) end 42 | end) 43 | end 44 | 45 | return model 46 | end 47 | 48 | return createModel 49 | -------------------------------------------------------------------------------- /torch/opts.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
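-- [Editor's note] In vgg16lstm.lua and vgg16lstmflow.lua above, the pretrained
-- two-stream classifier is replaced by an LSTM head:
--   fc7 (4096-d) -> nn.View(1,4096) -> cudnn.LSTM(4096,512,1) -> nn.View(512) -> nn.Linear(512, nClasses)
-- Both the LSTM and the new Linear are named "fc8", so the -fc8LR option defined
-- below scales the learning rate of the entire new head.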
8 | -- 9 | local M = { } 10 | 11 | function M.parse(arg) 12 | local cmd = torch.CmdLine() 13 | cmd:text() 14 | cmd:text('Torch-7 Charades Two-Stream Training Script') 15 | cmd:text('Check out the README file for an overview, and the exp/ folder for training examples') 16 | cmd:text('See https://github.com/facebook/fb.resnet.torch/blob/master/TRAINING.md for examples') 17 | cmd:text() 18 | cmd:text('Options:') 19 | ------------ General options -------------------- 20 | cmd:option('-data', '/mnt/raid00/gunnars/Charades_v1_jpg/', 'Path to dataset') 21 | cmd:option('-trainfile', './Charades_v1_train.csv', 'Path to training annotations') 22 | cmd:option('-testfile', './Charades_v1_test.csv', 'Path to testing annotations') 23 | cmd:option('-cacheDir', '/mnt/raid00/gunnars/cache/', 'Path to model caches') 24 | cmd:option('-name', 'test', 'Experiment name') 25 | cmd:option('-dataset', 'charades', 'Options: imagenet | cifar10 | charades | charadesflow | charadessync | charadessyncflow') 26 | cmd:option('-setup', 'softmax', 'Options: softmax | sigmoid') 27 | cmd:option('-manualSeed', 0, 'Manually set RNG seed') 28 | cmd:option('-nGPU', 1, 'Number of GPUs to use by default') 29 | cmd:option('-backend', 'cudnn', 'Options: cudnn | cunn') 30 | cmd:option('-cudnn', 'default', 'Options: fastest | default | deterministic') 31 | cmd:option('-gen', 'gen', 'Path to save generated files') 32 | ------------- Data options ------------------------ 33 | cmd:option('-nThreads', 1, 'number of data loading threads') 34 | ------------- Training options -------------------- 35 | cmd:option('-nEpochs', 1, 'Number of total epochs to run') 36 | cmd:option('-epochNumber', 1, 'Manual epoch number (useful on restarts)') 37 | cmd:option('-epochSize', 1, 'Epoch size (Int | [0,1])') 38 | cmd:option('-testSize', 1, 'Size of test set (Int | [0,1])') 39 | cmd:option('-batchSize', 64, 'mini-batch size (1 = pure stochastic)') 40 | cmd:option('-testOnly', 'false', 'Run on validation set only') 41 | cmd:option('-dumpLocalize','false', 'Output localization') 42 | cmd:option('-tenCrop', 'false', 'Ten-crop testing') 43 | cmd:option('-accumGrad', 4, 'Accumulate gradient across N batches (increases effective batch size)') 44 | cmd:option('-solver', 'sgd', 'Solver to use. Options: sgd | adam')
45 | ------------- Checkpointing options --------------- 46 | cmd:option('-save', 'checkpoints', 'Directory in which to save checkpoints') 47 | cmd:option('-resume', 'none', 'Resume from the latest checkpoint in this directory') 48 | ---------- Optimization options ---------------------- 49 | cmd:option('-LR', 0.001, 'initial learning rate') 50 | cmd:option('-LR_decay_freq', 6, 'decay the LR by 10x every N epochs') 51 | cmd:option('-momentum', 0.9, 'momentum') 52 | cmd:option('-weightDecay', 5e-4, 'weight decay') 53 | cmd:option('-conv1LR', 1.0, 'convolution layer LR modifier') 54 | cmd:option('-conv2LR', 1.0, 'convolution layer LR modifier') 55 | cmd:option('-conv3LR', 1.0, 'convolution layer LR modifier') 56 | cmd:option('-conv4LR', 1.0, 'convolution layer LR modifier') 57 | cmd:option('-conv5LR', 1.0, 'convolution layer LR modifier') 58 | cmd:option('-fc8LR', 1.0, 'fc8 layer LR modifier') 59 | ---------- Model options ---------------------------------- 60 | cmd:option('-netType', 'vgg16','Options: resnet | preresnet | vgg16') 61 | cmd:option('-pretrainpath', './', 'Path to pretrained models') 62 | cmd:option('-depth', 34, 'ResNet depth: 18 | 34 | 50 | 101 | ...', 'number') 63 | cmd:option('-fc7_dropout', 0.5, 'Dropout rate after fc7 [0,1]') 64 | cmd:option('-marginal', 'mean', 'Type of inference (mean | max)') 65 | cmd:option('-shortcutType', '', 'Options: A | B | C') 66 | cmd:option('-retrain', 'none', 'Path to model to retrain with') 67 | cmd:option('-optimState', 'none', 'Path to an optimState to reload from') 68 | ---------- Fine-tuning and memory options ----------------- 69 | cmd:option('-shareGradInput', 'false', 'Share gradInput tensors to reduce memory usage') 70 | cmd:option('-optnet', 'true', 'Use optnet to reduce memory usage') 71 | cmd:option('-resetClassifier', 'false', 'Reset the fully connected layer for fine-tuning') 72 | cmd:option('-nClasses', 157, 'Number of classes in the dataset') 73 | cmd:text() 74 | 75 | print(arg) 76 | local opt = cmd:parse(arg or {}) 77 | opt.cacheDir = opt.cacheDir .. opt.name .. '/' -- per-experiment cache directory 78 | 79 | if not paths.dirp(opt.cacheDir) and not paths.mkdir(opt.cacheDir) then 80 | cmd:error('error: unable to create cache directory: ' .. opt.cacheDir .. '\n') 81 | end 82 | cmd:log(opt.cacheDir .. '/log.txt', opt) --start logging 83 | cmd:addTime(opt.name,'%F %T') 84 | 85 | opt.save = opt.cacheDir .. opt.save 86 | if not (string.sub(opt.gen,1,1)=='/') then 87 | -- If the path is not absolute, put it under opt.cacheDir 88 | opt.gen = opt.cacheDir .. opt.gen 89 | end 90 | 91 | opt.testOnly = opt.testOnly ~= 'false' 92 | opt.tenCrop = opt.tenCrop ~= 'false' 93 | opt.shareGradInput = opt.shareGradInput ~= 'false' 94 | opt.optnet = opt.optnet ~= 'false' 95 | opt.resetClassifier = opt.resetClassifier ~= 'false' 96 | opt.dumpLocalize = opt.dumpLocalize ~= 'false' 97 | 98 | if not paths.dirp(opt.save) and not paths.mkdir(opt.save) then 99 | cmd:error('error: unable to create checkpoint directory: ' .. opt.save .. '\n') 100 | end 101 | if not paths.dirp(opt.gen) and not paths.mkdir(opt.gen) then 102 | cmd:error('error: unable to create gen directory: ' .. opt.gen ..
'\n') 103 | end 104 | 105 | if opt.dataset == 'imagenet' then 106 | -- Handle the most common case of missing -data flag 107 | local trainDir = paths.concat(opt.data, 'train') 108 | if not paths.dirp(opt.data) then 109 | cmd:error('error: missing ImageNet data directory') 110 | elseif not paths.dirp(trainDir) then 111 | cmd:error('error: ImageNet missing `train` directory: ' .. trainDir) 112 | end 113 | -- Default shortcutType=B and nEpochs=90 114 | opt.shortcutType = opt.shortcutType == '' and 'B' or opt.shortcutType 115 | opt.nEpochs = opt.nEpochs == 0 and 90 or opt.nEpochs 116 | elseif opt.dataset == 'cifar10' then 117 | -- Default shortcutType=A and nEpochs=164 118 | opt.shortcutType = opt.shortcutType == '' and 'A' or opt.shortcutType 119 | opt.nEpochs = opt.nEpochs == 0 and 164 or opt.nEpochs 120 | elseif opt.dataset == 'charades' then 121 | if not paths.dirp(opt.data) then 122 | cmd:error('error: missing Charades data directory') 123 | end 124 | opt.nEpochs = opt.nEpochs == 0 and 1 or opt.nEpochs 125 | elseif opt.dataset == 'charadesflow' then 126 | if not paths.dirp(opt.data) then 127 | cmd:error('error: missing Charadesflow data directory') 128 | end 129 | opt.nEpochs = opt.nEpochs == 0 and 1 or opt.nEpochs 130 | elseif opt.dataset == 'charadessync' then 131 | if not paths.dirp(opt.data) then 132 | cmd:error('error: missing Charades data directory') 133 | end 134 | opt.nEpochs = opt.nEpochs == 0 and 1 or opt.nEpochs 135 | elseif opt.dataset == 'charadessyncflow' then 136 | if not paths.dirp(opt.data) then 137 | cmd:error('error: missing Charadesflow data directory') 138 | end 139 | opt.nEpochs = opt.nEpochs == 0 and 1 or opt.nEpochs 140 | else 141 | cmd:error('unknown dataset: ' .. opt.dataset) 142 | end 143 | 144 | if opt.resetClassifier then 145 | if opt.nClasses == 0 then 146 | cmd:error('-nClasses required when resetClassifier is set') 147 | end 148 | end 149 | 150 | if opt.shareGradInput and opt.optnet then 151 | cmd:error('error: cannot use both -shareGradInput and -optnet') 152 | end 153 | 154 | return opt 155 | end 156 | 157 | return M 158 | -------------------------------------------------------------------------------- /torch/train.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2016, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
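-- [Editor's note] Boolean flags in opts.lua above are parsed as strings and
-- converted with the pattern opt.flag = opt.flag ~= 'false', so any value other
-- than the literal string 'false' (including '1' or 'True') enables the flag, e.g.:
--   th main.lua -testOnly true -tenCrop false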
8 | -- 9 | -- The training loop and learning rate schedule 10 | -- 11 | -- Contributor: Gunnar Atli Sigurdsson 12 | 13 | local optim = require 'optim' 14 | 15 | local M = {} 16 | local Trainer = torch.class('resnet.Trainer', M) 17 | 18 | -- name of the modules in the same order as model:parameters() 19 | -- assumes a single nn.Sequential 20 | local function layer_names(model) 21 | local w = {} 22 | for i=1,#model.modules do 23 | local name = model.modules[i].name or "" 24 | local mw,_ = model.modules[i]:parameters() 25 | if mw then 26 | for k,_ in pairs(mw) do 27 | table.insert(w,name) 28 | end 29 | end 30 | end 31 | return w 32 | end 33 | 34 | function Trainer:__init(model, criterion, opt, optimState) 35 | self.model = model 36 | self.criterion = criterion 37 | optimState = optimState or { 38 | originalLR = opt.LR, 39 | learningRate = opt.LR, 40 | learningRateDecay = 0.0, 41 | momentum = opt.momentum, 42 | nesterov = true, 43 | dampening = 0.0, 44 | weightDecay = opt.weightDecay, 45 | } 46 | self.opt = opt 47 | self.params, self.gradParams = model:parameters() 48 | self.L = #self.params 49 | self.LR_decay_freq = opt.LR_decay_freq 50 | self.optimState = {} 51 | local names = layer_names(self.model) 52 | assert(#names==self.L) 53 | for i=1,self.L do 54 | local layername = names[i] or "" 55 | self.optimState[i] = {} 56 | for k,v in pairs(optimState) do 57 | self.optimState[i][k] = v 58 | end 59 | if string.find(layername, "conv1") then 60 | self.optimState[i].learningRate = opt.LR*opt.conv1LR 61 | end 62 | if string.find(layername, "conv2") then 63 | self.optimState[i].learningRate = opt.LR*opt.conv2LR 64 | end 65 | if string.find(layername, "conv3") then 66 | self.optimState[i].learningRate = opt.LR*opt.conv3LR 67 | end 68 | if string.find(layername, "conv4") then 69 | self.optimState[i].learningRate = opt.LR*opt.conv4LR 70 | end 71 | if string.find(layername, "conv5") then 72 | self.optimState[i].learningRate = opt.LR*opt.conv5LR 73 | end 74 | if string.find(layername, "fc8") then 75 | self.optimState[i].learningRate = opt.LR*opt.fc8LR 76 | end 77 | end 78 | end 79 | 80 | function Trainer:train(opt, epoch, dataloader) 81 | -- Trains the model for a single epoch 82 | 83 | local timer = torch.Timer() 84 | local dataTimer = torch.Timer() 85 | local LRM = self:learningRateModifier(epoch) 86 | for l=1,self.L do 87 | self.optimState[l].learningRate = self.optimState[l].originalLR*LRM 88 | end 89 | 90 | local function feval(i) 91 | return function () return self.criterion.output, self.gradParams[i] end 92 | end 93 | 94 | local trainSize = dataloader:size() 95 | local top1Sum, top5Sum, lossSum = 0.0, 0.0, 0.0 96 | local N = 0 97 | 98 | print('=> Training epoch # ' .. 
epoch) 99 | -- set the batch norm to training mode 100 | self.model:training() 101 | self.model:zeroGradParameters() 102 | for n, sample in dataloader:run() do 103 | local dataTime = dataTimer:time().real 104 | 105 | -- Copy input and target to the GPU 106 | self:copyInputs(sample) 107 | 108 | local output = self.model:forward(self.input):float() 109 | local batchSize = output:size(1) 110 | local loss = self.criterion:forward(self.model.output, self.target) 111 | 112 | if dataloader.synchronous then 113 | -- Sanity check: a synchronous batch must contain frames from a single video 114 | for i=1,batchSize-1 do 115 | assert(sample.ids[{{i}}]==sample.ids[{{i+1}}],"all samples in a synchronous batch must come from the same video") 116 | end 117 | end 118 | 119 | self.criterion:backward(self.model.output, self.target) 120 | self.model:backward(self.input, self.criterion.gradInput) 121 | --require('fb.debugger'):enter() 122 | 123 | if n % opt.accumGrad == 0 then -- apply the accumulated gradients every accumGrad batches 124 | for i=1,self.L do -- sgd on individual layers 125 | optim.sgd(feval(i), self.params[i], self.optimState[i]) 126 | end 127 | self.model:zeroGradParameters() 128 | end 129 | 130 | local top1, top5 = self:computeScore(output, sample.target, 1) 131 | top1Sum = top1Sum + top1*batchSize 132 | top5Sum = top5Sum + top5*batchSize 133 | lossSum = lossSum + loss*batchSize 134 | N = N + batchSize 135 | 136 | print(('%s | Epoch: [%d][%d/%d] Time %.3f Data %.3f Err %1.4f top1 %7.3f top5 %7.3f'):format( 137 | opt.name, epoch, n, trainSize, timer:time().real, dataTime, loss, top1, top5)) 138 | 139 | -- check that the storage didn't get changed due to an unfortunate getParameters call 140 | assert(self.params[1]:storage() == self.model:parameters()[1]:storage()) -- TODO: revisit whether this check is necessary
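      -- [Editor's note] On the -accumGrad option used above: with the defaults
      -- (-batchSize 64, -accumGrad 4), optim.sgd runs only every 4th mini-batch,
      -- on gradParams summed across those batches (zeroGradParameters is called
      -- only after the update), giving an effective batch size of 64*4 = 256.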
141 | 142 | timer:reset() 143 | dataTimer:reset() 144 | end 145 | 146 | return top1Sum / N, top5Sum / N, lossSum / N 147 | end 148 | 149 | function Trainer:test(opt, epoch, dataloader) 150 | -- Computes the top-1 and top-5 error on the validation set 151 | 152 | local timer = torch.Timer() 153 | local dataTimer = torch.Timer() 154 | local size = dataloader:size() 155 | 156 | local nCrops = self.opt.tenCrop and 10 or 1 157 | local top1Sum, top5Sum = 0.0, 0.0 158 | local N = 0 159 | 160 | self.model:evaluate() 161 | for n, sample in dataloader:run() do 162 | local dataTime = dataTimer:time().real 163 | 164 | -- Copy input and target to the GPU 165 | self:copyInputs(sample) 166 | 167 | local output = self.model:forward(self.input):float() 168 | local batchSize = output:size(1) / nCrops 169 | local loss = self.criterion:forward(self.model.output, self.target) 170 | 171 | local top1, top5 = self:computeScore(output, sample.target, nCrops) 172 | top1Sum = top1Sum + top1*batchSize 173 | top5Sum = top5Sum + top5*batchSize 174 | N = N + batchSize 175 | 176 | print(('%s | Test: [%d][%d/%d] Time %.3f Data %.3f top1 %7.3f (%7.3f) top5 %7.3f (%7.3f)'):format( 177 | opt.name, epoch, n, size, timer:time().real, dataTime, top1, top1Sum / N, top5, top5Sum / N)) 178 | 179 | timer:reset() 180 | dataTimer:reset() 181 | end 182 | self.model:training() 183 | 184 | print((' * Finished epoch # %d top1: %7.3f top5: %7.3f\n'):format( 185 | epoch, top1Sum / N, top5Sum / N)) 186 | 187 | return top1Sum / N, top5Sum / N 188 | end 189 | 190 | -- Torch port of THUMOSeventclspr in THUMOS'15 191 | local function mAP(conf, gt) 192 | local so,sortind = torch.sort(conf, 1, true) --descending order 193 | local tp = gt:index(1,sortind:view(-1)):eq(1):int() 194 | local fp = gt:index(1,sortind:view(-1)):eq(0):int() 195 | local npos = torch.sum(tp) 196 | 197 | fp = torch.cumsum(fp) 198 | tp = torch.cumsum(tp) 199 | local rec = tp:float()/npos 200 | local prec = torch.cdiv(tp:float(),(fp+tp):float()) 201 | 202 | local ap = 0 203 | local tmp = gt:index(1,sortind:view(-1)):eq(1):view(-1) 204 | for i=1,conf:size(1) do 205 | if tmp[i]==1 then 206 | ap = ap+prec[i] 207 | end 208 | end 209 | ap = ap/npos 210 | 211 | return rec,prec,ap 212 | end 213 | 214 | local function charades_ap(outputs, gt) 215 | -- approximate version of the Charades evaluation function 216 | -- For precise numbers, use the submission file with the official matlab script 217 | local conf = outputs:clone() 218 | conf[gt:sum(2):eq(0):expandAs(conf)] = -math.huge -- match the official matlab evaluation code, which omits videos with no annotations 219 | local ap = torch.Tensor(157,1) 220 | for i=1,157 do 221 | _,_,ap[{{i},{}}] = mAP(conf[{{},{i}}],gt[{{},{i}}]) 222 | end 223 | return ap 224 | end 225 | 226 | local function tensor2str(x) 227 | local str = "" 228 | for i=1,x:size(1) do 229 | if i == x:size(1) then 230 | str = str .. x[i] 231 | else 232 | str = str .. x[i] .. " "
" " 233 | end 234 | end 235 | return str 236 | end 237 | 238 | function Trainer:test2(opt, epoch, dataloader) 239 | -- Computes the mAP over the whole videos 240 | 241 | local timer = torch.Timer() 242 | local dataTimer = torch.Timer() 243 | local size = dataloader:size() 244 | 245 | local nCrops = 1 246 | local N = 0 247 | local outputs = torch.Tensor(2000,157) --allocate memory 248 | local gt = torch.Tensor(2000,157) --allocate memory 249 | local names = {} 250 | 251 | local frameoutputs, framenr, framenames, nframe 252 | if opt.dumpLocalize then 253 | frameoutputs = torch.Tensor(25*2000,157) 254 | framenames = {} 255 | framenr = {} 256 | nframe = 0 257 | end 258 | 259 | self.model:evaluate() 260 | n2 = 0 261 | for n, sample in dataloader:run() do 262 | n2 = n2 + 1 263 | local dataTime = dataTimer:time().real 264 | 265 | -- Copy input and target to the GPU 266 | self:copyInputs(sample) 267 | 268 | local output = self.model:forward(self.input):float() 269 | local batchSize = 25 270 | 271 | for i=1,25-1 do -- make sure there is no error in the loader, this should be one video 272 | assert(torch.all(torch.eq( 273 | sample.target[{{i},{}}], 274 | sample.target[{{i+1},{}}] 275 | ))) 276 | end 277 | 278 | local tmp = output:exp() 279 | tmp = tmp:cdiv(tmp:sum(2):expandAs(output)) 280 | outputs[{{n2},{}}] = tmp:mean(1) 281 | gt[{{n2},{}}] = sample.target[{{1},{}}] 282 | table.insert(names,sample.ids[1]) 283 | 284 | if opt.dumpLocalize then 285 | frameoutputs[{{nframe+1,nframe+25},{}}] = tmp 286 | for b=1,25 do 287 | framenames[nframe+b] = sample.ids[1] 288 | framenr[nframe+b] = b 289 | end 290 | nframe = nframe+25 291 | end 292 | 293 | print(('%s | Test2: [%d][%d/%d] Time %.3f Data %.3f'):format( 294 | opt.name, epoch, n, size, timer:time().real, dataTime)) 295 | 296 | timer:reset() 297 | dataTimer:reset() 298 | end 299 | self.model:training() 300 | outputs = outputs[{{1,n2},{}}] 301 | gt = gt[{{1,n2},{}}] 302 | ap = charades_ap(outputs, gt) 303 | 304 | print((' * Finished epoch # %d mAP: %7.3f\n'):format( 305 | epoch, torch.mean(ap))) 306 | 307 | print('dumping output to file') 308 | local out = assert(io.open(self.opt.save .. "/epoch" .. epoch .. ".txt", "w")) 309 | for i=1,outputs:size(1) do 310 | out:write(names[i] .. " " .. tensor2str(outputs[{{i},{}}]:view(-1)) .. "\n") 311 | end 312 | out:close() 313 | 314 | if opt.dumpLocalize then 315 | print('dumping localization output to file') 316 | frameoutputs = frameoutputs[{{1,nframe},{}}] 317 | local out = assert(io.open(self.opt.save .. "/localize" .. epoch .. ".txt", "w")) 318 | for i=1,frameoutputs:size(1) do 319 | f = framenr[i] 320 | vidid = framenames[i] 321 | out:write(vidid .. " " .. f .. " " .. tensor2str(frameoutputs[{{i},{}}]:view(-1)) .. 
"\n") 322 | end 323 | out:close() 324 | end 325 | 326 | return ap 327 | end 328 | 329 | 330 | function Trainer:computeScore(output, target, nCrops) 331 | if nCrops > 1 then 332 | -- Sum over crops 333 | output = output:view(output:size(1) / nCrops, nCrops, output:size(2)) 334 | --:exp() 335 | :sum(2):squeeze(2) 336 | end 337 | 338 | -- Coputes the top1 and top5 error rate 339 | local batchSize = output:size(1) 340 | 341 | local _ , predictions = output:float():sort(2, true) -- descending 342 | 343 | -- Find which predictions match the target 344 | local correct = predictions:eq( 345 | target:long():view(batchSize, 1):expandAs(output)) 346 | 347 | -- Top-1 score 348 | local top1 = 1.0 - (correct:narrow(2, 1, 1):sum() / batchSize) 349 | 350 | -- Top-5 score, if there are at least 5 classes 351 | local len = math.min(5, correct:size(2)) 352 | local top5 = 1.0 - (correct:narrow(2, 1, len):sum() / batchSize) 353 | 354 | return top1 * 100, top5 * 100 355 | end 356 | 357 | function Trainer:copyInputs(sample) 358 | -- Copies the input to a CUDA tensor, if using 1 GPU, or to pinned memory, 359 | -- if using DataParallelTable. The target is always copied to a CUDA tensor 360 | self.input = self.input or (self.opt.nGPU == 1 361 | and torch.CudaTensor() 362 | or cutorch.createCudaHostTensor()) 363 | self.target = self.target or torch.CudaTensor() 364 | 365 | self.input:resize(sample.input:size()):copy(sample.input) 366 | self.target:resize(sample.target:size()):copy(sample.target) 367 | end 368 | 369 | function Trainer:learningRateModifier(epoch) 370 | -- Training schedule 371 | local decay = 0 372 | if self.opt.dataset == 'charades' then 373 | decay = math.floor((epoch - 1) / self.LR_decay_freq) 374 | elseif self.opt.dataset == 'charadesflow' then 375 | decay = math.floor((epoch - 1) / self.LR_decay_freq) 376 | elseif self.opt.dataset == 'imagenet' then 377 | decay = math.floor((epoch - 1) / 30) 378 | elseif self.opt.dataset == 'cifar10' then 379 | decay = epoch >= 122 and 2 or epoch >= 81 and 1 or 0 380 | else 381 | decay = math.floor((epoch - 1) / self.LR_decay_freq) 382 | end 383 | return math.pow(0.1, decay) 384 | end 385 | 386 | 387 | 388 | return M.Trainer 389 | --------------------------------------------------------------------------------