├── .gitignore ├── .gitmodules ├── .theanorc.example ├── LICENSE ├── README.md ├── cocoeval.py ├── common.py ├── config.py ├── create_movies.py ├── data ├── README.md ├── __init__.py ├── create_dataset.py ├── create_msr_vtt.py ├── create_mvad_mpii_lsmdc.py ├── create_skip_vectors.py ├── create_tacos.py ├── create_trecvid.py ├── create_y2t.py ├── process_frames.py ├── process_pca.py ├── py3_process_features.py ├── subsect_videos.py ├── util.py └── validate_feats.py ├── data_engine.py ├── download.py ├── hyperband.py ├── metrics.py ├── model_attention.py ├── model_lstmdd.py ├── model_mtle.py ├── py2-vid-desc_requirements.txt ├── py2_pip_freeze.txt ├── py3-vid-desc_requirements.txt ├── py3_pip_freeze.txt └── train_model.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | .idea/* 60 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "coco-caption"] 2 | path = coco-caption 3 | url = https://github.com/tylin/coco-caption.git 4 | [submodule "jobman"] 5 | path = jobman 6 | url = https://github.com/crmne/jobman.git 7 | [submodule "data/skip-thoughts"] 8 | path = data/skip-thoughts 9 | url = https://github.com/olivernina/skip-thoughts.git 10 | -------------------------------------------------------------------------------- /.theanorc.example: -------------------------------------------------------------------------------- 1 | [global] 2 | device = gpu0 3 | floatX = float32 4 | 5 | [dnn] 6 | enabled = True 7 | 8 | [nvcc] 9 | flags = -D_FORCE_INLINES 10 | 11 | [lib] 12 | cnmem = 10500 13 | # Use 11200 for training anything with an entire dataset 14 | #cnmem = 11200 15 | # Use 5500 for HYPERBAND 16 | #cnmem=5500 17 | 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 OSU Photogammetric Computer Vision Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice 
and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MTLE 2 | 3 | This is the latest version of our code described in our [paper](https://arxiv.org/abs/1809.07257). An earlier version of our code was used at LSMDC17 where we won the movie description task. 4 | 5 | ## Dependencies 6 | 7 | These are the general, high-level dependencies: 8 | 9 | - CUDA-capable GPU(s) 10 | - Large storage medium for dataset videos (for re-creating results) 11 | - Python 2.7 + Python 3.5 (both required, more info below) 12 | 13 | For Python 2.7: 14 | 15 | - `Theano 0.8.1` 16 | - `cuDNN 5.4` 17 | - `CNMeM Memory backend` (optional) 18 | - Working `.theanorc` config file (provided, more info below) 19 | 20 | For Python 3.5 and above: 21 | 22 | - `PyTorch 0.4.0` (torch + torchvision) 23 | - `pretrainedmodels` (Cadene repository) 24 | 25 | In-depth Python package information is provided for each respective environment: 26 | 27 | - `py2-vid-desc_requirements.txt` 28 | - `py2_pip_freeze.txt` 29 | - `py3-vid-desc_requirements.txt` 30 | - `py3_pip_freeze.txt` 31 | 32 | The rest of this guide assumes you are using Linux. 33 | 34 | We recommend the use of Anaconda to handle dependencies, as the files above can be used to easily re-create the necessary environments. 35 | 36 | Theano uses a file called `.theanorc` to configure certain options. This file goes in your home directory on Linux. We have provided one that we use on a working test system, called `.theanorc.example`. 37 | 38 | #### Why two Python versions? 39 | 40 | We use the `pretrainedmodels` package provided by GitHub user Cadene, due to its ease of use and better portability over Caffe. 41 | However, this means having to use Python 3 for this specific step. Everything else uses Python 2.7. 42 | We thought this was a worthwhile hurdle to take advantage of PyTorch's ease of installation. 43 | 44 | 45 | ## Installation 46 | 47 | We recommend Anaconda, available [here](https://www.anaconda.com/download/). 48 | 49 | Clone the code as follows: 50 | 51 | ` git clone https://github.com/OSUPCVLab/VideoToTextDNN.git --recursive` 52 | 53 | Once Anaconda is installed, you must create two anaconda environments: 54 | 55 | The general-purpose one: 56 | 57 | `conda create --name vid-desc python=2.7 --file py2-vid-desc_requirements.txt` 58 | 59 | and one for feature extraction: 60 | 61 | `conda create --name vid-desc-feats python=3.6 --file py3-vid-desc_requirements.txt` 62 | 63 | Use `conda activate ` to switch between environments. 
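As an optional sanity check (our suggestion, not a required step), the short script below can be run once in each environment to confirm which stack that environment provides. It assumes only the packages listed above; nothing in the repo depends on it.

```python
# check_env.py -- hypothetical helper: run once in each conda environment.
# Reports whether the Theano (py2) or PyTorch (py3) stack is importable there.
def report():
    try:
        import theano
        print('Theano %s, device=%s, floatX=%s' %
              (theano.__version__, theano.config.device, theano.config.floatX))
    except ImportError:
        print('Theano not importable here (expected in the feature-extraction env)')
    try:
        import torch
        import pretrainedmodels
        print('PyTorch %s, CUDA available: %s, %d pretrained models exposed' %
              (torch.__version__, torch.cuda.is_available(), len(pretrainedmodels.model_names)))
    except ImportError:
        print('PyTorch / pretrainedmodels not importable here (expected in the vid-desc env)')

if __name__ == '__main__':
    report()
```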
64 | 65 | If you have not already done so, clone the repo recursively so that the required submodules are also cloned: 66 | 67 | `git clone https://github.com/OSUPCVLab/VideoToTextDNN.git --recursive` 68 | 69 | Install the required packages for the project: 70 | 71 | `pip install -r py2_pip_freeze.txt` 72 | 73 | You might see some complaints about the following packages, so you will need to install them manually: 74 | 75 | `conda install -c conda-forge pyro4` 76 | 77 | For client installation, the following modules are also needed: 78 | 79 | `python -m pip install --upgrade mss` 80 | 81 | `conda install -c https://conda.anaconda.org/menpo opencv3` 82 | 83 | `pip install pyttsx3` 84 | 85 | `pip install pretrainedmodels` 86 | 87 | 88 | 89 | ## Data 90 | 91 | The data pipeline is handled under the `data/` directory. The `README.md` file there describes in detail how to download the necessary datasets and process them for consumption. 92 | 93 | 94 | 95 | ## Tutorial 96 | 97 | Since the data collection process can take from minutes to weeks, depending on your available hardware, we have split the tutorial into two paths. 98 | 99 | Visit the file `data/README.md` and follow the path most interesting to you to prepare the data. Once you have your data files ready, come back to this file to perform training or prediction on the path relevant to you. 100 | 101 | The rest of this tutorial assumes you have either 1) extracted feature files, or 2) created `.pkl` files, as described in `data/README.md`. If there are any problems, feel free to file an issue on this repo, as this release is still a work in progress. 102 | 103 | 104 | With your extracted features ready, you will need a pre-trained model. We have provided two checkpoint files, one trained on the 10k video MSVD Youtube-based dataset, and another on the 120k video LSMDC16 Movie dataset. 105 | 106 | MSVD and LSMDC Checkpoint: [download](https://uflorida-my.sharepoint.com/:f:/g/personal/w_garcia_ufl_edu/Ev7InIZkYc5Pn91wlU3oK1gB_NQ6BAArSll4iFELl8Hj2w?e=vad0K7) 107 | 108 | 109 | ## Demo 110 | For a live demo, we use a server running our system and a client that extracts and submits CNN features to the server. 111 | 112 | To start the server, just run the script: 113 | 114 | `python live_mtle_server.py ` 115 | 116 | The server will then output a `temp_uri` string, which you pass to the client so it knows where to send its input. 117 | 118 | To run the client, execute the following script with the mode you want. There are three modes: `live` (screen capture), `prompt` (you pass the path of a video), and `headless` (you pass a list of videos to process): 119 | 120 | `python live_mtle_client.py --mode ` 121 | 122 | 123 | ## Acknowledgements 124 | Big thanks to Li Yao and his original project [arctic-capgen-vid](https://github.com/yaoli/arctic-capgen-vid), from which this project derives. 125 | 126 | This work was sponsored by the SMART DOD program. 127 | 128 | We apologize for the delay in releasing the code. The main author encountered some difficulties and life events leading up to the public release of the paper, which made it difficult to release the paper and code sooner.
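As an optional follow-up to the Tutorial section above: before training, it can help to peek at the pickle files produced by the `data/` scripts. The snippet below is only a rough sketch, not a utility shipped with this repo. It assumes a Python 2 environment, and `pkl_dir` (the example path is hypothetical) should point at the output directory you passed to one of the `data/create_*.py` scripts; the file names follow the conventions those scripts use.

```python
# Minimal sketch: inspect the dataset pickles produced by the data/ pipeline.
import os
import cPickle as pickle

pkl_dir = 'data/vtt/pkls2016_googlenet'  # hypothetical path -- use your own output dir

def load(name):
    with open(os.path.join(pkl_dir, name), 'rb') as f:
        return pickle.load(f)

caps = load('CAP.pkl')           # {video_id: [{'caption', 'tokenized', 'cap_id', 'image_id'}, ...]}
train_ids = load('train.pkl')    # ['<video_id>_<cap_id>', ...]
worddict = load('worddict.pkl')  # vocabulary built by create_dictionary()

print('%d captioned videos, %d training pairs, %d vocabulary entries' %
      (len(caps), len(train_ids), len(worddict)))

vid = sorted(caps.keys())[0]
print('%s: %s' % (vid, caps[vid][0]['tokenized']))
```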
129 | -------------------------------------------------------------------------------- /cocoeval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(1,'coco-caption') 3 | 4 | from pycocoevalcap.bleu.bleu import Bleu 5 | from pycocoevalcap.rouge.rouge import Rouge 6 | from pycocoevalcap.cider.cider import Cider 7 | from pycocoevalcap.meteor.meteor import Meteor 8 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 9 | import os, cPickle 10 | 11 | class COCOScorer(object): 12 | def __init__(self): 13 | print 'init COCO-EVAL scorer' 14 | 15 | def score(self, GT, RES, IDs): 16 | self.eval = {} 17 | self.imgToEval = {} 18 | gts = {} 19 | res = {} 20 | for ID in IDs: 21 | gts[ID] = GT[ID] 22 | res[ID] = RES[ID] 23 | print 'tokenization...' 24 | tokenizer = PTBTokenizer() 25 | gts = tokenizer.tokenize(gts) 26 | res = tokenizer.tokenize(res) 27 | 28 | # ================================================= 29 | # Set up scorers 30 | # ================================================= 31 | print 'setting up scorers...' 32 | scorers = [ 33 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 34 | (Meteor(),"METEOR"), 35 | (Rouge(), "ROUGE_L"), 36 | (Cider(), "CIDEr") 37 | ] 38 | 39 | # ================================================= 40 | # Compute scores 41 | # ================================================= 42 | eval = {} 43 | for scorer, method in scorers: 44 | print 'computing %s score...'%(scorer.method()) 45 | score, scores = scorer.compute_score(gts, res) 46 | if type(method) == list: 47 | for sc, scs, m in zip(score, scores, method): 48 | self.setEval(sc, m) 49 | self.setImgToEvalImgs(scs, IDs, m) 50 | print "%s: %0.3f"%(m, sc) 51 | else: 52 | self.setEval(score, method) 53 | self.setImgToEvalImgs(scores, IDs, method) 54 | print "%s: %0.3f"%(method, score) 55 | 56 | for metric, score in self.eval.items(): 57 | print '%s: %.5f'%(metric, score) 58 | return self.eval 59 | 60 | def setEval(self, score, method): 61 | self.eval[method] = score 62 | 63 | def setImgToEvalImgs(self, scores, imgIds, method): 64 | for imgId, score in zip(imgIds, scores): 65 | if not imgId in self.imgToEval: 66 | self.imgToEval[imgId] = {} 67 | self.imgToEval[imgId]["image_id"] = imgId 68 | self.imgToEval[imgId][method] = score 69 | 70 | 71 | def load_pkl(path): 72 | f = open(path, 'rb') 73 | try: 74 | rval = cPickle.load(f) 75 | finally: 76 | f.close() 77 | return rval 78 | 79 | def score(ref, sample): 80 | # ref and sample are both dict 81 | scorers = [ 82 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 83 | (Meteor(),"METEOR"), 84 | (Rouge(), "ROUGE_L"), 85 | (Cider(), "CIDEr") 86 | ] 87 | final_scores = {} 88 | for scorer, method in scorers: 89 | print 'computing %s score with COCO-EVAL...'%(scorer.method()) 90 | score, scores = scorer.compute_score(ref, sample) 91 | if type(score) == list: 92 | for m, s in zip(method, score): 93 | final_scores[m] = s 94 | else: 95 | final_scores[method] = score 96 | return final_scores 97 | 98 | def test_cocoscorer(): 99 | '''gts = { 100 | 184321:[ 101 | {u'image_id': 184321, u'id': 352188, u'caption': u'A train traveling down-tracks next to lights.'}, 102 | {u'image_id': 184321, u'id': 356043, u'caption': u"A blue and silver train next to train's station and trees."}, 103 | {u'image_id': 184321, u'id': 356382, u'caption': u'A blue train is next to a sidewalk on the rails.'}, 104 | {u'image_id': 184321, u'id': 361110, u'caption': u'A passenger train pulls into a train station.'}, 105 | 
{u'image_id': 184321, u'id': 362544, u'caption': u'A train coming down the tracks arriving at a station.'}], 106 | 81922: [ 107 | {u'image_id': 81922, u'id': 86779, u'caption': u'A large jetliner flying over a traffic filled street.'}, 108 | {u'image_id': 81922, u'id': 90172, u'caption': u'An airplane flies low in the sky over a city street. '}, 109 | {u'image_id': 81922, u'id': 91615, u'caption': u'An airplane flies over a street with many cars.'}, 110 | {u'image_id': 81922, u'id': 92689, u'caption': u'An airplane comes in to land over a road full of cars'}, 111 | {u'image_id': 81922, u'id': 823814, u'caption': u'The plane is flying over top of the cars'}] 112 | } 113 | 114 | samples = { 115 | 184321: [{u'image_id': 184321, 'id': 111, u'caption': u'train traveling down a track in front of a road'}], 116 | 81922: [{u'image_id': 81922, 'id': 219, u'caption': u'plane is flying through the sky'}], 117 | } 118 | ''' 119 | gts = { 120 | '184321':[ 121 | {u'image_id': '184321', u'cap_id': 0, u'caption': u'A train traveling down tracks next to lights.', 122 | 'tokenized': 'a train traveling down tracks next to lights'}, 123 | {u'image_id': '184321', u'cap_id': 1, u'caption': u'A train coming down the tracks arriving at a station.', 124 | 'tokenized': 'a train coming down the tracks arriving at a station'}], 125 | '81922': [ 126 | {u'image_id': '81922', u'cap_id': 0, u'caption': u'A large jetliner flying over a traffic filled street.', 127 | 'tokenized': 'a large jetliner flying over a traffic filled street'}, 128 | {u'image_id': '81922', u'cap_id': 1, u'caption': u'The plane is flying over top of the cars', 129 | 'tokenized': 'the plan is flying over top of the cars'},] 130 | } 131 | 132 | samples = { 133 | '184321': [{u'image_id': '184321', u'caption': u'train traveling down a track in front of a road'}], 134 | '81922': [{u'image_id': '81922', u'caption': u'plane is flying through the sky'}], 135 | } 136 | IDs = ['184321', '81922'] 137 | scorer = COCOScorer() 138 | scorer.score(gts, samples, IDs) 139 | 140 | if __name__ == '__main__': 141 | test_cocoscorer() 142 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | import cPickle, os 2 | 3 | import numpy 4 | from collections import OrderedDict 5 | import theano 6 | import theano.tensor as tensor 7 | from theano.sandbox.rng_mrg import MRG_RandomStreams 8 | from theano import config 9 | 10 | # the dir where there should be a subdir named 'youtube2text_iccv15' 11 | 12 | RAB_DATASET_BASE_PATH = './data/' 13 | # RAB_DATASET_BASE_PATH = '/media/onina/sea2/datasets/lsmdc/out_pkl/' 14 | 15 | 16 | # the dir where all the experiment data is dumped. 17 | RAB_EXP_PATH = 'results/' 18 | 19 | 20 | def numpy_floatX(data): 21 | return numpy.asarray(data, dtype=config.floatX) 22 | 23 | def get_two_rngs(seed=None): 24 | if seed is None: 25 | seed = 1234 26 | else: 27 | seed = seed 28 | rng_numpy = numpy.random.RandomState(seed) 29 | rng_theano = MRG_RandomStreams(seed) 30 | return rng_numpy, rng_theano 31 | 32 | rng_numpy, rng_theano = get_two_rngs() 33 | 34 | def concatenate(tensor_list, axis=0): 35 | """ 36 | Alternative implementation of `theano.tensor.concatenate`. 37 | This function does exactly the same thing, but contrary to Theano's own 38 | implementation, the gradient is implemented on the GPU. 
39 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 40 | because the inverse operation (splitting) needs to be done on the CPU. 41 | This implementation does not have that problem. 42 | :usage: 43 | >>> x, y = theano.tensor.matrices('x', 'y') 44 | >>> c = concatenate([x, y], axis=1) 45 | :parameters: 46 | - tensor_list : list 47 | list of Theano tensor expressions that should be concatenated. 48 | - axis : int 49 | the tensors will be joined along this axis. 50 | :returns: 51 | - out : tensor 52 | the concatenated tensor expression. 53 | """ 54 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 55 | 56 | output_shape = () 57 | for k in range(axis): 58 | output_shape += (tensor_list[0].shape[k],) 59 | output_shape += (concat_size,) 60 | for k in range(axis + 1, tensor_list[0].ndim): 61 | output_shape += (tensor_list[0].shape[k],) 62 | 63 | out = tensor.zeros(output_shape) 64 | offset = 0 65 | for tt in tensor_list: 66 | indices = () 67 | for k in range(axis): 68 | indices += (slice(None),) 69 | indices += (slice(offset, offset + tt.shape[axis]),) 70 | for k in range(axis + 1, tensor_list[0].ndim): 71 | indices += (slice(None),) 72 | 73 | out = tensor.set_subtensor(out[indices], tt) 74 | offset += tt.shape[axis] 75 | 76 | return out 77 | ''' 78 | Theano shared variables require GPUs, so to 79 | make this code more portable, these two functions 80 | push and pull variables between a shared 81 | variable dictionary and a regular numpy 82 | dictionary 83 | ''' 84 | # push parameters to Theano shared variables 85 | def zipp(params, tparams): 86 | for kk, vv in params.iteritems(): 87 | tparams[kk].set_value(vv) 88 | 89 | # pull parameters from Theano shared variables 90 | def unzip(zipped): 91 | new_params = OrderedDict() 92 | for kk, vv in zipped.iteritems(): 93 | new_params[kk] = vv.get_value() 94 | return new_params 95 | 96 | # get the list of parameters: Note that tparams must be OrderedDict 97 | def itemlist(tparams): 98 | return [vv for kk, vv in tparams.iteritems()] 99 | 100 | # dropout 101 | def dropout_layer(state_before, use_noise, trng): 102 | proj = tensor.switch(use_noise, 103 | state_before * 104 | trng.binomial(state_before.shape, p=0.5, n=1, dtype=state_before.dtype), 105 | state_before * 0.5) 106 | return proj 107 | 108 | 109 | # initialize Theano shared variables according to the initial parameters 110 | def init_tparams(params): 111 | tparams = OrderedDict() 112 | for kk, pp in params.iteritems(): 113 | tparams[kk] = theano.shared(params[kk], name=kk) 114 | return tparams 115 | 116 | # some utilities 117 | def ortho_weight(ndim): 118 | """ 119 | Random orthogonal weights, we take 120 | the right matrix in the SVD. 121 | 122 | Remember in SVD, u has the same # rows as W 123 | and v has the same # of cols as W. So we 124 | are ensuring that the rows are 125 | orthogonal. 
126 | """ 127 | W = rng_numpy.randn(ndim, ndim) 128 | u, _, _ = numpy.linalg.svd(W) 129 | return u.astype('float32') 130 | 131 | def norm_weight(nin,nout=None, scale=0.01, ortho=True): 132 | """ 133 | Random weights drawn from a Gaussian 134 | """ 135 | if nout == None: 136 | nout = nin 137 | if nout == nin and ortho: 138 | W = ortho_weight(nin) 139 | else: 140 | W = scale * rng_numpy.randn(nin, nout) 141 | return W.astype('float32') 142 | 143 | def tanh(x): 144 | return tensor.tanh(x) 145 | 146 | def rectifier(x): 147 | return tensor.maximum(0., x) 148 | 149 | def linear(x): 150 | return x 151 | 152 | # load parameters 153 | def load_params(path, params): 154 | pp = numpy.load(path) 155 | for kk, vv in params.iteritems(): 156 | if kk not in pp: 157 | raise Warning('%s is not in the archive'%kk) 158 | params[kk] = pp[kk] 159 | return params 160 | 161 | def grad_nan_report(grads, tparams): 162 | numpy.set_printoptions(precision=3) 163 | D = OrderedDict() 164 | i = 0 165 | NaN_keys = [] 166 | magnitude = [] 167 | assert len(grads) == len(tparams) 168 | for k, v in tparams.iteritems(): 169 | grad = grads[i] 170 | magnitude.append(numpy.abs(grad).mean()) 171 | if numpy.isnan(grad.sum()): 172 | NaN_keys.append(k) 173 | #assert v.get_value().shape == grad.shape 174 | D[k] = grad 175 | i += 1 176 | #norm = [numpy.sqrt(numpy.sum(grad**2)) for grad in grads] 177 | #print '\tgrad mean(abs(x))', numpy.array(magnitude) 178 | return D, NaN_keys 179 | 180 | # optimizers 181 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 182 | def adadelta(lr, tparams, grads, inp, cost, extra): 183 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad'%k) for k, p in tparams.iteritems()] 184 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2'%k) for k, p in tparams.iteritems()] 185 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2'%k) for k, p in tparams.iteritems()] 186 | 187 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 188 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)] 189 | 190 | f_grad_shared = theano.function(inp, [cost] + extra, updates=zgup+rg2up, 191 | profile=False, on_unused_input='ignore',allow_input_downcast=True) 192 | 193 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)] 194 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)] 195 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 196 | 197 | f_update = theano.function([lr], [], updates=ru2up+param_up, on_unused_input='ignore', profile=False) 198 | 199 | return f_grad_shared, f_update 200 | 201 | def adam(lr, tparams, grads, inp, cost): 202 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()] 203 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 204 | 205 | f_grad_shared = theano.function(inp, cost, updates=gsup) 206 | 207 | lr0 = 0.0002 208 | b1 = 0.1 209 | b2 = 0.001 210 | e = 1e-8 211 | 212 | updates = [] 213 | 214 | i = theano.shared(numpy_floatX(0.)) 215 | i_t = i + 1. 216 | fix1 = 1. - b1**(i_t) 217 | fix2 = 1. - b2**(i_t) 218 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1) 219 | 220 | for p, g in zip(tparams.values(), gshared): 221 | m = theano.shared(p.get_value() * 0.) 222 | v = theano.shared(p.get_value() * 0.) 223 | m_t = (b1 * g) + ((1. - b1) * m) 224 | v_t = (b2 * tensor.sqr(g)) + ((1. 
- b2) * v) 225 | g_t = m_t / (tensor.sqrt(v_t) + e) 226 | p_t = p - (lr_t * g_t) 227 | updates.append((m, m_t)) 228 | updates.append((v, v_t)) 229 | updates.append((p, p_t)) 230 | updates.append((i, i_t)) 231 | 232 | f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore') 233 | 234 | return f_grad_shared, f_update 235 | 236 | def rmsprop(lr, tparams, grads, inp, cost): 237 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad'%k) for k, p in tparams.iteritems()] 238 | running_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad'%k) for k, p in tparams.iteritems()] 239 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2'%k) for k, p in tparams.iteritems()] 240 | 241 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 242 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 243 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)] 244 | 245 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, profile=False) 246 | 247 | updir = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_updir'%k) for k, p in tparams.iteritems()] 248 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)] 249 | param_up = [(p, p + udn[1]) for p, udn in zip(itemlist(tparams), updir_new)] 250 | f_update = theano.function([lr], [], updates=updir_new+param_up, on_unused_input='ignore', profile=False) 251 | 252 | return f_grad_shared, f_update 253 | 254 | def sgd(lr, tparams, grads, inp, cost): 255 | gshared = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad'%k) for k, p in tparams.iteritems()] 256 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 257 | 258 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=False) 259 | 260 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 261 | f_update = theano.function([lr], [], updates=pup, profile=False) 262 | 263 | return f_grad_shared, f_update 264 | 265 | def load_pkl(path): 266 | """ 267 | Load a pickled file. 268 | 269 | :param path: Path to the pickled file. 270 | 271 | :return: The unpickled Python object. 272 | """ 273 | f = open(path, 'rb') 274 | try: 275 | rval = cPickle.load(f) 276 | finally: 277 | f.close() 278 | return rval 279 | 280 | def dump_pkl(obj, path): 281 | """ 282 | Save a Python object into a pickle file. 
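
:param obj: The Python object to pickle.
:param path: Path of the pickle file to write.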
283 | """ 284 | f = open(path, 'wb') 285 | try: 286 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 287 | finally: 288 | f.close() 289 | 290 | 291 | def generate_minibatch_idx(dataset_size, minibatch_size): 292 | # generate idx for minibatches SGD 293 | # output [m1, m2, m3, ..., mk] where mk is a list of indices 294 | assert dataset_size >= minibatch_size 295 | n_minibatches = dataset_size / minibatch_size 296 | leftover = dataset_size % minibatch_size 297 | idx = range(dataset_size) 298 | if leftover == 0: 299 | minibatch_idx = numpy.split(numpy.asarray(idx), n_minibatches) 300 | else: 301 | print 'uneven minibath chunking, overall %d, last one %d'%(minibatch_size, leftover) 302 | minibatch_idx = numpy.split(numpy.asarray(idx)[:-leftover], n_minibatches) 303 | minibatch_idx = minibatch_idx + [numpy.asarray(idx[-leftover:])] 304 | minibatch_idx = [idx_.tolist() for idx_ in minibatch_idx] 305 | return minibatch_idx 306 | 307 | def get_rab_dataset_base_path(): 308 | return RAB_DATASET_BASE_PATH 309 | 310 | def get_rab_exp_path(): 311 | return RAB_EXP_PATH 312 | 313 | def create_dir_if_not_exist(directory): 314 | if not os.path.exists(directory): 315 | print 'creating directory %s'%directory 316 | os.makedirs(directory) 317 | else: 318 | print "%s already exists!"%directory 319 | 320 | def flatten_list_of_list(l): 321 | # l is a list of list 322 | return [item for sublist in l for item in sublist] 323 | 324 | def load_txt_file(path): 325 | f = open(path,'r') 326 | lines = f.readlines() 327 | f.close() 328 | return lines 329 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | from jobman import DD 2 | import common 3 | 4 | exp_path = common.get_rab_exp_path() 5 | 6 | config = DD({ 7 | 'model': 'attention', 8 | 'random_seed': 1234, 9 | # ERASE everything under save_model_path 10 | 'erase_history': True, 11 | 'attention': DD({ 12 | 'reload_': False, 13 | 'save_model_dir': exp_path + 'arctic-capgen-vid/test_non/', 14 | 'from_dir': '', 15 | 'dataset': 'youtube2text',#'youtube2text',#'lsmdc',mvad. 
'ysvd' 16 | 'video_feature': 'googlenet', 17 | 'dim_word':468, # 474 18 | 'ctx_dim':-1,# auto set 19 | 'dim':3518, # lstm dim # 536 20 | 'n_layers_out':1, # for predicting next word 21 | 'n_layers_init':0, 22 | 'encoder_dim': 300, 23 | 'prev2out':True, 24 | 'ctx2out':True, 25 | 'patience':20, 26 | 'max_epochs':500, 27 | 'decay_c':1e-4, 28 | 'alpha_entropy_r': 0., 29 | 'alpha_c':0.70602, 30 | 'lrate':0.01, 31 | 'selector':True, 32 | 'n_words':20000, 33 | 'maxlen':30, # max length of the descprition 34 | 'optimizer':'adadelta', 35 | 'clip_c': 10., 36 | 'batch_size': 64, # for trees use 25 37 | # 'batch_size': 2, # for trees use 25 38 | 'valid_batch_size':200, 39 | # 'valid_batch_size':2, 40 | # in the unit of minibatches 41 | 'dispFreq':200, 42 | 'validFreq':2000, 43 | 'saveFreq':-1, # this is disabled, now use sampleFreq instead 44 | 'sampleFreq':100, 45 | # blue, meteor, or both 46 | 'metric': 'everything', # set to perplexity on DVS 47 | 'use_dropout':True, 48 | 'K':28, # 26 when compare 49 | 'OutOf':None, # used to be 240, for motionfeature use 26 50 | 'verbose': True, 51 | 'debug': False, 52 | 'dec':'standard', 53 | 'encoder':None, 54 | 'mode':'train', 55 | 'proc':'nostd', 56 | 'data_dir':'', 57 | 'feats_dir':'' 58 | }), 59 | 'iLSTM': DD({ 60 | 'reload_': False, 61 | 'save_model_dir': exp_path + 'attention_mod/', 62 | 'dec':'standard', 63 | 'valid_batch_size':200, 64 | 'dataset': 'youtube2text', 65 | 'encoder': None, 66 | 'max_epochs':500, 67 | 'from_dir': '', 68 | }), 69 | 'attention_mod': DD({ 70 | 'reload_': False, 71 | 'save_model_dir': exp_path + 'attention_mod/', 72 | 'dec':'multi-stdist' 73 | }), 74 | 'mtle': DD({ 75 | 'save_model_dir': exp_path + 'arctic-capgen-vid/test_non/', 76 | 'reload_': False, 77 | 'from_dir': '', 78 | 'dec':'multi-stdist', 79 | 'dim_word':468, # 474 80 | 'encoder':None, 81 | 'encoder_dim': 300, 82 | 'batch_size': 64, #64 for trees use 25 83 | 'valid_batch_size':200, 84 | 'dataset': 'vtt', 85 | 'dim':3518, # lstm dim # 536 86 | 'video_feature': 'googlenet', 87 | 'validFreq': 2000, 88 | 'max_epochs': 500, 89 | 'mode':'train', 90 | 'proc':'nostd', 91 | 'K':28, # 26 when compare 92 | 'lrate':0.0001, 93 | 'data_dir':'', 94 | 'dispFreq':10, 95 | 'feats_dir':'', 96 | 'cost_type':'v1' 97 | }), 98 | 'fcoupled': DD({ 99 | 'save_model_dir': exp_path + 'arctic-capgen-vid/test_non/', 100 | 'reload_': False, 101 | 'dec':'multi-random', 102 | 'encoder':None, 103 | 'encoder_dim': 300, 104 | 'batch_size': 64, # for trees use 25 105 | 'dataset': 'youtube2text', 106 | 'dim':3518, # lstm dim # 536 107 | 'from_dir': '', 108 | 'valid_batch_size':200, 109 | 'max_epochs':500, 110 | 'video_feature': 'googlenet', 111 | }), 112 | 'const': DD({ 113 | 'save_model_dir': exp_path + 'arctic-capgen-vid/test_non/', 114 | 'reload_': False, 115 | 'dec':'multi-random', 116 | 'encoder':None, 117 | 'encoder_dim': 300, 118 | 'batch_size': 64, # for trees use 25 119 | 'dataset': 'youtube2text', 120 | 'dim':3518, # lstm dim # 536 121 | 'from_dir': '', 122 | }), 123 | 'const2': DD({ 124 | 'save_model_dir': exp_path + 'arctic-capgen-vid/test_non/', 125 | 'reload_': False, 126 | 'dec':'multi-random', 127 | 'encoder':None, 128 | 'encoder_dim': 300, 129 | 'batch_size': 64, # for trees use 25 130 | 'dataset': 'youtube2text' 131 | }), 132 | 'LSTM': DD({ 133 | 'reload_': False, 134 | 'save_model_dir': exp_path + 'attention_mod/', 135 | 'dec':'standard', 136 | 'valid_batch_size':200, 137 | 'dataset': 'youtube2text', 138 | 'encoder': 'lstm_uni', 139 | 'max_epochs':500, 140 | 'from_dir': '', 141 | }), 142 | 
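# Configuration for the lstmdd model (see model_lstmdd.py); fields mirror the 'mtle' block above.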
'lstmdd': DD({ 143 | 'save_model_dir': exp_path + 'arctic-capgen-vid/test_non/', 144 | 'reload_': False, 145 | 'from_dir': '', 146 | 'dec':'multi-stdi', 147 | 'dim_word':468, # 474 148 | 'encoder':None, 149 | 'encoder_dim': 300, 150 | 'batch_size': 64, #64 for trees use 25 151 | 'valid_batch_size':200, 152 | 'dataset': 'vtt', 153 | 'dim':3518, # lstm dim # 536 154 | 'video_feature': 'googlenet', 155 | 'validFreq': 2000, 156 | 'max_epochs': 500, 157 | 'mode':'train', 158 | 'proc':'nostd', 159 | 'K':28, # 26 when compare 160 | 'lrate':0.0001, 161 | 'data_dir':'', 162 | 'dispFreq':10, 163 | 'feats_dir':'', 164 | 'cost_type':'v1' 165 | 166 | }), 167 | 'gru': DD({ 168 | 'reload_': False, 169 | 'save_model_dir': exp_path + 'gru_model2/', 170 | 'from_dir': '', 171 | 'dataset': 'youtube2text',#'youtube2text',#'lsmdc',mvad. 'ysvd' 172 | 'video_feature': 'googlenet', 173 | 'dim_word':468, # 474 174 | 'ctx_dim':-1,# auto set 175 | 'dim':3518, # lstm dim # 536 176 | 'n_layers_out':1, # for predicting next word 177 | 'n_layers_init':0, 178 | 'encoder_dim': 300, 179 | 'prev2out':True, 180 | 'ctx2out':True, 181 | 'patience':20, 182 | 'max_epochs':500, 183 | 'decay_c':1e-4, 184 | 'alpha_entropy_r': 0., 185 | 'alpha_c':0.70602, 186 | 'lrate':0.01, 187 | 'selector':True, 188 | 'n_words':20000, 189 | 'maxlen':30, # max length of the descprition 190 | 'optimizer':'adadelta', 191 | 'clip_c': 10., 192 | 'batch_size': 64, # for trees use 25 193 | # 'batch_size': 2, # for trees use 25 194 | 'valid_batch_size':200, 195 | # 'valid_batch_size':2, 196 | # in the unit of minibatches 197 | 'dispFreq':10, 198 | 'validFreq':2000, 199 | 'saveFreq':-1, # this is disabled, now use sampleFreq instead 200 | 'sampleFreq':100, 201 | # blue, meteor, or both 202 | 'metric': 'everything', # set to perplexity on DVS 203 | 'use_dropout':True, 204 | 'K':28, # 26 when compare 205 | 'OutOf':None, # used to be 240, for motionfeature use 26 206 | 'verbose': True, 207 | 'debug': False, 208 | 'dec':'standard', 209 | 'encoder':None, 210 | 'mode':'train', 211 | 'proc':'nostd' 212 | }), 213 | 'fc': DD({ 214 | 'reload_': False, 215 | 'save_model_dir': exp_path + 'attention_mod/', 216 | 'dec':'standard', 217 | 'dataset': 'youtube2text', 218 | 'encoder': None, 219 | 'from_dir': '', 220 | }), 221 | 'ic': DD({ 222 | 'reload_': False, 223 | 'save_model_dir': exp_path + 'attention_mod/', 224 | 'dec':'standard', 225 | 'dataset': 'youtube2text', 226 | 'encoder': None, 227 | 'from_dir': '', 228 | }), 229 | 'const_w': DD({ 230 | 'save_model_dir': exp_path + 'const_w/', 231 | 'reload_': False, 232 | 'dec':'multi-stdist', 233 | 'encoder':None, 234 | 'encoder_dim': 300, 235 | 'batch_size': 64, # for trees use 25 236 | 'dataset': 'youtube2text', 237 | 'video_feature': 'googlenet', 238 | }), 239 | 240 | 241 | }) 242 | -------------------------------------------------------------------------------- /create_movies.py: -------------------------------------------------------------------------------- 1 | __author__ = 'onina' 2 | 3 | import argparse 4 | import json 5 | import os 6 | import pickle 7 | import numpy as np 8 | 9 | # import process_frames 10 | from PIL import Image 11 | from PIL import ImageFont 12 | from PIL import ImageDraw 13 | from multiprocessing import Pool 14 | # import subprocess 15 | 16 | from data.util import mkdirs_safe 17 | 18 | 19 | def resizeImage(new_frame_path): 20 | print 'resizeImage: {}'.format(new_frame_path) 21 | command = 'magick {} -resize 1000x562\\! 
{}'.format(new_frame_path, new_frame_path) 22 | os.system(command) 23 | 24 | 25 | def drawOverlay(image, text): 26 | 27 | print 'drawOverlay: ' + image + ' text: ' + text 28 | img = Image.open(image) 29 | draw = ImageDraw.Draw(img) 30 | text_length = len(text)*10 31 | 32 | w = 1000 33 | y = 440 34 | 35 | font_size = 16 36 | if text_length >= w: 37 | font_size = 12 38 | text_length = len(text) * 7.2 39 | 40 | x = int((w - text_length) / 2) 41 | draw.rectangle((x,y, x+text_length,y+16*1.5), fill=(0,0,0)) 42 | 43 | font = ImageFont.truetype("DejaVuSans-Bold.ttf", font_size) 44 | draw.text((x, y),' '+text+' ',(0,255,0),font=font) 45 | img.save(image) 46 | 47 | 48 | def createVideo(inputdir, output_file, framerate): 49 | print 'createVideo: {} -> {}'.format(inputdir, output_file) 50 | command = 'magick -delay 7x100 -loop 0 ' + inputdir + '/*.jpg -layers OptimizePlus ' + output_file 51 | print 'command: {}'.format(command) 52 | os.system(command) 53 | print 'Finished job for {}.'.format(output_file) 54 | 55 | 56 | def main(args): 57 | # print params 58 | with open(args.rfile) as file: 59 | data = json.load(file) 60 | 61 | needed_files = os.listdir(args.vidpath) 62 | 63 | id_to_file_dict = {} 64 | 65 | if args.dataset != 'other': 66 | real_to_id_dict = pickle.load(open(os.path.join(args.dict_path))) 67 | 68 | # Hack to make lsmdc look like the other json files. 69 | if args.dataset == 'lsmdc16': 70 | data = {'result': data} 71 | print("Cut to {} videos".format(args.test)) 72 | print("Converted LSMDC16 to usable format") 73 | 74 | if args.test: 75 | data = {'result': np.random.choice(data['result'], args.test)} 76 | 77 | for i, desc in enumerate(data['result']): 78 | 79 | id = desc['video_id'] 80 | found_file = False 81 | print("Attempt {}".format(id)) 82 | 83 | for file in needed_files: 84 | if found_file: 85 | continue 86 | 87 | file_id = '.'.join(file.split('.')[:-1]) 88 | if args.dataset != 'other': 89 | if file_id not in real_to_id_dict: 90 | # print("{} not found in mapping dict.".format(file_id)) 91 | continue 92 | 93 | if real_to_id_dict[file_id] == id: 94 | id_to_file_dict[id] = file 95 | found_file = True 96 | elif file_id == id: 97 | id_to_file_dict[id] = file 98 | found_file = True 99 | 100 | if not found_file: 101 | print "Didn't find the file for {}.".format(id) 102 | 103 | getting_frames_jobs = [] 104 | video_create_jobs_after_frame_get = [] 105 | video_create_jobs = [] 106 | 107 | for i, desc in enumerate(data['result']): 108 | 109 | if desc['video_id'] not in id_to_file_dict.keys(): 110 | continue 111 | 112 | if desc['caption'] == '': 113 | continue 114 | 115 | # print result['video_id'] 116 | # print result['caption'] 117 | 118 | processed_vid_path = os.path.join(args.vidpath, id_to_file_dict[desc['video_id']]) 119 | 120 | out_path = os.path.join(args.rpath, 'frames') 121 | if not os.path.exists(out_path): 122 | mkdirs_safe(out_path) 123 | 124 | # out_path/out_frames_path.xxx/-> frames 125 | out_frames_path = os.path.join(out_path, id_to_file_dict[desc['video_id']]) 126 | 127 | framerate = 30 128 | 129 | if not os.path.exists(out_frames_path): 130 | if os.path.isdir(processed_vid_path): 131 | getting_frames_jobs.append((desc, out_frames_path, out_path, processed_vid_path)) 132 | 133 | rvid_path = os.path.join(args.rpath, 'vids') 134 | check_rvid_path(rvid_path) 135 | 136 | video_create_jobs_after_frame_get.append((desc, framerate, out_frames_path, rvid_path)) 137 | else: 138 | print 'No processed video directory found at {}!'.format(processed_vid_path) 139 | else: 140 | print 
"final frames already created" 141 | rvid_path = os.path.join(args.rpath, 'vids') 142 | check_rvid_path(rvid_path) 143 | 144 | video_create_jobs.append((desc, framerate, out_frames_path, rvid_path)) 145 | 146 | 147 | threadPoolWhenFramesNil = Pool() 148 | threadPoolFramesExisted = Pool() 149 | 150 | # Do frame getting jobs 151 | threadPoolWhenFramesNil.map(copy_frames_and_draw_overlay, getting_frames_jobs) 152 | # Also do video create jobs for frames already there 153 | threadPoolFramesExisted.map(prepare_path_and_create_video, video_create_jobs) 154 | 155 | threadPoolWhenFramesNil.close() 156 | threadPoolWhenFramesNil.join() 157 | 158 | threadPoolWhenFramesNil = Pool() 159 | 160 | # Now do video create jobs for previously nil frames 161 | threadPoolWhenFramesNil.map(prepare_path_and_create_video, video_create_jobs_after_frame_get) 162 | 163 | threadPoolWhenFramesNil.close() 164 | threadPoolWhenFramesNil.join() 165 | threadPoolFramesExisted.close() 166 | threadPoolFramesExisted.join() 167 | 168 | 169 | def check_rvid_path(rvid_path): 170 | if not os.path.exists(rvid_path): 171 | mkdirs_safe(rvid_path) 172 | 173 | 174 | def prepare_path_and_create_video((desc, framerate, out_frames_path, rvid_path)): 175 | new_vid_path = os.path.join(rvid_path, str(desc['video_id']) + '.gif') 176 | if not os.path.exists(new_vid_path): 177 | createVideo(out_frames_path, new_vid_path, framerate) 178 | 179 | 180 | def copy_frames_and_draw_overlay((desc, out_frames_path, out_path, processed_vid_path)): 181 | print 'Copying files {} -> {}'.format(processed_vid_path, out_path) 182 | command = "cp -r {} {}".format(processed_vid_path, out_path) 183 | os.system(command) 184 | frames = os.listdir(out_frames_path) 185 | print 'Creating Image Overlay For ' + str(len(frames)) + ' frames' 186 | for frame in frames: 187 | if frame.endswith('.jpg') or frame.endswith('.png'): 188 | new_frame_path = os.path.join(out_frames_path, frame) 189 | print new_frame_path 190 | print desc['caption'] 191 | resizeImage(new_frame_path) 192 | drawOverlay(new_frame_path, desc['caption']) 193 | 194 | 195 | def _validate(args): 196 | if args.dataset == 'msvd' or args.dataset == 'lsmdc16': 197 | if not args.dict_path: 198 | raise ValueError("Was given dataset={} but no annotations path was given.".format(args.dataset)) 199 | if not os.path.exists(args.dict_path): 200 | raise IOError("Was given dataset={} but dict_path={} does not exist.".format(args.dataset, args.dict_path)) 201 | 202 | 203 | if __name__=="__main__": 204 | #Run the script twice, the first time it will extract the frames the second time it will create the vids 205 | parser = argparse.ArgumentParser() 206 | parser.add_argument('-j','--rfile',dest='rfile', type=str, help='json file path with results',default='') 207 | parser.add_argument('-p','--vidpath',dest='vidpath',type=str,help='path where the processed videos reside', default='') 208 | parser.add_argument('-r','--rpath', dest='rpath',type=str, help='path where we will save vids',default='') 209 | parser.add_argument('-d', '--dataset', help="Dataset being processed. Some json files are written differently depending on the dataset.", default='other', choices=['msvd', 'lsmdc16', 'other']) 210 | parser.add_argument('-dp', '--dict_path', help="Path to msvd or lsmdc name mapping pkl file (containing mapping dict) [msdv & lsmdc16 only]", required=False) 211 | parser.add_argument('-t', '--test', type=int, help="Unit test/limit number movies to create to given arg. 
Default=0/OFF", default=0) 212 | parser.add_argument('-s', '--seed', help="Random seed.", default=9) 213 | 214 | args = parser.parse_args() 215 | np.random.seed(int(args.seed)) 216 | 217 | _validate(args) 218 | main(args) 219 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # `vid-desc/data` 2 | 3 | ## What is here? 4 | 5 | Standalone scripts for generating evaluation data from these datasets: 6 | 7 | - MSR-VTT 8 | - M-VAD 9 | - MPII 10 | - LSMDC 2016 (M-VAD + MPII) 11 | - TRECVID 2016 12 | - MSVD (Youtube2text) 13 | - TACoS 14 | 15 | Multi-caption datasets rely on `pickle` generated files to store features, while single-caption datasets do not. 16 | This is mainly an artifact of the dataset sizes. Generating `pkl` files for LSMDC, MPII, and MVAD was usually unwieldy. 17 | 18 | Due to the modular nature, this pipeline can also be used as a general-purpose feature extractor for any videos. 19 | 20 | 21 | ## How are scripts run? 22 | 23 | The general pipeline is this: 24 | 25 | - `download videos to vid_dir` 26 | - `subsect_videos(vid_dir)`* 27 | - `process_frames(subsect_dir)` 28 | - `process_features(frames_dir)` 29 | - `create_dataset(feats_dir)`* 30 | 31 | `*` denotes a process that depends on dataset metadata. Everything else is dataset agnostic. 32 | 33 | Everything except `process_features` uses the Python 2 environment. `process_features` uses Python 3, which is a result of upgrading to PyTorch for feature extraction. 34 | For the time being, you will just need to switch between two conda environments during the pipeline. 35 | 36 | So long as you have a directory with videos in it, the usage of each step should be clear. If you are not trying to re-create the results for some dataset, you can omit `subsect_videos` and `create_dataset`, as those are for specific datasets. 37 | 38 | Here the pipeline will be described in detail following the two paths established in the root `README.md`. 39 | 40 | 41 | ## Tutorial 42 | 43 | #### "I just want to caption a couple of videos" 44 | 45 | 46 | 47 | 48 | 49 | 50 | #### "I want to re-create your results" (MSVD & LSMDC) 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OSUPCVLab/VideoToTextDNN/a840172edf38e0a71d5e8feb130ab8f6c5eb19b6/data/__init__.py -------------------------------------------------------------------------------- /data/create_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset creation helper. Use this to generate command lines for lots of datasets. 3 | """ 4 | import logging 5 | import os 6 | import argparse 7 | 8 | from util import * 9 | 10 | from datetime import datetime 11 | 12 | logging.basicConfig() 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.DEBUG) 15 | 16 | possible_features = ['resnet', 'googlenet', 'nasnetalarge', 'resnet152', 'pnasnet5large', 'densenet121', 'polynet', 'senet154'] 17 | 18 | SEED = 9 19 | 20 | 21 | # When inputting dir paths in dict, make sure trailing `/` is removed.
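# Each entry maps a dataset name to two relative paths: "data_dir" is where the
# generated pkl files are written (joined onto --base_path, and with the feature
# type appended for multi-caption datasets), and "base" is the dataset root that
# holds the features_<type>/ and annotations/ subdirectories.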
22 | dataset_to_meta = { 23 | 'mvad': 24 | {"data_dir": "mvad/pkls", 25 | "base": "mvad/"}, 26 | 'vtt16': 27 | {"data_dir": "vtt/pkls2016", 28 | "base": "vtt"}, 29 | 'vtt17': 30 | {"data_dir": "vtt/pkls2017", 31 | "base": "vtt"}, 32 | 'youtube2text': 33 | {"data_dir": "youtube2text/pkls_yao", 34 | "base": "youtube2text"}, 35 | 'mpii': 36 | {"data_dir": "mpii/full", 37 | "base": "mpii"}, 38 | 'lsmdc16': 39 | {"data_dir": "lsmdc16/pkls16", 40 | "base": "lsmdc16/"}, 41 | 'tacos': 42 | {"data_dir": "TACoS/pkls", 43 | "base": "TACoS/"}, 44 | 'trecvid': 45 | {"data_dir": "trecvid/pkls", 46 | "base": "trecvid"}, 47 | } 48 | 49 | 50 | feats_dir_prefix = "features_" 51 | test_feats_dir_prefix = "features_testing_" 52 | annots_dir_name = "annotations" 53 | 54 | 55 | def create_commands(args, datasets, features): 56 | """ 57 | Create command lines to generate dataset files. 58 | 59 | :param args: 60 | :param datasets: 61 | :param features: 62 | :return: nil 63 | """ 64 | 65 | lines = set() 66 | main_lines = set() 67 | counting = 0 68 | 69 | for ds in datasets: 70 | for ft in features: 71 | if ds == 'mvad' or ds == 'mpii' or ds == 'lsmdc16' or ds == 'tacos': 72 | # single-caption take feats from feats_dir 73 | data_dir = dataset_to_meta[ds]["data_dir"] 74 | else: 75 | # multi caption take feats from pkl dir 76 | data_dir = dataset_to_meta[ds]["data_dir"] + '_' + ft 77 | 78 | if args.test: 79 | data_dir += '_ut{}'.format(args.test) 80 | 81 | data_dir = os.path.join(args.base_path, data_dir) 82 | 83 | base_dir = dataset_to_meta[ds]["base"] 84 | feat_dir = os.path.join(base_dir, feats_dir_prefix + ft) 85 | test_feat_dir = os.path.join(base_dir, test_feats_dir_prefix + ft) 86 | annots_dir = os.path.join(base_dir, annots_dir_name) 87 | 88 | for p in (feat_dir, annots_dir): 89 | if not os.path.isdir(p): 90 | logger.warning("Did not find directory at {}.".format(p)) 91 | 92 | if 'vtt' in ds: 93 | if not os.path.isdir(test_feat_dir): 94 | logger.warning("Did not find directory at {}.".format(test_feat_dir)) 95 | 96 | main_cmd = create_line(args.seed, ds, annots_dir, ft, data_dir, feat_dir, test_feat_dir, args.test, args.skip_thoughts) 97 | 98 | if main_cmd in main_lines: 99 | continue 100 | 101 | main_lines.add(main_cmd) 102 | 103 | lines.add(main_cmd) 104 | 105 | create_command_files(args, lines) 106 | 107 | 108 | def create_command_files(args, lines): 109 | out_txt_path = os.path.join(args.out, 'commands.txt') 110 | with open(out_txt_path, 'w') as f: 111 | for l in lines: 112 | f.write(l) 113 | f.write('\n') 114 | 115 | logger.info("Created list of dataset creation commands at {}".format(out_txt_path)) 116 | 117 | 118 | if __name__ == '__main__': 119 | ap = argparse.ArgumentParser() 120 | 121 | creation_args = ap.add_argument_group("Creation Args") 122 | creation_args.add_argument('-bp', '--base_path', help="Base path to prepend onto the dataset_to_meta dict values defined above", default="") 123 | creation_args.add_argument('-ds', '--dataset_select', help="Select a dataset rather than all.", nargs='+', required=False,default=None, choices=dataset_to_meta.keys()) 124 | creation_args.add_argument('-fs', '--feature_select', help="Select a feature type...", nargs='+', required=False,default=None, choices=possible_features) 125 | creation_args.add_argument('-s', '--seed', help="Random seed", required=False, default=SEED) 126 | creation_args.add_argument('-t', '--test', help="Create unit-test dataset. 
0=Off, otherwise size of unittest dataset, in samples.", default=0) 127 | creation_args.add_argument('-st', '--skip_thoughts', help="Perform skip-thoughts as SDM.", action='store_true', default=False) 128 | 129 | file_args = ap.add_argument_group("FileArgs") 130 | file_args.add_argument("-o", "--out", help="Output file for generated commands from this script.", required=True) 131 | 132 | args = ap.parse_args() 133 | 134 | mkdirs_safe(args.out) 135 | 136 | if args.dataset_select: 137 | if type(args.dataset_select) == list: 138 | datasets = args.dataset_select 139 | else: 140 | datasets = [args.dataset_select] 141 | else: 142 | datasets = dataset_to_meta.keys() 143 | 144 | if args.feature_select: 145 | if type(args.feature_select) == list: 146 | features = args.feature_select 147 | else: 148 | features = [args.feature_select] 149 | else: 150 | features = possible_features 151 | 152 | create_commands(args, datasets, features) 153 | -------------------------------------------------------------------------------- /data/create_msr_vtt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import argparse 4 | import json 5 | import nltk 6 | import logging 7 | 8 | from util import * 9 | 10 | logging.basicConfig() 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.DEBUG) 13 | 14 | SEED = 9 15 | 16 | 17 | def get_annots_vtt(feats_pool, train_val_list_path, test_list_path, test_sens_path, annotations, unittest=0): 18 | with open(train_val_list_path, 'r') as data_file: 19 | train_val_data = json.load(data_file) 20 | 21 | with open(test_list_path, 'r') as data_file: 22 | test_data = json.load(data_file) 23 | 24 | test_sens = None 25 | if test_sens_path: 26 | with open(test_sens_path, 'r') as data_file: 27 | test_sens = json.load(data_file) 28 | 29 | annotations, vids_train, vids_val, all_vids = get_annots_train_val_vtt(feats_pool, train_val_data, annotations, {}, unittest) 30 | annotations, vids_test, all_vids = get_annots_test_vtt(feats_pool, test_data, test_sens, annotations, all_vids, unittest) 31 | 32 | return annotations, vids_train, vids_val, vids_test, all_vids 33 | 34 | 35 | def get_annots_train_val_vtt(feats_pool, train_val_data, annotations, all_vids, unittest=0): 36 | vids_train = [] 37 | vids_val = [] 38 | 39 | logger.info('Retrieving annotations for train-val...') 40 | 41 | videos_getting = [i['video_id'] for i in train_val_data['videos']] 42 | 43 | if unittest: 44 | num_videos = unittest 45 | logger.debug('UNIT TEST: On') 46 | np.random.shuffle(videos_getting) 47 | videos_getting = videos_getting[:num_videos] 48 | 49 | sentences = train_val_data['sentences'] 50 | 51 | for sent in sentences: 52 | vid_name = sent['video_id'] 53 | if vid_name not in videos_getting: 54 | continue 55 | 56 | if vid_name not in feats_pool: 57 | logger.warn("feature was missing for video_id={}".format(vid_name)) 58 | continue 59 | 60 | if vid_name not in all_vids: 61 | all_vids[vid_name] = 1 62 | else: 63 | all_vids[vid_name] += 1 64 | 65 | ocaption = sent['caption'] 66 | ocaption = ocaption.strip().encode('utf-8', 'ignore') 67 | 68 | tokens = nltk.word_tokenize(ocaption) 69 | tokenized = ' '.join(tokens) 70 | tokenized = tokenized.lower() 71 | 72 | if vid_name in annotations: 73 | cap_id = str(len(annotations[vid_name])) 74 | annotations[vid_name].append({'tokenized':tokenized,'image_id':vid_name,'cap_id':cap_id,'caption':ocaption}) 75 | else: 76 | annotations[vid_name] = [] 77 | cap_id = str(0) 78 | 
annotations[vid_name].append({'tokenized':tokenized,'image_id':vid_name,'cap_id':cap_id,'caption':ocaption}) 79 | 80 | vid_and_cap = vid_name + '_' + cap_id 81 | 82 | vid_id = int(vid_name.split('video')[1]) 83 | 84 | gt_id = train_val_data['videos'][vid_id]['id'] 85 | assert gt_id == vid_id, 'Got an ID mis-match: vid_id={}, json_id={}'.format(vid_id, gt_id) 86 | 87 | if train_val_data['videos'][vid_id]['split'] == 'train': 88 | vids_train.append(vid_and_cap) 89 | elif train_val_data['videos'][vid_id]['split'] == 'validate': 90 | vids_val.append(vid_and_cap) 91 | else: 92 | raise ValueError("Video ID {} is not in train or valid split. Correct json file given?".format(vid_id)) 93 | 94 | np.random.shuffle(vids_train) # If we don't shuffle performance deminishes 95 | np.random.shuffle(vids_val) 96 | 97 | return annotations, vids_train, vids_val, all_vids 98 | 99 | 100 | def get_annots_test_vtt(feats_pool, test_list_data, test_sens, annotations, all_vids, unittest=0): 101 | vids_test = [] 102 | 103 | logger.info('Retrieving annotations for test...') 104 | 105 | videos_getting = [i['video_id'] for i in test_list_data['videos']] 106 | 107 | if unittest: 108 | num_videos = unittest 109 | logger.debug( 'UNIT TEST: On') 110 | np.random.shuffle(videos_getting) 111 | videos_getting = videos_getting[:num_videos] 112 | 113 | for vid_name in videos_getting: 114 | if vid_name not in videos_getting: 115 | continue 116 | 117 | if vid_name not in feats_pool: 118 | logger.warn("feature was missing for video_id={}".format(vid_name)) 119 | continue 120 | 121 | if vid_name not in all_vids: 122 | all_vids[vid_name] = 1 123 | else: 124 | all_vids[vid_name] += 1 125 | 126 | if test_sens: 127 | # Use the released test sentences 128 | vid_sens = [s for s in test_sens['sentences'] if s['video_id'] == vid_name] 129 | 130 | for sent in vid_sens: 131 | ocaption = sent['caption'] 132 | ocaption = ocaption.strip().encode('utf-8', 'ignore') 133 | 134 | tokens = nltk.word_tokenize(ocaption) 135 | tokenized = ' '.join(tokens) 136 | tokenized = tokenized.lower() 137 | 138 | if vid_name in annotations: 139 | cap_id = str(len(annotations[vid_name])) 140 | annotations[vid_name].append( 141 | {'tokenized': tokenized, 'image_id': vid_name, 'cap_id': cap_id, 'caption': ocaption}) 142 | else: 143 | annotations[vid_name] = [] 144 | cap_id = str(0) 145 | annotations[vid_name].append( 146 | {'tokenized': tokenized, 'image_id': vid_name, 'cap_id': cap_id, 'caption': ocaption}) 147 | 148 | vid_and_cap = vid_name + '_' + cap_id 149 | vids_test.append(vid_and_cap) 150 | 151 | else: 152 | ocaption = 'no caption' 153 | ocaption = ocaption.strip().encode('utf-8', 'ignore') 154 | 155 | tokens = nltk.word_tokenize(ocaption) 156 | tokenized = ' '.join(tokens) 157 | tokenized = tokenized.lower() 158 | 159 | annotations[vid_name] = [] 160 | cap_id = str(0) 161 | annotations[vid_name].append( 162 | {'tokenized': tokenized, 'image_id': vid_name, 'cap_id': cap_id, 'caption': ocaption}) 163 | 164 | vid_and_cap = vid_name + '_' + cap_id 165 | vids_test.append(vid_and_cap) 166 | 167 | np.random.shuffle(vids_test) 168 | 169 | return annotations, vids_test, all_vids 170 | 171 | 172 | def load_annots_vtt(cap_path): 173 | return load_pkl(cap_path) 174 | 175 | 176 | def get_features_from_dir(vid_frame_folder_names, feats_dir, feats_2017_test_dir, feat_type): 177 | 178 | feats = {} 179 | progress_checking = int(len(vid_frame_folder_names) / 10) 180 | 181 | logger.info("Extracting features...") 182 | 183 | for i, files in enumerate(vid_frame_folder_names): 184 
| ext = '.' + files.split('.')[-1] 185 | feat_filename = files.split('/')[-1].split(ext)[0] 186 | 187 | vid_id = int(files.split('video')[1]) 188 | if vid_id >= 10000: 189 | feat_file_path = os.path.join(feats_2017_test_dir, feat_filename) 190 | else: 191 | feat_file_path = os.path.join(feats_dir, feat_filename) 192 | 193 | if feat_type == 'c3d': 194 | feats[feat_filename]=load_c3d_feat(feat_file_path) 195 | logger.info('features extracted successfuly: ' + feat_file_path) 196 | else: 197 | if os.path.exists(feat_file_path): 198 | feat = np.load(feat_file_path) 199 | feats[feat_filename] = feat 200 | # print('features extracted successfuly: ' + feat_file_path) 201 | else: 202 | logger.info('No features found!: ' + feat_file_path) 203 | 204 | if i % progress_checking == 0: 205 | logger.info("Processed " + str(i) + '/' + str(len(vid_frame_folder_names))) 206 | 207 | return feats 208 | 209 | 210 | def validate(vids_train, vids_val, vids_test): 211 | ntr = len(vids_train) 212 | logger.info("Have {} train samples".format(ntr)) 213 | assert ntr > 0 214 | 215 | nva = len(vids_val) 216 | logger.info("Have {} val samples".format(nva)) 217 | assert nva > 0 218 | 219 | nts = len(vids_test) 220 | logger.info("Have {} test samples.".format(nts)) 221 | assert nts > 0 222 | 223 | tr_s = set(vids_train) 224 | va_s = set(vids_val) 225 | ts_s = set(vids_test) 226 | 227 | inter = tr_s.intersection(va_s) 228 | assert len(inter) == 0, 'Validation contaminated with training data.' 229 | inter = va_s.intersection(ts_s) 230 | assert len(inter) == 0, 'Testing contaminated with validation data.' 231 | inter = tr_s.intersection(ts_s) 232 | assert len(inter) == 0, 'Testing contaminated with training data.' 233 | 234 | 235 | def vtt(params): 236 | pkl_dir = params.pkl_dir 237 | feats_dir = params.feats_dir 238 | feats_testing_dir = params.feats_testing_dir 239 | json_dir = params.json_dir 240 | unittest = params.test 241 | feat_type = params.type 242 | protocol = params.protocol 243 | version = params.version 244 | 245 | annotations = {} 246 | 247 | if not os.path.exists(pkl_dir): 248 | os.mkdir(pkl_dir) 249 | 250 | train_path = os.path.join(pkl_dir,'train.pkl') 251 | valid_path = os.path.join(pkl_dir,'valid.pkl') 252 | test_path = os.path.join(pkl_dir,'test.pkl') 253 | cap_path = os.path.join(pkl_dir,'CAP.pkl') 254 | dict_path = os.path.join(pkl_dir,'worddict.pkl') 255 | 256 | if protocol != '': 257 | filename = 'FEATS_{}_{}.pkl'.format(feat_type, protocol) 258 | else: 259 | filename = 'FEATS_{}.pkl'.format(feat_type) 260 | 261 | feats_path = os.path.join(pkl_dir, filename) 262 | 263 | if os.path.exists(train_path) or os.path.exists(valid_path) or os.path.exists(test_path): 264 | var = raw_input("Pickle files found in [{}]. Do you want to erase them? type: [yes] [no] ".format(pkl_dir)) 265 | 266 | if var == 'yes': 267 | logger.info('Removing old pkls...') 268 | remove_pickle_files(cap_path, dict_path, feats_path, test_path, train_path, valid_path) 269 | 270 | else: 271 | logger.info('Loading previous pickle files and creating new FEATS_ file at path: {}'.format(feats_path)) 272 | if os.path.exists(feats_path): 273 | os.remove(feats_path) 274 | 275 | annotations = load_annots_vtt(cap_path) 276 | 277 | features = get_features_from_dir(annotations.keys(), feats_dir, feats_testing_dir, feat_type) 278 | dump_pkl(features, feats_path) 279 | logger.info('FEAT file created! 
Path: {}'.format(feats_path)) 280 | sys.exit(0) 281 | 282 | vid_feats_dirs = os.listdir(feats_dir) 283 | vid_feats_dirs = sorted(vid_feats_dirs, key=lambda x: float(x.split('video')[-1])) #This is to sort the videos 284 | 285 | vid_testing_feats_dirs = os.listdir(feats_testing_dir) 286 | vid_testing_feats_dirs = sorted(vid_testing_feats_dirs, key=lambda x: float(x.split('video')[-1])) 287 | 288 | feats_pool = vid_feats_dirs + vid_testing_feats_dirs 289 | 290 | test_sens_path = None 291 | 292 | if version == '2016': 293 | test_list_path = os.path.join(json_dir, 'test_videodatainfo_nosen_2016.json') 294 | train_val_list_path = os.path.join(json_dir, 'train_val_videodatainfo.json') 295 | if args.with_sentences: test_sens_path = os.path.join(json_dir, 'videodatainfo_2017.json') 296 | else: 297 | test_list_path = os.path.join(json_dir, 'test_videodatainfo_nosen_2017.json') 298 | train_val_list_path = os.path.join(json_dir, 'videodatainfo_2017.json') 299 | if args.with_sentences: test_sens_path = os.path.join(json_dir, 'test_videodatainfo_2017.json') 300 | 301 | annotations, vids_train, vids_val, vids_test, all_vids = get_annots_vtt(feats_pool, train_val_list_path, 302 | test_list_path, test_sens_path, annotations, unittest) 303 | 304 | logger.info('Validating generated lists...') 305 | validate(vids_train, vids_val, vids_test) 306 | 307 | dump_pkl(vids_test, test_path) 308 | logger.info('test.pkl created') 309 | 310 | dump_pkl(vids_train,train_path) 311 | logger.info('train.pkl created') 312 | 313 | dump_pkl(vids_val,valid_path) 314 | logger.info('valid.pkl created') 315 | 316 | dump_pkl(all_vids.keys(), os.path.join(pkl_dir,'allvids.pkl')) 317 | dump_pkl(annotations, cap_path) 318 | logger.info('CAP.pkl created') 319 | worddict = create_dictionary(annotations,dict_path) 320 | dump_pkl(worddict,dict_path) 321 | logger.info('worddict.pkl created') 322 | 323 | features = get_features_from_dir(annotations.keys(), feats_dir, feats_testing_dir, feat_type) 324 | dump_pkl(features,feats_path) 325 | logger.info('FEAT file created! 
Path: {}'.format(feats_path)) 326 | 327 | if params.do_skip_thoughts: 328 | logger.info("Generating skip-thoughts...") 329 | import create_skip_vectors 330 | class ArgsFaker(): 331 | captions_file = cap_path 332 | output_file = os.path.join(pkl_dir, 'skip_vectors.pkl') 333 | 334 | fake_args = ArgsFaker() 335 | create_skip_vectors.main(fake_args) 336 | 337 | 338 | def remove_pickle_files(cap_path, dict_path, feats_path, test_path, train_path, valid_path): 339 | if os.path.exists(train_path): 340 | os.remove(train_path) 341 | if os.path.exists(valid_path): 342 | os.remove(valid_path) 343 | if os.path.exists(test_path): 344 | os.remove(test_path) 345 | if os.path.exists(cap_path): 346 | os.remove(cap_path) 347 | if os.path.exists(dict_path): 348 | os.remove(dict_path) 349 | if os.path.exists(feats_path): 350 | os.remove(feats_path) 351 | # if os.path.exists('allvids.pkl'): 352 | # os.remove('allvids.pkl') 353 | 354 | 355 | def _validate(args): 356 | if args.version == '2016' and args.with_sentences: 357 | logger.info("2016 version test sentences were made available in 2017 dataset.") 358 | sens_path = os.path.join(args.json_dir, "videodatainfo_2017.json") 359 | if os.path.exists(sens_path): 360 | logger.info("Found ground truth captions for 2016 test sentences") 361 | else: 362 | logger.critical("Did not find ground truth captions for 2016 test sentences: {}".format(sens_path)) 363 | sys.exit(1) 364 | 365 | if args.type not in args.feats_dir or args.type not in args.feats_testing_dir: 366 | logger.critical("Requested feature type {}, but directories are something else:\tfeats_dir={}\tfeats_testing_dir={}".format(args.type, args.feats_dir, args.feats_testing_dir)) 367 | sys.exit(1) 368 | 369 | 370 | if __name__=='__main__': 371 | arg_parser = argparse.ArgumentParser() 372 | 373 | creation_args = arg_parser.add_argument_group("CreationArgs") 374 | creation_args.add_argument('-s', '--seed', type=int, help="Random seed.", default=SEED, required=False) 375 | creation_args.add_argument('-f','--feats_dir',dest ='feats_dir',type=str, required=True) 376 | creation_args.add_argument('-ft', '--feats_testing_dir', dest='feats_testing_dir', type=str, required=True) 377 | creation_args.add_argument('-j','--json_dir',dest ='json_dir',type=str, required=True) 378 | creation_args.add_argument('-p','--pkl_dir',dest ='pkl_dir',type=str, required=True) 379 | creation_args.add_argument('-type','--type',dest ='type',type=str, required=True) 380 | creation_args.add_argument('-t', '--test', dest='test', type=int, default=0, 381 | help='perform small unit test. 
If value 0 not unit test if greater than 0 gets a dataset with that numbers of videos') 382 | creation_args.add_argument('-proc', '--protocol', dest='protocol', type=str, default='') 383 | creation_args.add_argument('-st', '--do_skip_thoughts', dest='do_skip_thoughts', action='store_true', default=False) 384 | 385 | vtt_args = arg_parser.add_argument_group("VTTArgs") 386 | vtt_args.add_argument('-v', '--version', dest='version', type=str, default='2016', help="Which MSR-VTT version to create.", choices=['2016', '2017']) 387 | vtt_args.add_argument('-ws', '--with_sentences', dest='with_sentences', default=False, action='store_true', help='Use the available test set sentences.') 388 | 389 | args = arg_parser.parse_args() 390 | 391 | np.random.seed(args.seed) 392 | 393 | if not len(sys.argv) > 1: 394 | print(arg_parser.print_help()) 395 | sys.exit(0) 396 | 397 | _validate(args) 398 | 399 | vtt(args) 400 | 401 | 402 | 403 | -------------------------------------------------------------------------------- /data/create_mvad_mpii_lsmdc.py: -------------------------------------------------------------------------------- 1 | __author__ = 'oliver' 2 | 3 | import argparse 4 | import nltk 5 | import shutil 6 | import numpy as np 7 | import logging 8 | from util import * 9 | 10 | import collections 11 | from collections import OrderedDict 12 | 13 | logging.basicConfig() 14 | logger = logging.getLogger(__name__) 15 | logger.setLevel(logging.DEBUG) 16 | 17 | SEED = 9 18 | 19 | 20 | def get_annots_lsmdc(filename, annotations, num_test): 21 | vids_names = {} 22 | 23 | with open(filename) as csvfile: 24 | rows = csvfile.readlines() 25 | for row in rows[:num_test]: 26 | row = row.split('\t') 27 | vid_name = row[0] 28 | 29 | if len(row) > 5: 30 | ocaption = row[5] 31 | 32 | ocaption = ocaption.replace('\n', '') 33 | udata = ocaption.decode("utf-8") 34 | caption = udata.encode("ascii", "ignore") 35 | 36 | tokens = nltk.word_tokenize(caption) 37 | tokenized = ' '.join(tokens) 38 | tokenized = tokenized.lower() 39 | 40 | if vids_names.has_key(vid_name): 41 | vids_names[vid_name] += 1 42 | logger.info('other annots') 43 | else: 44 | feat_path = '/PATH/TO/lsmdc16/videos/' + vid_name + '.avi' 45 | if not os.path.exists(feat_path): 46 | logger.warning('video not found ' + feat_path) 47 | vids_names[vid_name] = 1 48 | 49 | annotations[vid_name] = [ 50 | {'tokenized': tokenized, 'image_id': vid_name, 'cap_id': vids_names[vid_name], 'caption': ocaption}] 51 | 52 | return annotations, vids_names 53 | 54 | 55 | def get_blind_lsmdc(filename, num_test): 56 | vids_names = OrderedDict() 57 | # annotations = OrderedDict() 58 | 59 | with open(filename) as csvfile: 60 | rows = csvfile.readlines() 61 | for row in rows: 62 | row = row.split('\t') 63 | vid_name = row[0] 64 | 65 | if vids_names.has_key(vid_name): 66 | vids_names[vid_name] += 1 67 | logger.info('other annots') 68 | else: 69 | # feat_path = '/media/onina/sea2/datasets/lsmdc/features_chal/'+vid_name 70 | # if not os.path.exists(feat_path): 71 | # print 'features not found '+feat_path 72 | vids_names[vid_name] = 1 73 | 74 | # annotations[vid_name]=[{'tokenized':tokenized,'image_id':vid_name,'cap_id':1,'caption':''}] 75 | 76 | return vids_names 77 | 78 | 79 | def get_annots_mvad(rows, annots_corpus, annotations, feats_dir): 80 | vids_names = {} 81 | 82 | for i, row in enumerate(rows): 83 | 84 | # row = row.split('\t') 85 | vid_name = row.split('/')[-1].split('.')[0] 86 | caption = annots_corpus[i] 87 | caption = caption.replace('\n', '') 88 | 89 | udata = 
caption.decode("utf-8") 90 | caption = udata.encode("ascii", "ignore") 91 | 92 | tokens = nltk.word_tokenize(caption) 93 | tokenized = ' '.join(tokens) 94 | tokenized = tokenized.lower() 95 | 96 | if vids_names.has_key(vid_name): 97 | vids_names[vid_name] += 1 98 | logger.info('other annots, there should be only 1') 99 | # sys.exit(0) 100 | else: 101 | vids_names[vid_name] = 1 102 | 103 | annotations[vid_name] = [ 104 | {'tokenized': tokenized, 'image_id': vid_name, 'cap_id': vids_names[vid_name], 'caption': caption}] 105 | 106 | return annotations, vids_names 107 | 108 | 109 | def create_dictionary(annotations, pkl_dir): 110 | worddict = collections.OrderedDict() 111 | word_idx = 2 112 | for a in annotations: 113 | caps = annotations[a] 114 | 115 | for cap in caps: 116 | tokens = cap['tokenized'].split() 117 | for token in tokens: 118 | if token not in ['', '\t', '\n', ' ']: 119 | if not worddict.has_key(token): 120 | worddict[token] = word_idx 121 | word_idx += 1 122 | 123 | return worddict 124 | 125 | 126 | def lsmdc16(args): 127 | data_dir = args.data_dir 128 | pkl_dir = args.pkl_dir 129 | 130 | num_train, num_valid, num_test, num_blind = 9999999999, 9999999999, 9999999999, 9999999999 131 | 132 | test_mode = int(args.unit_test) 133 | 134 | train_list_path = 'LSMDC16_annos_training.csv' 135 | valid_list_path = 'LSMDC16_annos_val.csv' 136 | test_list_path = 'LSMDC16_annos_test.csv' 137 | btest_list_path = 'LSMDC16_annos_blindtest.csv' 138 | 139 | if test_mode: 140 | num_train = int(0.50 * test_mode) 141 | num_test = int(0.15 * test_mode) 142 | num_valid = int(0.25 * test_mode) 143 | num_blind = test_mode - (num_test + num_train + num_valid) 144 | 145 | annotations = {} 146 | 147 | if not os.path.exists(pkl_dir): 148 | os.mkdir(pkl_dir) 149 | 150 | all_vids = [] 151 | 152 | train_path = os.path.join(pkl_dir, 'train.pkl') 153 | if not os.path.exists(train_path): 154 | train_file = os.path.join(data_dir, train_list_path) 155 | logger.info(train_file) 156 | annotations, vids_names = get_annots_lsmdc(train_file, annotations, num_train) 157 | training_list = vids_names.keys() 158 | dump_pkl(training_list, train_path) 159 | else: 160 | training_list = load_pkl(train_path) 161 | 162 | all_vids = all_vids + training_list 163 | 164 | valid_path = os.path.join(pkl_dir, 'valid.pkl') 165 | if not os.path.exists(valid_path): 166 | valid_file = os.path.join(data_dir, valid_list_path) 167 | annotations, vids_names = get_annots_lsmdc(valid_file, annotations, num_valid) 168 | valid_list = vids_names.keys() 169 | dump_pkl(valid_list, valid_path) 170 | else: 171 | valid_list = load_pkl(valid_path) 172 | 173 | all_vids = all_vids + valid_list 174 | 175 | test_path = os.path.join(pkl_dir, 'test.pkl') 176 | if not os.path.exists(test_path): 177 | test_file = os.path.join(data_dir, test_list_path) 178 | annotations, vids_names = get_annots_lsmdc(test_file, annotations, num_test) 179 | test_list = vids_names.keys() 180 | dump_pkl(test_list, test_path) 181 | else: 182 | test_list = load_pkl(test_path) 183 | 184 | all_vids = all_vids + test_list 185 | 186 | cap_path = os.path.join(pkl_dir, 'CAP.pkl') 187 | if not os.path.exists(cap_path): 188 | dump_pkl(annotations, cap_path) 189 | 190 | dict_path = os.path.join(pkl_dir, 'worddict.pkl') 191 | if not os.path.exists(dict_path): 192 | worddict = create_dictionary(annotations, dict_path) 193 | dump_pkl(worddict, dict_path) 194 | 195 | btest_path = os.path.join(pkl_dir, 'blindtest.pkl') 196 | if not os.path.exists(btest_path): 197 | btest_file = os.path.join(data_dir, 
btest_list_path) 198 | vids_names = get_blind_lsmdc(btest_file, num_blind) 199 | btest_list = vids_names.keys() 200 | dump_pkl(btest_list, btest_path) 201 | else: 202 | btest_list = load_pkl(btest_path) 203 | 204 | logger.info('done creating dataset') 205 | 206 | 207 | def mpii(params): 208 | data_dir = params.data_dir 209 | pkl_dir = params.pkl_dir 210 | testing = params.unit_test 211 | local_dir = params.local_dir 212 | feats_dir = params.feats_dir 213 | 214 | f = open(os.path.join(data_dir, 'lists', 'downloadLinksAvi.txt'), 'rb') 215 | files = f.readlines() 216 | f.close() 217 | f = open(os.path.join(data_dir, 'lists', 'annotations-someone.csv'), 'rb') 218 | annots = f.readlines() 219 | f.close() 220 | f = open(os.path.join(data_dir, 'lists', 'dataSplit.txt'), 'rb') 221 | splits_file = f.readlines() 222 | splits = {} 223 | 224 | annotations = {} 225 | train_clip_names = [] 226 | valid_clip_names = [] 227 | test_clip_names = [] 228 | 229 | if testing: 230 | tuples = [(f, a) for f, a in zip(files, annots)] 231 | np.random.shuffle(tuples) 232 | tuples = tuples[:testing] 233 | files = [a[0] for a in tuples] 234 | annots = [b[1] for b in tuples] 235 | 236 | train_path = os.path.join(pkl_dir, 'train.pkl') 237 | if not os.path.exists(train_path): 238 | for line in splits_file: 239 | film_name = line.split('\t')[0] 240 | split = line.split('\t')[1] 241 | splits[film_name] = split.replace('\r\n', '') 242 | 243 | for i, file in enumerate(files): 244 | parts = file.split('/') 245 | 246 | film_name = parts[6] 247 | clip_name = parts[7].replace('\n', '') 248 | clip_name = clip_name.split('.avi')[0] 249 | caption = annots[i].split('\t')[1] 250 | caption = caption.replace('\n', '') 251 | 252 | udata = caption.decode("utf-8") 253 | caption = udata.encode("ascii", "ignore") 254 | 255 | tokens = nltk.word_tokenize(caption) 256 | tokenized = ' '.join(tokens) 257 | tokenized = tokenized.lower() 258 | 259 | annotations[clip_name] = [{'tokenized': tokenized, 'image_id': clip_name, 'cap_id': 1, 'caption': caption}] 260 | 261 | if splits[film_name] == 'training': 262 | train_clip_names.append(clip_name) 263 | elif splits[film_name] == 'validation': 264 | valid_clip_names.append(clip_name) 265 | elif splits[film_name] == 'test': 266 | test_clip_names.append(clip_name) 267 | 268 | if not os.path.exists(pkl_dir): 269 | os.mkdir(pkl_dir) 270 | 271 | all_vids = [] 272 | 273 | train_path = os.path.join(pkl_dir, 'train.pkl') 274 | if not os.path.exists(train_path): 275 | dump_pkl(train_clip_names, train_path) 276 | else: 277 | train_clip_names = load_pkl(train_path) 278 | 279 | all_vids = all_vids + train_clip_names 280 | 281 | valid_path = os.path.join(pkl_dir, 'valid.pkl') 282 | if not os.path.exists(valid_path): 283 | dump_pkl(valid_clip_names, valid_path) 284 | else: 285 | valid_clip_names = load_pkl(valid_path) 286 | 287 | all_vids = all_vids + valid_clip_names 288 | 289 | test_path = os.path.join(pkl_dir, 'test.pkl') 290 | if not os.path.exists(test_path): 291 | dump_pkl(test_clip_names, test_path) 292 | else: 293 | test_clip_names = load_pkl(test_path) 294 | 295 | all_vids = all_vids + test_clip_names 296 | 297 | cap_path = os.path.join(pkl_dir, 'CAP.pkl') 298 | if not os.path.exists(cap_path): 299 | dump_pkl(annotations, cap_path) 300 | 301 | dict_path = os.path.join(pkl_dir, 'worddict.pkl') 302 | if not os.path.exists(dict_path): 303 | worddict = create_dictionary(annotations, dict_path) 304 | dump_pkl(worddict, dict_path) 305 | 306 | if testing and local_dir: 307 | logger.info("Copying required 
features...") 308 | if not os.path.isdir(local_dir): 309 | os.makedirs(local_dir) 310 | 311 | for vid_name in all_vids: 312 | ft_path = os.path.join(feats_dir, vid_name) 313 | local_ft_path = os.path.join(local_dir, vid_name) 314 | shutil.copy2(ft_path, local_ft_path) 315 | 316 | logger.info('done creating dataset') 317 | 318 | 319 | def mvad(params): 320 | feats_dir = params.feats_dir 321 | data_dir = params.data_dir 322 | pkl_dir = params.pkl_dir 323 | 324 | testing = params.unit_test 325 | local_dir = params.local_dir 326 | 327 | annotations = {} 328 | 329 | if not os.path.exists(pkl_dir): 330 | os.mkdir(pkl_dir) 331 | 332 | all_vids = [] 333 | 334 | s_paths = [os.path.join(pkl_dir, 'train.pkl'), 335 | os.path.join(pkl_dir, 'valid.pkl'), 336 | os.path.join(pkl_dir, 'test.pkl') 337 | ] 338 | l_paths = [os.path.join(data_dir, 'lists/TrainList.txt'), 339 | os.path.join(data_dir, 'lists/ValidList.txt'), 340 | os.path.join(data_dir, 'lists/TestList.txt') 341 | ] 342 | c_paths = [os.path.join(data_dir, 'lists/TrainCorpus.txt'), 343 | os.path.join(data_dir, 'lists/ValidCorpus.txt'), 344 | os.path.join(data_dir, 'lists/TestCorpus.txt') 345 | ] 346 | 347 | for i, s_path in enumerate(s_paths): 348 | if not os.path.exists(s_path): 349 | _rows = open(l_paths[i], 'rw').readlines() 350 | _corpus = open(c_paths[i], 'rw').readlines() 351 | 352 | if testing: 353 | _pairs = [(r, c) for r, c in zip(_rows, _corpus)] 354 | np.random.shuffle(_pairs) 355 | num = int(testing * params.split[i]) 356 | _rows = [p[0] for p in _pairs[:num]] 357 | _corpus = [p[1] for p in _pairs[:num]] 358 | 359 | annotations, vids_names = get_annots_mvad(_rows, _corpus, annotations, feats_dir) 360 | _list = vids_names.keys() 361 | dump_pkl(_list, s_path) 362 | else: 363 | _list = load_pkl(s_path) 364 | 365 | all_vids = all_vids + _list 366 | 367 | cap_path = os.path.join(pkl_dir, 'CAP.pkl') 368 | if not os.path.exists(cap_path): 369 | dump_pkl(annotations, cap_path) 370 | 371 | dict_path = os.path.join(pkl_dir, 'worddict.pkl') 372 | if not os.path.exists(dict_path): 373 | worddict = create_dictionary(annotations, dict_path) 374 | dump_pkl(worddict, dict_path) 375 | 376 | if testing and local_dir: 377 | logger.info("Copying required features...") 378 | if not os.path.isdir(local_dir): 379 | os.makedirs(local_dir) 380 | 381 | for vid_name in all_vids: 382 | ft_path = os.path.join(feats_dir, vid_name) 383 | local_ft_path = os.path.join(local_dir, vid_name) 384 | shutil.copy2(ft_path, local_ft_path) 385 | 386 | logger.info('done creating dataset') 387 | 388 | 389 | def get_human_annotations(data_dir): 390 | hannot_path = os.path.join(data_dir, 'human_annotations', 'HumanCaps.csv') 391 | import csv 392 | 393 | hannot = {} 394 | with open(hannot_path, 'rb') as csvfile: 395 | spamreader = csv.reader(csvfile, delimiter=',', quotechar='\"') 396 | for row in spamreader: 397 | logger.info(', '.join(row)) 398 | hannot[row[0]] = row[1] 399 | return hannot 400 | 401 | 402 | def tokenize_cap(caption): 403 | udata = caption.decode("utf-8") 404 | caption = udata.encode("ascii", "ignore") 405 | 406 | tokens = nltk.word_tokenize(caption) 407 | tokenized = ' '.join(tokens) 408 | tokenized = tokenized.lower() 409 | return tokenized 410 | 411 | 412 | if __name__ == '__main__': 413 | 414 | parser = argparse.ArgumentParser() 415 | creation_args = parser.add_argument_group("CreationArgs") 416 | creation_args.add_argument('-s', '--seed', type=int, help="Random seed.", default=SEED, required=False) 417 | creation_args.add_argument('-d', '--data_dir', 
dest='data_dir', help='Example: /path/to/dataset/annotations', 418 | required=True) 419 | creation_args.add_argument('-p', '--pkl_dir', dest='pkl_dir', help='Example: /path/to/dataset/pkls', required=True) 420 | creation_args.add_argument('-dbname', '--dbname', dest='dbname', help='Dataset type.', required=True, 421 | choices=['mvad', 'mpii', 'lsmdc16']) 422 | creation_args.add_argument('-st', '--do_skip_thoughts', dest='do_skip_thoughts', action='store_true', default=False) 423 | 424 | ut_args = parser.add_argument_group("UnitTestArgs") 425 | ut_args.add_argument('-t', '--unit_test', dest='unit_test', type=int, default=0, 426 | help='Perform small test. Takes number of samples in unit test dataset.') 427 | ut_args.add_argument('-l', '--local_dir', dest='local_dir', help="Where to copy unit_test features.", default=None) 428 | ut_args.add_argument('-sp', '--split', dest='split', nargs='+', 429 | help='Space delimited [train val test] Data split to use in unit test dataset', 430 | default=[0.50, 0.25, 0.25], type=float) 431 | ut_args.add_argument('-feat', '--feats_dir', dest='feats_dir', help='Example: /path/to/dataset/features_googlenet', 432 | required=False) 433 | 434 | args = parser.parse_args() 435 | 436 | if not len(sys.argv) > 1: 437 | parser.print_help() 438 | sys.exit(0) 439 | 440 | np.random.seed(args.seed) 441 | 442 | if not args.feats_dir: 443 | if args.local_dir: 444 | logger.critical( 445 | "You must provide an argument for --feats_dir to create a local copy of features (--local_dir)") 446 | sys.exit(1) 447 | 448 | if args.dbname == 'mvad': 449 | mvad(args) 450 | if args.dbname == 'mpii': 451 | mpii(args) 452 | if args.dbname == 'lsmdc16': 453 | lsmdc16(args) 454 | -------------------------------------------------------------------------------- /data/create_skip_vectors.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import util 4 | sys.path.append('skip-thoughts') 5 | import skipthoughts 6 | 7 | 8 | def main(params): 9 | captions_file = params.captions_file 10 | output_file = params.output_file 11 | 12 | vids = util.load_pkl(captions_file) 13 | st_model = skipthoughts.load_model() 14 | 15 | skip_vectors = {} 16 | for vid in vids.keys(): 17 | 18 | caps = vids[vid] 19 | num_caps = len(caps) 20 | 21 | raw_caps = [ '' for x in range(num_caps)] 22 | 23 | for cap in caps: 24 | raw_caps[int(cap['cap_id'])]=cap['tokenized'] 25 | 26 | vector = skipthoughts.encode(st_model, raw_caps, verbose=False) 27 | 28 | skip_vectors[vid] = vector 29 | 30 | util.dump_pkl(skip_vectors, output_file) 31 | 32 | 33 | if __name__=='__main__': 34 | arg_parser = argparse.ArgumentParser() 35 | 36 | arg_parser.add_argument('-i','--input',dest ='captions_file',type=str, required=True) 37 | arg_parser.add_argument('-o','--output',dest ='output_file',type=str, required=True, help="/path/to/dataset/skip_vectors.pkl") 38 | 39 | args = arg_parser.parse_args() 40 | 41 | main(args) 42 | -------------------------------------------------------------------------------- /data/create_tacos.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import nltk 3 | 4 | from util import * 5 | 6 | 7 | SEED = 9 8 | 9 | 10 | def get_annots_tacos(vid_feat_files, id_to_cap_dict, unittest, splits): 11 | vids_train = [] 12 | vids_val = [] 13 | vids_test = [] 14 | all_vids = {} 15 | annotations = {} 16 | 17 | print 'Retrieving annotations...' 
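    # Split sizes below are derived from the comma-separated `splits` ratios and the
    # clips are assigned greedily: the first num_train clips encountered go to train,
    # the next num_valid to valid, and the remainder to test, before each list is
    # shuffled. Illustrative example: splits='0.61,0.05,0.34' over 100 clips gives
    # 61 train / 5 valid / 34 test.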
18 | if unittest: 19 | print 'UNIT TEST: On' 20 | n = unittest 21 | else: 22 | n = len(id_to_cap_dict) 23 | 24 | # We are going to create the valid and test datasets ourselves. 25 | train_split, valid_split, test_split = splits.split(',') 26 | 27 | n_as_float = float(n) 28 | 29 | num_train = int(n_as_float * float(train_split)) 30 | num_valid = int(n_as_float * float(valid_split)) 31 | num_test = int(n - (num_valid + num_train)) 32 | assert n == num_train + num_valid + num_test 33 | 34 | count_train = 0 35 | count_valid = 0 36 | count_test = 0 37 | 38 | for enum, vid_id in enumerate(vid_feat_files): 39 | if unittest and enum > unittest: 40 | break 41 | 42 | cap = id_to_cap_dict[vid_id] 43 | 44 | if vid_id not in all_vids: 45 | all_vids[vid_id] = 1 46 | else: 47 | all_vids[vid_id] += 1 48 | 49 | ocaption = cap 50 | ocaption = ocaption.replace('\n', '') 51 | ocaption = ocaption.strip() 52 | 53 | udata = ocaption.decode("utf-8", "ignore") 54 | ocaption = udata.encode("ascii", "ignore") 55 | 56 | tokens = nltk.word_tokenize(ocaption.replace('.', '')) 57 | 58 | if len(tokens) == 0: 59 | continue 60 | 61 | tokenized = ' '.join(tokens) 62 | tokenized = tokenized.lower() 63 | 64 | if annotations.has_key(vid_id): 65 | cap_id = str(len(annotations[vid_id])) 66 | annotations[vid_id].append({'tokenized': tokenized, 'image_id': vid_id, 'cap_id': cap_id, 'caption': ocaption}) 67 | else: 68 | annotations[vid_id]= [] 69 | cap_id = str(0) 70 | annotations[vid_id].append({'tokenized': tokenized, 'image_id': vid_id, 'cap_id': cap_id, 'caption': ocaption}) 71 | 72 | if count_train < num_train: 73 | vids_train.append(vid_id) 74 | count_train += 1 75 | elif count_valid < num_valid: 76 | vids_val.append(vid_id) 77 | count_valid += 1 78 | elif count_test < num_test: 79 | vids_test.append(vid_id) 80 | count_test += 1 81 | 82 | np.random.shuffle(vids_train) 83 | np.random.shuffle(vids_val) 84 | np.random.shuffle(vids_test) 85 | 86 | return annotations, vids_train, vids_val, vids_test, all_vids 87 | 88 | 89 | def build_ground_truth_dict(gt_dir): 90 | csv_file = open(os.path.join(gt_dir, 'index.tsv'), 'r') 91 | 92 | id_to_cap_dict = {} 93 | for line in csv_file: 94 | groups = line.replace('\n', '').split('\t') 95 | dest_vid = groups[0] 96 | sentence = groups[1] 97 | 98 | # vidID -> sentence 99 | id_to_cap_dict[dest_vid] = sentence 100 | 101 | return id_to_cap_dict 102 | 103 | 104 | def tacos(params): 105 | pkl_dir = params.pkl_dir 106 | feats_dir = params.feats_dir 107 | gt_dir = params.gt_dir 108 | unittest = params.test 109 | splits = params.splits 110 | 111 | if not os.path.exists(pkl_dir): 112 | os.mkdir(pkl_dir) 113 | 114 | train_path = os.path.join(pkl_dir, 'train.pkl') 115 | valid_path = os.path.join(pkl_dir, 'valid.pkl') 116 | test_path = os.path.join(pkl_dir, 'test.pkl') 117 | cap_path = os.path.join(pkl_dir, 'CAP.pkl') 118 | dict_path = os.path.join(pkl_dir, 'worddict.pkl') 119 | 120 | id_to_cap_dict = build_ground_truth_dict(gt_dir) 121 | vid_feat_files = os.listdir(feats_dir) 122 | 123 | annotations, vids_train, vids_val, vids_test, all_vids = get_annots_tacos(vid_feat_files, id_to_cap_dict, unittest, splits) 124 | 125 | dump_pkl(vids_train, train_path) 126 | print('train.pkl created') 127 | dump_pkl(vids_val, valid_path) 128 | print('valid.pkl created') 129 | dump_pkl(vids_test, test_path) 130 | print('test.pkl created') 131 | 132 | dump_pkl(all_vids.keys(), os.path.join(pkl_dir, 'allvids.pkl')) 133 | dump_pkl(annotations, cap_path) 134 | print('CAP.pkl created') 135 | 136 | worddict = 
create_dictionary(annotations, dict_path) 137 | dump_pkl(worddict, dict_path) 138 | print('worddict.pkl created') 139 | 140 | 141 | if __name__=='__main__': 142 | arg_parser = argparse.ArgumentParser() 143 | 144 | arg_parser.add_argument('-f', '--feats_dir', dest='feats_dir', type=str, default='') 145 | arg_parser.add_argument('-gt','--gt_dir',dest ='gt_dir',type=str, default='') 146 | arg_parser.add_argument('-p','--pkl_dir',dest ='pkl_dir',type=str, default='') 147 | arg_parser.add_argument('-t','--test',dest = 'test', type=int, default=0, 148 | help='perform small unit test. If value 0 not unit test if greater than 0 gets a dataset with that numbers of videos') 149 | arg_parser.add_argument('-sp', '--splits', dest='splits', type=str, default='0.61,0.05,0.34', 150 | help='Create validation and test datasets. Usage: floats delimited by commas, ' 151 | 'of the form Tr,Val. ex: {-s 0.60,0.20,0.20}. Default: 0.61,0.05,0.34') 152 | arg_parser.add_argument('-s', '--seed', type=int, help="Random seed.", default=SEED, required=False) 153 | arg_parser.add_argument('-st', '--do_skip_thoughts', dest='do_skip_thoughts', action='store_true', default=False) 154 | 155 | args = arg_parser.parse_args() 156 | 157 | np.random.seed(args.seed) 158 | 159 | if not len(sys.argv) > 1: 160 | print arg_parser.print_help() 161 | sys.exit(0) 162 | 163 | np.random.seed(args.seed) 164 | tacos(args) 165 | -------------------------------------------------------------------------------- /data/create_trecvid.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import nltk 4 | import sys 5 | import numpy as np 6 | 7 | from util import * 8 | 9 | SEED = 9 10 | 11 | 12 | def get_annots_trecvid(vid_feat_files, id_to_A_B_cap_dict, unittest, splits): 13 | vids_train = [] 14 | vids_val = [] 15 | vids_test = [] 16 | all_vids = {} 17 | annotations = {} 18 | 19 | print 'Retrieving annotations...' 20 | if unittest: 21 | print 'UNIT TEST: On' 22 | id_to_A_B_cap_dict = {i: id_to_A_B_cap_dict[i] for enum, i in enumerate(id_to_A_B_cap_dict) if enum < unittest} 23 | 24 | n = len(id_to_A_B_cap_dict) 25 | 26 | # We are going to create the valid and test datasets ourselves. 
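    # The released ground truth read by build_ground_truth_dict below is a mapping
    # file (vtt.gt) plus two caption files, A and B, for a single testing set, so
    # this script carves its own train/valid/test partitions out of that set using
    # the comma-separated `splits` ratios, filled in order and shuffled afterwards.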
27 | train_split, valid_split, test_split = splits.split(',') 28 | 29 | n_as_float = float(n) 30 | 31 | num_train = int(n_as_float * float(train_split)) 32 | num_valid = int(n_as_float * float(valid_split)) 33 | num_test = int(n - (num_valid + num_train)) 34 | assert n == num_train + num_valid + num_test 35 | 36 | count_train = 0 37 | count_valid = 0 38 | count_test = 0 39 | 40 | for vid_id in vid_feat_files: 41 | if vid_id not in id_to_A_B_cap_dict: 42 | continue 43 | 44 | for enum, cap in enumerate(id_to_A_B_cap_dict[vid_id]): 45 | if not all_vids.has_key(vid_id): 46 | all_vids[vid_id] = 1 47 | else: 48 | all_vids[vid_id] += 1 49 | 50 | ocaption = cap 51 | ocaption = ocaption.replace('\n', '') 52 | ocaption = ocaption.strip() 53 | 54 | udata = ocaption.decode("utf-8", "ignore") 55 | ocaption = udata.encode("ascii", "ignore") 56 | 57 | tokens = nltk.word_tokenize(ocaption.replace('.', '')) 58 | 59 | if len(tokens) == 0: 60 | continue 61 | 62 | tokenized = ' '.join(tokens) 63 | tokenized = tokenized.lower() 64 | 65 | if annotations.has_key(vid_id): 66 | annotations[vid_id].append({'tokenized': tokenized, 'image_id': vid_id, 'cap_id': str(enum), 'caption': ocaption}) 67 | else: 68 | annotations[vid_id]= [] 69 | annotations[vid_id].append({'tokenized': tokenized, 'image_id': vid_id, 'cap_id': str(enum), 'caption': ocaption}) 70 | 71 | if count_train < num_train: 72 | vids_train.extend([vid_id + '_' + str(enum) for enum, i in enumerate(annotations[vid_id])]) 73 | count_train += 1 74 | elif count_valid < num_valid: 75 | vids_val.extend([vid_id + '_' + str(enum) for enum, i in enumerate(annotations[vid_id])]) 76 | count_valid += 1 77 | elif count_test < num_test: 78 | vids_test.extend([vid_id + '_' + str(enum) for enum, i in enumerate(annotations[vid_id])]) 79 | count_test += 1 80 | 81 | np.random.shuffle(vids_train) 82 | np.random.shuffle(vids_val) 83 | np.random.shuffle(vids_test) 84 | 85 | return annotations, vids_train, vids_val, vids_test, all_vids 86 | 87 | 88 | def get_features_from_dir(vid_ids, feats_dir, feat_type): 89 | feats = {} 90 | 91 | for i, vid_id in enumerate(vid_ids): 92 | feat_file_path = os.path.join(feats_dir, vid_id.split('vid')[-1]) 93 | 94 | if feat_type == 'c3d': 95 | feats[vid_id] = load_c3d_feat(feat_file_path) 96 | print('features extracted successfuly: ' + feat_file_path) 97 | else: 98 | if os.path.exists(feat_file_path): 99 | feat = np.load(feat_file_path) 100 | feats[vid_id] = feat 101 | print('features extracted successfuly: ' + feat_file_path) 102 | else: 103 | print('No features found!: ' + feat_file_path) 104 | 105 | print str(i) + '/' + str(len(vid_ids)) 106 | return feats 107 | 108 | 109 | def build_ground_truth_dict(gt_dir): 110 | gt_map_file = open(os.path.join(gt_dir, 'vtt.gt'), 'r') 111 | gt_A_file = open(os.path.join(gt_dir, 'vines.textDescription.A.testingSet'), 'r') 112 | gt_B_file = open(os.path.join(gt_dir, 'vines.textDescription.B.testingSet'), 'r') 113 | 114 | gt_A_index_to_cap_dict = {} 115 | for line in gt_A_file: 116 | cap_id, cap = line.replace('\n', '').split(' ') 117 | gt_A_index_to_cap_dict[cap_id] = cap 118 | gt_B_index_to_cap_dict = {} 119 | for line in gt_B_file: 120 | cap_id, cap = line.replace('\n', '').split(' ') 121 | gt_B_index_to_cap_dict[cap_id] = cap 122 | id_to_A_B_cap_dict = {} 123 | for line in gt_map_file: 124 | vid_id, cap_id_A, cap_id_B = line.replace('\n', '').split(' ') 125 | # vidID -> (capA, capB) 126 | id_to_A_B_cap_dict['vid' + vid_id] = (gt_A_index_to_cap_dict[cap_id_A], gt_B_index_to_cap_dict[cap_id_B]) 127 | 128 | 
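    # Resulting structure (values illustrative):
    #   id_to_A_B_cap_dict = {'vid1234': ('caption from set A', 'caption from set B'), ...}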
return id_to_A_B_cap_dict 129 | 130 | 131 | def trecvid(params): 132 | pkl_dir = params.pkl_dir 133 | feats_dir = params.feats_dir 134 | gt_dir = params.gt_dir 135 | unittest = params.test 136 | splits = params.splits 137 | feat_type = params.type 138 | protocol = params.protocol 139 | 140 | if not os.path.exists(pkl_dir): 141 | os.mkdir(pkl_dir) 142 | 143 | train_path = os.path.join(pkl_dir, 'train.pkl') 144 | valid_path = os.path.join(pkl_dir, 'valid.pkl') 145 | test_path = os.path.join(pkl_dir, 'test.pkl') 146 | cap_path = os.path.join(pkl_dir, 'CAP.pkl') 147 | dict_path = os.path.join(pkl_dir, 'worddict.pkl') 148 | 149 | if protocol != '': 150 | filename = 'FEATS_{}_{}.pkl'.format(feat_type, protocol) 151 | else: 152 | filename = 'FEATS_{}.pkl'.format(feat_type) 153 | 154 | feats_path = os.path.join(pkl_dir, filename) 155 | 156 | id_to_A_B_cap_dict = build_ground_truth_dict(gt_dir) 157 | vid_feat_files = ['vid' + i for i in os.listdir(feats_dir)] 158 | 159 | annotations, vids_train, vids_val, vids_test, all_vids = get_annots_trecvid(vid_feat_files, id_to_A_B_cap_dict, unittest, splits) 160 | 161 | dump_pkl(vids_train, train_path) 162 | print('train.pkl created') 163 | dump_pkl(vids_val, valid_path) 164 | print('valid.pkl created') 165 | dump_pkl(vids_test, test_path) 166 | print('test.pkl created') 167 | 168 | dump_pkl(all_vids.keys(), os.path.join(pkl_dir, 'allvids.pkl')) 169 | dump_pkl(annotations, cap_path) 170 | print('CAP.pkl created') 171 | 172 | worddict = create_dictionary(annotations, dict_path) 173 | dump_pkl(worddict, dict_path) 174 | print('worddict.pkl created') 175 | 176 | features = get_features_from_dir(annotations.keys(), feats_dir, feat_type) 177 | dump_pkl(features, feats_path) 178 | 179 | print 'FEAT file created! Path: {}'.format(feats_path) 180 | 181 | if params.do_skip_thoughts: 182 | logger.info("Generating skip-thoughts...") 183 | import create_skip_vectors 184 | class ArgsFaker(): 185 | captions_file = cap_path 186 | output_file = os.path.join(pkl_dir, 'skip_vectors.pkl') 187 | 188 | fake_args = ArgsFaker() 189 | create_skip_vectors.main(fake_args) 190 | 191 | 192 | if __name__=='__main__': 193 | arg_parser = argparse.ArgumentParser() 194 | 195 | arg_parser.add_argument('-s', '--seed', type=int, help="Random seed.", default=SEED, required=False) 196 | arg_parser.add_argument('-f','--feats_dir',dest ='feats_dir',type=str,default='') 197 | arg_parser.add_argument('-gt','--gt_dir',dest ='gt_dir',type=str,default='') 198 | arg_parser.add_argument('-p','--pkl_dir',dest ='pkl_dir',type=str,default='') 199 | arg_parser.add_argument('-type','--type',dest ='type',type=str,default='googlenet') 200 | arg_parser.add_argument('-t','--test',dest = 'test',type=int,default=0, 201 | help='perform small unit test. If value 0 not unit test if greater than 0 gets a dataset with that numbers of videos') 202 | arg_parser.add_argument('-sp', '--splits', dest='splits', type=str, default='0.61,0.05,0.34', 203 | help='Create validation and test datasets. Usage: floats delimited by commas, ' 204 | 'of the form Tr,Val. ex: {-s 0.60,0.20,0.20}. 
Default: 0.61,0.05,0.34') 205 | arg_parser.add_argument('-proc', '--protocol', dest='protocol', type=str, default='') 206 | arg_parser.add_argument('-st', '--do_skip_thoughts', dest='do_skip_thoughts', action='store_true', default=False) 207 | 208 | args = arg_parser.parse_args() 209 | 210 | np.random.seed(args.seed) 211 | 212 | if not len(sys.argv) > 1: 213 | print arg_parser.print_help() 214 | sys.exit(0) 215 | 216 | trecvid(args) 217 | -------------------------------------------------------------------------------- /data/create_y2t.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import nltk 4 | import cPickle 5 | import sys 6 | import numpy as np 7 | 8 | from util import * 9 | import create_msr_vtt 10 | 11 | 12 | SEED = 9 13 | 14 | def get_features_from_dir(vid_frame_folder_names, feats_dir, feat_type): 15 | feats = {} 16 | 17 | for i, files in enumerate(vid_frame_folder_names): 18 | ext = '.' + files.split('.')[-1] 19 | feat_filename = files.split('/')[-1].split(ext)[0] 20 | 21 | feat_file_path = os.path.join(feats_dir, feat_filename) 22 | 23 | if feat_type == 'c3d': 24 | feats[feat_filename] = load_c3d_feat(feat_file_path) 25 | print('features extracted successfuly: ' + feat_file_path) 26 | else: 27 | if os.path.exists(feat_file_path): 28 | feat = np.load(feat_file_path) 29 | feats[feat_filename] = feat 30 | print('features extracted successfuly: ' + feat_file_path) 31 | else: 32 | print('No features found!: ' + feat_file_path) 33 | 34 | print str(i) + '/' + str(len(vid_frame_folder_names)) 35 | return feats 36 | 37 | 38 | def get_annots_y2t(vid_caption_dict, youtube_map_dict, unittest=0, splits=''): 39 | vids_train = [] 40 | vids_val = [] 41 | vids_test = [] 42 | all_vids = {} 43 | annotations = {} 44 | 45 | print 'Retrieving annotations...' 
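    # youtube_map_dict maps raw YouTube clip keys of the form 'xxxxxxxxxx_##_##'
    # (video id plus start/end second) to the internal 'vidNNNN' names used for the
    # feature files, while vid_caption_dict maps the same keys to their caption lists.
    # Illustrative example (mapping value assumed): 'mv89psg6zh4_33_46' -> 'vid1'.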
46 | 47 | pkl = youtube_map_dict 48 | if unittest: 49 | print 'UNIT TEST: On' 50 | keys = pkl.keys() 51 | np.random.shuffle(keys) 52 | keys = keys[:unittest] 53 | pkl = {key: pkl[key] for key in keys} 54 | 55 | n = len(pkl) 56 | 57 | if splits == 'yao': 58 | num_train = 1201 59 | num_valid = 100 60 | num_test = 670 61 | else: 62 | train_split, valid_split, test_split = splits.split(',') 63 | 64 | n_as_float = float(n) 65 | 66 | num_train = int(n_as_float * float(train_split)) 67 | num_valid = int(n_as_float * float(valid_split)) 68 | num_test = int(n_as_float * float(test_split)) 69 | assert n == num_train + num_valid + num_test 70 | 71 | count_train = 0 72 | count_valid = 0 73 | count_test = 0 74 | 75 | for vid_name in pkl.keys(): 76 | vid = youtube_map_dict[vid_name] 77 | 78 | for cap_id, cap in enumerate(vid_caption_dict[vid_name]): 79 | if not all_vids.has_key(vid_name): 80 | all_vids[vid_name] = 1 81 | else: 82 | all_vids[vid_name] += 1 83 | 84 | ocaption = cap 85 | ocaption = ocaption.replace('\n', '') 86 | ocaption = ocaption.strip() 87 | 88 | udata = ocaption.decode("utf-8") 89 | ocaption = udata.encode("ascii", "ignore") 90 | 91 | tokens = nltk.word_tokenize(ocaption.replace('.', '')) 92 | 93 | if len(tokens) == 0: 94 | continue 95 | 96 | tokenized = ' '.join(tokens) 97 | tokenized = tokenized.lower() 98 | 99 | if annotations.has_key(vid): 100 | annotations[vid].append({'tokenized': tokenized, 'image_id': vid, 'cap_id': str(cap_id), 'caption': ocaption}) 101 | else: 102 | annotations[vid]= [] 103 | annotations[vid].append({'tokenized': tokenized, 'image_id': vid, 'cap_id': str(cap_id), 'caption': ocaption}) 104 | 105 | if count_train < num_train: 106 | vids_train.extend([vid + '_' + str(enum) for enum, i in enumerate(annotations[vid])]) 107 | count_train += 1 108 | elif count_valid < num_valid: 109 | vids_val.extend([vid + '_' + str(enum) for enum, i in enumerate(annotations[vid])]) 110 | count_valid += 1 111 | elif count_test < num_test: 112 | vids_test.extend([vid + '_' + str(enum) for enum, i in enumerate(annotations[vid])]) 113 | count_test += 1 114 | 115 | np.random.shuffle(vids_train) 116 | np.random.shuffle(vids_val) 117 | np.random.shuffle(vids_test) 118 | 119 | return annotations, vids_train, vids_val, vids_test, all_vids 120 | 121 | 122 | def get_features_from_pkl(from_pkl_file, all_vids_dict, youtube_map_dict): 123 | pkl = cPickle.load(open(from_pkl_file)) 124 | feats = {} 125 | 126 | for key in all_vids_dict: 127 | # key is going to be of the form xxxxxxxxxx_##_## but we want vid#### 128 | vid = youtube_map_dict[key] 129 | feats[vid] = pkl[vid] 130 | 131 | return feats 132 | 133 | 134 | def fix_feature_file_names(youtube_map_dict, feats_dir, pkl_dir): 135 | feat_files = os.listdir(feats_dir) 136 | work_order = [] 137 | for original in feat_files: 138 | if original not in youtube_map_dict.values(): 139 | new_name = youtube_map_dict[original] 140 | did = "{} to {}".format(original, new_name) 141 | work_order.append(did) 142 | #print did 143 | orig_path = os.path.join(feats_dir, original) 144 | new_path = os.path.join(feats_dir, new_name) 145 | os.rename(orig_path, new_path) 146 | 147 | # Print to file a record of what names were changed 148 | work_order_path = os.path.join(pkl_dir, 'feat_name_changes.txt') 149 | f = open(work_order_path, 'w') 150 | for i in work_order: 151 | f.write(i + '\n') 152 | 153 | print "Saved name changes to {}".format(work_order_path) 154 | 155 | 156 | def y2t(params): 157 | pkl_dir = params.pkl_dir 158 | feats_dir = params.feats_dir 159 | 
json_dir = params.json_dir 160 | unittest = params.test 161 | splits = 'yao' if params.yao else params.splits 162 | feat_type = params.type 163 | protocol = params.protocol 164 | from_pkl = params.from_pkl 165 | 166 | if not os.path.exists(pkl_dir): 167 | os.mkdir(pkl_dir) 168 | 169 | if splits == 'yao': 170 | print("Using Yao2015 splits.") 171 | 172 | f = open(os.path.join(json_dir, 'dict_movieID_caption.pkl'), 'r') 173 | vid_caption_dict = cPickle.load(f) 174 | 175 | f = open(os.path.join(json_dir, 'dict_youtube_mapping.pkl'), 'r') 176 | youtube_map_dict = cPickle.load(f) 177 | 178 | if os.path.isdir(feats_dir): 179 | feat_files = set(os.listdir(feats_dir)) 180 | vidX_formatted_files = set(youtube_map_dict.values()) 181 | 182 | diff = feat_files - vidX_formatted_files 183 | if len(diff) > 0 and not from_pkl: 184 | print "Found mismatch of feature file names and youtube_mapping_dict." \ 185 | "Feature files will be re-named according to youtube_map_dict.pkl" 186 | fix_feature_file_names(youtube_map_dict, feats_dir, pkl_dir) 187 | 188 | else: 189 | print "Feature directroy not found at {}.\nExiting.".format(feats_dir) 190 | sys.exit(0) 191 | 192 | train_path = os.path.join(pkl_dir, 'train.pkl') 193 | valid_path = os.path.join(pkl_dir, 'valid.pkl') 194 | test_path = os.path.join(pkl_dir, 'test.pkl') 195 | cap_path = os.path.join(pkl_dir, 'CAP.pkl') 196 | dict_path = os.path.join(pkl_dir, 'worddict.pkl') 197 | 198 | if protocol != '': 199 | filename = 'FEATS_{}_{}.pkl'.format(feat_type, protocol) 200 | else: 201 | filename = 'FEATS_{}.pkl'.format(feat_type) 202 | 203 | feats_path = os.path.join(pkl_dir, filename) 204 | 205 | if os.path.exists(train_path) or os.path.exists(valid_path) or os.path.exists(test_path): 206 | var = raw_input("Pickle files found in [{}]. Do you want to erase them? type: yes/[no] ".format(pkl_dir)) 207 | 208 | if var == 'yes': 209 | print 'Removing old pkls...' 210 | create_msr_vtt.remove_pickle_files(cap_path, dict_path, feats_path, test_path, train_path, valid_path) 211 | 212 | else: 213 | print('Loading previous pickle files and creating new FEATS_ file at path: {}'.format(feats_path)) 214 | if os.path.exists(feats_path): 215 | os.remove(feats_path) 216 | 217 | annotations = create_msr_vtt.load_annots_vtt(cap_path) 218 | 219 | features = get_features_from_dir(annotations.keys(), feats_dir, feat_type) 220 | create_msr_vtt.dump_pkl(features, feats_path) 221 | print 'FEAT file created! Path: {}'.format(feats_path) 222 | sys.exit(0) 223 | 224 | annotations, vids_train, vids_val, vids_test, all_vids = get_annots_y2t(vid_caption_dict, youtube_map_dict, 225 | unittest, splits) 226 | 227 | dump_pkl(vids_train, train_path) 228 | print('train.pkl created') 229 | dump_pkl(vids_val, valid_path) 230 | print('valid.pkl created') 231 | dump_pkl(vids_test, test_path) 232 | print('test.pkl created') 233 | 234 | dump_pkl(all_vids.keys(), os.path.join(pkl_dir, 'allvids.pkl')) 235 | dump_pkl(annotations, cap_path) 236 | print('CAP.pkl created') 237 | 238 | worddict = create_dictionary(annotations, dict_path) 239 | dump_pkl(worddict, dict_path) 240 | print('worddict.pkl created') 241 | 242 | if from_pkl: 243 | # Getting features from pkl file. 244 | from_pkl_file = os.path.join(feats_dir, 'FEAT_key_vidID_value_features.pkl') 245 | print "Loading features from pkl file." 
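        # from_pkl path: features come from a single pre-built pickle
        # (FEAT_key_vidID_value_features.pkl, keyed by 'vidNNNN') instead of from
        # per-video files under feats_dir; get_features_from_pkl simply selects the
        # entries for the videos present in all_vids and re-keys them by vid id.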
246 | features = get_features_from_pkl(from_pkl_file, all_vids, youtube_map_dict) 247 | else: 248 | features = get_features_from_dir(annotations.keys(), feats_dir, feat_type) 249 | dump_pkl(features, feats_path) 250 | print 'FEAT file created! Path: {}'.format(feats_path) 251 | 252 | if params.do_skip_thoughts: 253 | print("Generating skip-thoughts...") 254 | import create_skip_vectors 255 | class ArgsFaker(): 256 | captions_file = cap_path 257 | output_file = os.path.join(pkl_dir, 'skip_vectors.pkl') 258 | 259 | fake_args = ArgsFaker() 260 | create_skip_vectors.main(fake_args) 261 | 262 | 263 | def _validate(args): 264 | if args.type not in args.feats_dir: 265 | print("FATAL : Requested feature type {}, but directories are something else:\tfeats_dir={}".format(args.type, args.feats_dir)) 266 | sys.exit(0) 267 | 268 | 269 | if __name__=='__main__': 270 | arg_parser = argparse.ArgumentParser() 271 | 272 | arg_parser.add_argument('-s', '--seed', type=int, help="Random seed.", default=SEED, required=False) 273 | arg_parser.add_argument('-f','--feats_dir',dest ='feats_dir',type=str, required=True) 274 | arg_parser.add_argument('-j','--json_dir',dest ='json_dir',type=str,required=True) 275 | arg_parser.add_argument('-p','--pkl_dir',dest ='pkl_dir',type=str,required=True) 276 | arg_parser.add_argument('-type','--type',dest ='type',type=str, choices=['resnet', 'googlenet', 'nasnetalarge', 'resnet152', 'pnasnet5large', 'polynet', 'senet154']) 277 | arg_parser.add_argument('-t','--test',dest = 'test',type=int,default=0, 278 | help='perform small unit test. If value 0 not unit test if greater than 0 gets a dataset with that numbers of videos') 279 | arg_parser.add_argument('-sp', '--splits', dest='splits', type=str, default='0.61,0.05,0.34', 280 | help='Create validation and test datasets. Usage: floats delimited by commas, ' 281 | 'of the form Tr,Val. ex: {-s 0.60,0.40}. Off by default.', required=False) 282 | arg_parser.add_argument('-proc', '--protocol', dest='protocol', type=str, default='') 283 | arg_parser.add_argument('-from_pkl', '--from_pkl', dest='from_pkl', type=int, default=0, 284 | help='If >=1, load features from pickle file instead of raw feature files.' 
285 | 'Note that this is negated if loading pre-existing pickle files.') 286 | arg_parser.add_argument('-st', '--do_skip_thoughts', dest='do_skip_thoughts', action='store_true', default=False) 287 | arg_parser.add_argument('-y', '--yao', dest='yao', action='store_true', default=False, help='Use Yao2015 split.') 288 | 289 | args = arg_parser.parse_args() 290 | 291 | np.random.seed(args.seed) 292 | 293 | if not len(sys.argv) > 1: 294 | print arg_parser.print_help() 295 | sys.exit(0) 296 | 297 | _validate(args) 298 | 299 | y2t(args) 300 | -------------------------------------------------------------------------------- /data/process_frames.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import time 5 | from multiprocessing import Pool 6 | 7 | 8 | def main(args): 9 | src_dir = args.src_dir 10 | dst_dir = args.dst_dir 11 | start = int(args.start) 12 | end = int(args.end) 13 | PREPEND = args.prepend 14 | 15 | src_files = os.listdir(src_dir) 16 | 17 | if not os.path.isdir(dst_dir): 18 | os.mkdir(dst_dir) 19 | 20 | tuple_list = [] 21 | 22 | for video_file in src_files[start:end]: 23 | src_path = os.path.join(src_dir, video_file) 24 | dst_path = os.path.join(dst_dir, video_file) 25 | 26 | tuple_list.append((PREPEND, video_file, src_path, dst_path)) 27 | 28 | pool = Pool() # Default to number cores 29 | pool.map(process_vid, tuple_list) 30 | pool.close() 31 | pool.join() 32 | 33 | 34 | def process_vid(args): 35 | (PREPEND, video_file, src_path, dst_path) = args 36 | if not os.path.isdir(dst_path): 37 | os.mkdir(dst_path) 38 | # command = 'ffmpeg -i '+ src_path+' -s 256x256 '+ dst_path + '/%5d.jpg' #with resize 39 | command = PREPEND + 'ffmpeg -i '+ src_path+' -r 20 '+ dst_path + '/%6d.jpg > /dev/null 2>&1' #6 is to be in accordance with C3D features. 
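        # Illustrative expansion of the command built above (paths hypothetical):
        #   ffmpeg -i /path/to/videos/video1.mp4 -r 20 /path/to/frames/video1.mp4/%6d.jpg > /dev/null 2>&1
        # i.e. sample frames at 20 fps and write them as numbered JPEGs with a
        # 6-digit field width (the C3D convention noted above), discarding ffmpeg's
        # console output.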
40 | print(command) 41 | 42 | os.system(command) 43 | else: 44 | print("Frames directory already found at {}".format(dst_path)) 45 | 46 | 47 | if __name__=='__main__': 48 | arg_parser = argparse.ArgumentParser() 49 | arg_parser.add_argument( 50 | 'src_dir', 51 | help='directory where videos are' 52 | ) 53 | arg_parser.add_argument( 54 | 'dst_dir', 55 | help='directory where to store frames' 56 | ) 57 | arg_parser.add_argument( 58 | 'start', 59 | help='start index (inclusive)' 60 | ) 61 | arg_parser.add_argument( 62 | 'end', 63 | help='end index (noninclusive)' 64 | ) 65 | arg_parser.add_argument( 66 | '--prepend', 67 | default='', 68 | help='optional prepend to start of ffmpeg command (in case you want to use a non-system wide version of ffmpeg)' 69 | 'For example: --prepend ~/anaconda2/bin/ will use ffmpeg installed in anaconda2' 70 | ) 71 | 72 | if not len(sys.argv) > 1: 73 | print(arg_parser.print_help()) 74 | sys.exit(0) 75 | 76 | args = arg_parser.parse_args() 77 | 78 | start_time = time.time() 79 | main(args) 80 | print("Job took %s mins" % ((time.time() - start_time)/60)) -------------------------------------------------------------------------------- /data/process_pca.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import PCA 2 | 3 | import sys 4 | import argparse 5 | import numpy as np 6 | import os 7 | import shutil 8 | from data import create_msr_vtt 9 | 10 | 11 | def gather_feats(feats_dir, unittest): 12 | sampling = True 13 | 14 | feats_orig = os.listdir(feats_dir) 15 | if unittest: 16 | feats_orig = feats_orig[:unittest] 17 | 18 | assert len(feats_orig) >= 2 19 | 20 | # Get first feature so np.concatenate has something to use 21 | with open(os.path.join(feats_dir, feats_orig[0])) as f: 22 | feats = np.load(f) 23 | if sampling: 24 | feats = create_msr_vtt.get_sub_frames(feats) 25 | counter = 1 26 | 27 | for key in feats_orig[1:]: 28 | with open(os.path.join(feats_dir, key)) as f: 29 | feat = np.load(f) 30 | if sampling: 31 | feat = create_msr_vtt.get_sub_frames(feat) 32 | feats = np.concatenate((feats, feat), axis=0) 33 | sys.stdout.write('\r' + '{' + key + '} ' + str(counter) + '/' + str(len(feats_orig)) + '\n') 34 | sys.stdout.flush() 35 | counter+=1 36 | print "saving concatenated feats.." 37 | 38 | return feats 39 | 40 | 41 | def main(): 42 | ap = argparse.ArgumentParser() 43 | ap.add_argument('-f', '--feats_dir', dest='feats_dir', type=str, default='') 44 | ap.add_argument('-ft', '--feats_testing_dir', dest='feats_testing_dir', type=str, default='') 45 | ap.add_argument('-pca', '--pca_dir', dest='pca_dir', type=str, default='') 46 | ap.add_argument('-pca_test', '--pca_test_dir', dest='pca_test_dir', type=str, default='') 47 | ap.add_argument('-type', '--type', dest='type', type=str, default='googlenet') 48 | ap.add_argument('-t', '--test', dest='test', type=int, default=0, 49 | help='perform small unit test. 
If value 0 not unit test if greater than 0 gets a dataset with that numbers of videos') 50 | ap.add_argument('-train_pkl', '--training_pkl', dest='train_pkl', type=str, default='') 51 | ap.add_argument('-test_pkl', '--testing_pkl', dest='test_pkl', type=str, default='') 52 | 53 | if not len(sys.argv) > 1: 54 | print ap.print_help() 55 | sys.exit(0) 56 | 57 | args = ap.parse_args() 58 | 59 | feats_dir = args.feats_dir 60 | feats_test_dir = args.feats_testing_dir 61 | pca_dir = args.pca_dir 62 | pca_test_dir = args.pca_test_dir 63 | type = args.type 64 | unittest = args.test 65 | 66 | given_train_pkl = args.train_pkl 67 | given_test_pkl = args.test_pkl 68 | 69 | print "Extracting regular feature files..." 70 | extract_and_write_pca(feats_dir, feats_dir, pca_dir, type, unittest, given_train_pkl) 71 | 72 | #print "Extracting test feature files..." 73 | #extract_and_write_pca(feats_test_dir, feats_test_dir, pca_test_dir, type, unittest) 74 | 75 | 76 | def extract_and_write_pca(transforming_feats_dir, fit_feats_dir, pca_dir, type, unittest, given_train_pkl): 77 | if given_train_pkl: 78 | pca = create_msr_vtt.load_pkl(given_train_pkl) 79 | else: 80 | # Refactor later to allow for mixing of fit feat files 81 | feats = gather_feats(fit_feats_dir, unittest) 82 | pca = PCA(n_components=1024).fit(feats) 83 | 84 | #dump_pkl(pca, os.path.join(pca_dir, 'pca_{}.pkl'.format(type))) 85 | 86 | if os.path.isdir(pca_dir): 87 | if raw_input("Found PCA folder, remove? [y/n]") == 'y': 88 | shutil.rmtree(pca_dir) 89 | else: 90 | print "Bye" 91 | sys.exit(0) 92 | 93 | os.mkdir(pca_dir) 94 | 95 | t_feat_files = os.listdir(transforming_feats_dir) 96 | if unittest: 97 | t_feat_files = t_feat_files[:unittest] 98 | 99 | for i, key in enumerate(t_feat_files, start=1): 100 | orig_feat_path = os.path.join(transforming_feats_dir, key) 101 | pca_feat_path = os.path.join(pca_dir, key) 102 | 103 | if type == 'c3d': 104 | feat = create_msr_vtt.load_c3d_feat(orig_feat_path) 105 | pca_feat = pca.transform(feat) 106 | 107 | elif type == 'resnet': 108 | with open(orig_feat_path) as f: 109 | feat = np.load(f) 110 | pca_feat = pca.transform(feat) 111 | else: 112 | print "Invalid feature type. Exiting." 113 | sys.exit(0) 114 | 115 | np.save(open(pca_feat_path, 'wb'), pca_feat) 116 | 117 | print str(i) + '/' + str(len(t_feat_files)) 118 | print 'processed: ' + str(len(t_feat_files)) + " features." 
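
# Workflow sketch for this script: features under fit_feats_dir are subsampled with
# create_msr_vtt.get_sub_frames, concatenated, and used to fit PCA(n_components=1024)
# (unless a pre-fit PCA pickle is supplied via -train_pkl); each feature file in
# transforming_feats_dir is then projected and written under pca_dir with the same
# filename. Illustrative invocation (paths assumed):
#   python process_pca.py -f /data/feats_resnet -pca /data/feats_resnet_pca -type resnet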
119 | 120 | 121 | if __name__ == '__main__': 122 | main() -------------------------------------------------------------------------------- /data/py3_process_features.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import pretrainedmodels 4 | import pretrainedmodels.utils as utils 5 | import torch.nn as nn 6 | import argparse 7 | import time 8 | import data.validate_feats as validate_feats 9 | import os 10 | import numpy as np 11 | import logging 12 | import shutil 13 | 14 | from multiprocessing import Pool 15 | 16 | logging.basicConfig() 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | 20 | available_features = ['nasnetalarge', 'resnet152', 'pnasnet5large', 'densenet121', 'senet154', 'polynet'] 21 | 22 | args = None 23 | 24 | 25 | def init_model(gpu_ids, model_name): 26 | # model_name = 'pnasnet5large' 27 | # could be fbresnet152 or inceptionresnetv2 28 | model = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet') 29 | model.eval() 30 | load_img = utils.LoadImage() 31 | 32 | # transformations depending on the model 33 | # rescale, center crop, normalize, and others (ex: ToBGR, ToRange255) 34 | tf_img = utils.TransformImage(model) 35 | 36 | """ 37 | TODO(WG): Would be nice to use something like DataParallel, but that only does forward pass on given module. 38 | Need to stop before logits step. 39 | Should create wrapper for pretrainedmodels that does the MPI-like ops across GPUs on model.features modules: 40 | 1) replicated 41 | 2) scatter 42 | 3) parallel_apply 43 | 4) gather 44 | Would have to know what layers are being used on each model. 45 | """ 46 | if torch.cuda.is_available(): 47 | model = model.cuda(device=gpu_ids[0]) 48 | 49 | return load_img, tf_img, model 50 | 51 | 52 | def extract_features(args): 53 | root_frames_dir = args.frames_dir 54 | root_feats_dir = args.feats_dir 55 | work = args.work 56 | autofill = int(args.autofill) 57 | ftype = args.type 58 | gpu_list = args.gpu_list 59 | 60 | frames_dirs = os.listdir(root_frames_dir) 61 | 62 | if not os.path.isdir(root_feats_dir): 63 | os.mkdir(root_feats_dir) 64 | # else: 65 | # if autofill: 66 | # logger.info('AUTOFILL ON: Attempting to autofill missing features.') 67 | # frames_dirs = validate_feats.go(featsd=root_feats_dir, framesd=root_frames_dir) 68 | 69 | # Difficulty of each job is measured by # of frames to process in each chunk. 70 | # Can't be randomized since autofill list woudld be no longer valid. 71 | # np.random.shuffle(frames_dirs) 72 | work = len(frames_dirs) if not work else work 73 | 74 | load_img, tf_img, model = init_model(args.gpu_list, args.type) 75 | 76 | work_done = 0 77 | while work_done != work: 78 | frames_dirs_avail = diff_feats(root_frames_dir, root_feats_dir) 79 | if len(frames_dirs_avail) == 0: 80 | break 81 | 82 | frames_dir = np.random.choice(frames_dirs_avail) 83 | ext = '.' + frames_dir.split('.')[-1] 84 | feat_filename = frames_dir.split('/')[-1].split(ext)[0] 85 | video_feats_path = os.path.join(args.feats_dir, feat_filename) 86 | 87 | if os.path.exists(video_feats_path): 88 | logger.info('Features already extracted:\t{}'.format(video_feats_path)) 89 | continue 90 | 91 | try: 92 | frames_to_do = [os.path.join(args.frames_dir, frames_dir, p) for p in 93 | os.listdir(os.path.join(args.frames_dir, frames_dir))] 94 | except Exception as e: 95 | logger.exception(e) 96 | continue 97 | 98 | # Must sort so frames follow numerical order. 
os.listdir does not guarantee order. 99 | frames_to_do.sort() 100 | 101 | if len(frames_to_do) == 0: 102 | logger.warning("Frame folder has no frames! Skipping...") 103 | continue 104 | 105 | # Save a flag copy 106 | with open(video_feats_path, 'wb') as pf: 107 | np.save(pf, []) 108 | 109 | try: 110 | batches = create_batches(frames_to_do, load_img, tf_img, batch_size=args.batch_size) 111 | except OSError as e: 112 | logger.exception(e) 113 | logger.warning("Corrupt image file. Skipping...") 114 | os.remove(video_feats_path) 115 | continue 116 | 117 | logger.debug("Start video {}".format(work_done)) 118 | 119 | feats = process_batches(batches, ftype, gpu_list, model) 120 | 121 | with open(video_feats_path, 'wb') as pf: 122 | np.save(pf, feats) 123 | logger.info('Saved complete features to {}.'.format(video_feats_path)) 124 | work_done += 1 125 | 126 | 127 | def process_batches(batches, ftype, gpu_list, model): 128 | done_batches = [] 129 | for i, batch in enumerate(batches): 130 | if torch.cuda.is_available(): 131 | batch = batch.cuda(device=gpu_list[0]) 132 | 133 | output_features = model.features(batch) 134 | output_features = output_features.data.cpu() 135 | 136 | conv_size = output_features.shape[-1] 137 | 138 | if ftype == 'nasnetalarge' or ftype == 'pnasnet5large': 139 | relu = nn.ReLU() 140 | rf = relu(output_features) 141 | avg_pool = nn.AvgPool2d(conv_size, stride=1, padding=0) 142 | out_feats = avg_pool(rf) 143 | else: 144 | avg_pool = nn.AvgPool2d(conv_size, stride=1, padding=0) 145 | out_feats = avg_pool(output_features) 146 | 147 | out_feats = out_feats.view(out_feats.size(0), -1) 148 | logger.info('Processed {}/{} batches.\r'.format(i + 1, len(batches))) 149 | 150 | done_batches.append(out_feats) 151 | feats = np.concatenate(done_batches, axis=0) 152 | return feats 153 | 154 | 155 | def create_batches(frames_to_do, load_img_fn, tf_img_fn, batch_size=8): 156 | n = len(frames_to_do) 157 | if n < batch_size: 158 | logger.warning("Sample size less than batch size: Cutting batch size.") 159 | batch_size = n 160 | 161 | logger.info("Generating {} batches...".format(n // batch_size)) 162 | batches = [] 163 | frames_to_do = np.array(frames_to_do) 164 | 165 | for idx in range(0, n, batch_size): 166 | frames_idx = list(range(idx, min(idx+batch_size, n))) 167 | batch_frame_paths = frames_to_do[frames_idx] 168 | 169 | batch_tensor = torch.zeros((len(batch_frame_paths),) + tuple(tf_img_fn.input_size)) 170 | for i, frame_path in enumerate(batch_frame_paths): 171 | input_img = load_img_fn(frame_path) 172 | input_tensor = tf_img_fn(input_img) # 3x400x225 -> 3x299x299 size may differ 173 | # input_tensor = input_tensor.unsqueeze(0) # 3x299x299 -> 1x3x299x299 174 | batch_tensor[i] = input_tensor 175 | 176 | batch_ag = torch.autograd.Variable(batch_tensor, requires_grad=False) 177 | batches.append(batch_ag) 178 | 179 | return batches 180 | 181 | 182 | def diff_feats(frames_dir, feats_dir): 183 | feats = set(os.listdir(feats_dir)) 184 | frames_to_ext = {'.'.join(i.split('.')[:-1]): i.split('.')[-1] for i in os.listdir(frames_dir)} 185 | frames = set(frames_to_ext.keys()) 186 | needed_feats = frames - feats 187 | needed_feats = [i + '.' 
+ frames_to_ext[i] for i in needed_feats] 188 | return needed_feats 189 | 190 | 191 | if __name__ == '__main__': 192 | arg_parser = argparse.ArgumentParser() 193 | arg_parser.add_argument('frames_dir',help = 'Directory where there are frame directories.') 194 | arg_parser.add_argument('feats_dir',help = 'Root directory of dataset\'s processed videos.') 195 | arg_parser.add_argument('-w', '--work', help = 'Number of features to process. Defaults to all.', default=0, type=int) 196 | arg_parser.add_argument('-gl', '--gpu_list', required=True, nargs='+', type=int, help="Space delimited list of GPU indices to use. Example for 4 GPUs: -gl 0 1 2 3") 197 | arg_parser.add_argument('-bs', '--batch_size', type=int, help="Batch size to use during feature extraction. Larger batch size = more VRAM usage", default=8) 198 | arg_parser.add_argument('--type', required=True, help = 'ConvNet to use for processing features.', choices=available_features) 199 | arg_parser.add_argument('--autofill', action='store_true', default=False, help="Perform diff between frames_dir and feats_dir and fill them in.") 200 | 201 | args = arg_parser.parse_args() 202 | 203 | start_time = time.time() 204 | 205 | logger.info("Found {} GPUs, using {}.".format(torch.cuda.device_count(), len(args.gpu_list))) 206 | 207 | extract_features(args) 208 | 209 | logger.info("Job took %s mins" % ((time.time() - start_time)/60)) 210 | -------------------------------------------------------------------------------- /data/subsect_videos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import re 5 | import argparse 6 | from math import floor 7 | from multiprocessing import Pool 8 | 9 | 10 | def do_command(command): 11 | os.system(command) 12 | 13 | 14 | def general_case(args): 15 | if args.annots_path.endswith('.json'): 16 | # Load user specified json file. 17 | json_file = open(args.annots_path) 18 | else: 19 | json_file = open(os.path.join(args.annots_path, 'videodatainfo_2017.json')) 20 | 21 | json_str = json_file.read() 22 | json_data = json.loads(json_str) 23 | 24 | src_dir = args.src_dir 25 | dst_dir = args.dst_dir 26 | start = int(args.start) 27 | end = int(args.end) 28 | 29 | src_files = os.listdir(src_dir) 30 | 31 | 32 | if not os.path.isdir(dst_dir): 33 | os.mkdir(dst_dir) 34 | 35 | command_list = [] 36 | 37 | for video_file in src_files[start:end]: 38 | # Get index from video file name 39 | video_index = int(re.findall('\d+', video_file)[0]) 40 | 41 | # Two scenarios: 42 | # Subsecting training videos, which go video0 to video9999 43 | # Subsecting test videos, which go video10000 to vieo12999 44 | # To account for either case, take mod 10000 to get the correct 0-based index to use in json lookup. 
45 | video_index %= 10000 46 | 47 | start_time = float(json_data['videos'][video_index]['start time']) 48 | end_time = float(json_data['videos'][video_index]['end time']) 49 | duration = end_time - start_time 50 | 51 | src_path = os.path.join(src_dir, video_file) 52 | 53 | dst_path = os.path.join(dst_dir, video_file) 54 | 55 | if os.path.isfile(dst_path): 56 | print 'File at {} already exists!'.format(dst_path) 57 | continue 58 | 59 | ffmpeg_subsection_cmd = "ffmpeg -ss {} -i {} -t {} -vcodec copy -acodec copy {}".format( 60 | start_time, src_path, duration, dst_path) 61 | command_list.append(ffmpeg_subsection_cmd) 62 | 63 | threadPool = Pool() 64 | threadPool.map(do_command, command_list) 65 | threadPool.close() 66 | threadPool.join() 67 | 68 | 69 | def tacos(args): 70 | def frame_to_timeestamp(frame_rate, frame_num): 71 | return float("%.3f" % (float(frame_num) / float(frame_rate))) 72 | 73 | if args.annots_path.endswith('.tsv'): 74 | # Load user specified json file. 75 | tsv_file = open(args.annots_path) 76 | else: 77 | tsv_file = open(os.path.join(args.annots_path, 'index.tsv')) 78 | 79 | data = [i for i in tsv_file] 80 | 81 | src_dir = args.src_dir 82 | dst_dir = args.dst_dir 83 | start = int(args.start) 84 | end = int(args.end) 85 | 86 | if not os.path.isdir(dst_dir): 87 | os.makedirs(dst_dir) 88 | 89 | command_list = [] 90 | 91 | for line in data: 92 | groups = line.replace('\n', '').split('\t') 93 | dest_vid = groups[0] 94 | sentence = groups[1] 95 | src_vid = groups[2] 96 | start_frame = float(groups[3]) 97 | end_frame = float(groups[4]) 98 | 99 | start_time = frame_to_timeestamp(29.40, start_frame) 100 | duration_time = frame_to_timeestamp(29.40, end_frame - start_frame) 101 | src_path = os.path.join(src_dir, src_vid + '.avi') 102 | dst_path = os.path.join(dst_dir, dest_vid + '.avi') 103 | 104 | if os.path.isfile(dst_path): 105 | print 'File at {} already exists!'.format(dst_path) 106 | continue 107 | 108 | ffmpeg_subsection_cmd = "ffmpeg -ss {} -i {} -t {} -vcodec copy -acodec copy {}".format( 109 | start_time, src_path, duration_time, dst_path) 110 | command_list.append(ffmpeg_subsection_cmd) 111 | 112 | threadPool = Pool() 113 | threadPool.map(do_command, command_list) 114 | threadPool.close() 115 | threadPool.join() 116 | 117 | 118 | if __name__ == '__main__': 119 | arg_parser = argparse.ArgumentParser() 120 | arg_parser.add_argument('src_dir', help='directory where to get full videos') 121 | arg_parser.add_argument('dst_dir',help = 'directory where to store subsections') 122 | arg_parser.add_argument('annots_path', help='directory where annotations file is stored') 123 | arg_parser.add_argument('start',help = 'start video index') 124 | arg_parser.add_argument('end',help = 'end video index') 125 | arg_parser.add_argument('--dataset', help='dataset being worked on') 126 | 127 | args = arg_parser.parse_args() 128 | 129 | if args.dataset == 'tacos': 130 | tacos(args) 131 | else: 132 | general_case(args) 133 | -------------------------------------------------------------------------------- /data/util.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import os 3 | import numpy as np 4 | import sys 5 | import logging 6 | 7 | from collections import OrderedDict 8 | 9 | 10 | logging.basicConfig() 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.DEBUG) 13 | 14 | 15 | def dump_pkl(obj, path): 16 | """ 17 | Save a Python object into a pickle file. 
18 | """ 19 | f = open(path, 'wb') 20 | try: 21 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 22 | 23 | finally: 24 | f.close() 25 | print path+' created' 26 | 27 | 28 | def load_pkl(path): 29 | """ 30 | Load a pickled file. 31 | 32 | :param path: Path to the pickled file. 33 | 34 | :return: The unpickled Python object. 35 | """ 36 | f = open(path, 'rb') 37 | try: 38 | rval = cPickle.load(f) 39 | finally: 40 | f.close() 41 | return rval 42 | 43 | def create_dictionary(annotations,pkl_dir): 44 | worddict = OrderedDict() 45 | word_idx = 2 46 | for a in annotations: 47 | caps = annotations[a] 48 | 49 | for cap in caps: 50 | tokens = cap['tokenized'].split() 51 | for token in tokens: 52 | if token not in ['','\t','\n',' ']: 53 | if not worddict.has_key(token): 54 | worddict[token]=word_idx 55 | word_idx+=1 56 | 57 | return worddict 58 | 59 | 60 | def pad_frames(frames, limit): 61 | last_frame = frames[-1] 62 | padding = np.asarray([last_frame * 0.]*(limit-len(frames))) 63 | frames_padded = np.concatenate([frames, padding], axis=0) 64 | return frames_padded 65 | 66 | 67 | def extract_frames_equally_spaced(frames, K): 68 | # chunk frames into 'how_many' segments and use the first frame 69 | # from each segment 70 | n_frames = len(frames) 71 | splits = np.array_split(range(n_frames), K) 72 | idx_taken = [s[0] for s in splits] 73 | sub_frames = frames[idx_taken] 74 | return sub_frames 75 | 76 | 77 | def get_sub_frames(frames): 78 | 79 | K=28 80 | if len(frames) < K: 81 | frames_ = pad_frames(frames, K) 82 | else: 83 | frames_ = extract_frames_equally_spaced(frames, K) 84 | 85 | return frames_ 86 | 87 | 88 | def load_c3d_feat(feat_file_path): 89 | if os.path.exists(feat_file_path): 90 | files = os.listdir(feat_file_path) 91 | files.sort() 92 | allftrs = np.zeros((len(files), 4101),dtype=np.float32) 93 | 94 | for j in range(0, len(files)): 95 | feat = np.fromfile(os.path.join(feat_file_path, files[j]),dtype=np.float32) 96 | allftrs[j,:] = feat 97 | allftrs = get_sub_frames(allftrs) 98 | 99 | return allftrs 100 | else: 101 | print 'error feature file doesnt exist'+feat_file_path 102 | sys.exit(0) 103 | 104 | 105 | def mkdirs_safe(dir): 106 | try: 107 | if not os.path.isdir(dir): 108 | os.makedirs(dir) 109 | except OSError as e: 110 | logger.exception(e) 111 | 112 | 113 | def create_line(seed, dataset, annots_dir, feature_type, pickle_dir, feature_dir, feature_test_dir, ut=0, st=False): 114 | if dataset == 'mvad' or dataset == 'mpii' or dataset == 'lsmdc16': 115 | line = "python create_mvad_mpii_lsmdc.py " 116 | line += "-s {} ".format(seed) 117 | line += "-d {} ".format(annots_dir) 118 | line += "-p {} ".format(pickle_dir) 119 | line += "-dbname {} ".format(dataset) 120 | elif dataset == 'tacos': 121 | line = "python create_tacos.py " 122 | line += "-s {} ".format(seed) 123 | line += "-f {} ".format(feature_dir) 124 | line += "-gt {} ".format(annots_dir) 125 | line += "-p {} ".format(pickle_dir) 126 | elif dataset == 'youtube2text': 127 | line = "python create_y2t.py " 128 | line += "-s {} ".format(seed) 129 | line += "-f {} ".format(feature_dir) 130 | line += "-j {} ".format(annots_dir) 131 | line += "-p {} ".format(pickle_dir) 132 | line += "-type {} ".format(feature_type) 133 | elif dataset == 'vtt16': 134 | line = "python create_msr_vtt.py " 135 | line += "-s {} ".format(seed) 136 | line += "-f {} ".format(feature_dir) 137 | line += "-ft {} ".format(feature_test_dir) 138 | line += "-j {} ".format(annots_dir) 139 | line += "-p {} ".format(pickle_dir) 140 | line += "-type {} 
".format(feature_type) 141 | line += "-v 2016 " 142 | line += "-ws " 143 | elif dataset == 'vtt17': 144 | line = "python create_msr_vtt.py " 145 | line += "-s {} ".format(seed) 146 | line += "-f {} ".format(feature_dir) 147 | line += "-ft {} ".format(feature_test_dir) 148 | line += "-j {} ".format(annots_dir) 149 | line += "-p {} ".format(pickle_dir) 150 | line += "-type {} ".format(feature_type) 151 | line += "-v 2017 " 152 | line += "-ws " 153 | elif dataset == 'trecvid': 154 | line = "python create_trecvid.py " 155 | line += "-s {} ".format(seed) 156 | line += "-f {} ".format(feature_dir) 157 | line += "-gt {} ".format(annots_dir) 158 | line += "-p {} ".format(pickle_dir) 159 | line += "-type {} ".format(feature_type) 160 | else: 161 | raise NotImplementedError("Dataset not implemented: {}".format(dataset)) 162 | 163 | if ut: 164 | line += "-t {} ".format(ut) 165 | if st: 166 | line += "-st " 167 | 168 | return line 169 | -------------------------------------------------------------------------------- /data/validate_feats.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import logging 4 | 5 | logging.basicConfig() 6 | logger = logging.getLogger(__name__) 7 | logger.setLevel(logging.DEBUG) 8 | 9 | 10 | def go(args=None, featsd=None, framesd=None): 11 | 12 | logger.info("\nParsing frame and feature directories.") 13 | if args is not None: 14 | feats_dir = args.feats_dir 15 | frames_dir = args.frames_dir 16 | else: 17 | feats_dir = featsd 18 | frames_dir = framesd 19 | 20 | feats = set(os.listdir(feats_dir)) 21 | # '.'.join(i.split('.')[:-1]): Get video name up to the extension (last group) 22 | frames_to_ext = {'.'.join(i.split('.')[:-1]): i.split('.')[-1] for i in os.listdir(frames_dir)} 23 | frames = set(frames_to_ext.keys()) 24 | 25 | logger.info('There are {} feature files and {} frame folders.'.format(len(feats), len(frames))) 26 | assert len(frames) >= len(feats) 27 | 28 | logger.info("Validate existing features...") 29 | bad_feats = set() 30 | invalid_paths = [] 31 | sizes = {} 32 | 33 | for feat in feats: 34 | fpath = os.path.join(feats_dir, feat) 35 | stat = os.stat(fpath) 36 | sizes[fpath] = stat.st_size 37 | 38 | if stat.st_size <= 130: # Empty npy file is usually 80 bytes. Flag file is 128 39 | bad_feats.add(feat) 40 | invalid_paths.append(fpath) 41 | 42 | if bad_feats: 43 | logger.warning("There are {} nil features.".format(len(bad_feats))) 44 | feats = feats - bad_feats 45 | logger.info("Invalid paths start:") 46 | for fpath in invalid_paths: 47 | print("-> " + fpath) 48 | if args.rm_nil: 49 | os.remove(fpath) 50 | print("--> Removed!") 51 | else: 52 | logger.info("Existing features are valid (filesize > 130B).") 53 | 54 | if sizes: 55 | logger.info("Smallest feature was {} Bytes\n------------------".format(min(sizes.values()))) 56 | 57 | logger.info("In total, there are {} missing features.".format(len(frames - feats))) 58 | 59 | if args is None: 60 | needed_feats = frames - feats 61 | # Put back together extension since intersection is finished 62 | needed_feats = [i + '.' 
+ frames_to_ext[i] for i in needed_feats] 63 | return needed_feats 64 | 65 | 66 | if __name__ == '__main__': 67 | ap = argparse.ArgumentParser() 68 | ap.add_argument('frames_dir', help='Frames directory') 69 | ap.add_argument('feats_dir', help='Features directory') 70 | ap.add_argument('-rm', '--rm_nil', help="Remove nil/invalid features.", default=False, action='store_true') 71 | 72 | args = ap.parse_args() 73 | 74 | go(args=args) 75 | -------------------------------------------------------------------------------- /data_engine.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os, socket, shutil 3 | import sys, re 4 | import time 5 | from collections import OrderedDict 6 | import numpy 7 | # import tables 8 | import theano 9 | import theano.tensor as T 10 | import common 11 | import numpy as np 12 | 13 | # sys.path.append('skip-thoughts') 14 | # import skipthoughts 15 | from scipy import spatial 16 | from nltk.corpus import stopwords 17 | 18 | from multiprocessing import Process, Queue, Manager 19 | 20 | hostname = socket.gethostname() 21 | 22 | 23 | class Movie2Caption(object): 24 | 25 | def __init__(self, model_type, signature, video_feature, 26 | mb_size_train, mb_size_test, maxlen, n_words,dec,proc, 27 | n_frames=None, outof=None, data_dir='', feats_dir='' 28 | ): 29 | self.signature = signature 30 | self.model_type = model_type 31 | self.video_feature = video_feature 32 | self.maxlen = maxlen 33 | self.n_words = n_words 34 | self.K = n_frames 35 | self.OutOf = outof 36 | self.dec = dec 37 | 38 | self.mb_size_train = mb_size_train 39 | self.mb_size_test = mb_size_test 40 | self.non_pickable = [] 41 | self.proc = proc 42 | self.host = socket.gethostname() 43 | self.data_dir=data_dir 44 | self.feats_dir = feats_dir 45 | 46 | # self.test_mode = 0 #don't chage this when in production 47 | self.load_data() 48 | 49 | 50 | 51 | if dec=='multi-stdist': 52 | # self.st_model = skipthoughts.load_model() #refactoring ... 
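# Illustrative sketch (not part of the original code): how the skip-thought
# caption distances cached just below (self.cap_distances) are used later in
# get_z_seq() to pick the caption farthest from the query caption.
# The random matrix stands in for one video's entry in skip_vectors.pkl.
import numpy as np
from scipy import spatial

def most_distant_caption(vectors, query_id):
    # Pairwise cosine distances between every caption embedding of one video.
    dists = spatial.distance.cdist(vectors, vectors, 'cosine')
    # Drop the query itself, then return the index of the farthest caption.
    others = [j for j in range(len(vectors)) if j != query_id]
    return others[int(np.argmax(dists[query_id, others]))]

# fake_vectors = np.random.rand(5, 2400)   # 5 captions, toy embedding size
# most_distant_caption(fake_vectors, 0)    # index of the most dissimilar caption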
53 | # vectors = skipthoughts.encode(engine.st_model, captions) 54 | 55 | self.cap_distances = {} 56 | 57 | 58 | def _filter_feature(self, vidID): 59 | feat = self.FEAT[vidID] 60 | # print vidID 61 | # print feat 62 | feat = self.get_sub_frames(feat) 63 | return feat 64 | 65 | def _filter_c3d_resnet(self, vidID): 66 | feat = self.FEAT[vidID] 67 | feat2 = self.FEAT2[vidID] 68 | # print vidID 69 | # print feat 70 | feat = self.get_sub_frames(feat) 71 | feat2 = self.get_sub_frames(feat2) 72 | 73 | cfeat =np.concatenate((feat,feat2),axis=1) 74 | return cfeat 75 | 76 | def _load_feat_file(self, vidID): 77 | 78 | # feats_dir =os.path.join(data_dir,'features_chal') 79 | feat = [] 80 | feats_dir = self.feats_dir 81 | 82 | feat_filename = vidID#files.split('/')[-1].split('.avi')[0] 83 | feat_file_path = os.path.join(feats_dir,feat_filename) 84 | 85 | if os.path.exists(feat_file_path): 86 | feat = np.load(feat_file_path) 87 | 88 | if len(feat) > 0: 89 | feat = self.get_sub_frames(feat) 90 | else: 91 | print 'feature file is empty '+feat_file_path 92 | print feat 93 | else: 94 | print 'error feature file doesnt exist'+feat_file_path 95 | 96 | 97 | return feat 98 | 99 | def _load_c3d_feat_file(self,vidID): 100 | feats_dir = 'vid-desc/vtt/features_c3d' 101 | feat_filename = vidID 102 | feat_file_path = os.path.join(feats_dir,feat_filename) 103 | 104 | if os.path.exists(feat_file_path): 105 | files = os.listdir(feat_file_path) 106 | files.sort() 107 | allftrs = np.zeros((len(files), 4101),dtype=np.float32) 108 | 109 | for j in range(0, len(files)): 110 | 111 | feat = np.fromfile(os.path.join(feat_file_path, files[j]),dtype=np.float32) 112 | allftrs[j,:] = feat 113 | allftrs = self.get_sub_frames(allftrs) 114 | 115 | return allftrs 116 | else: 117 | print 'error feature file doesnt exist'+feat_file_path 118 | sys.exit(0) 119 | 120 | 121 | def get_video_features(self, vidID): 122 | # hack to be fixed 123 | available_features = ['googlenet', 'resnet', 'c3d', 'resnet152', 'nasnetalarge', 'pnasnet5large', 'densenet152', 'polynet', 'senet154'] 124 | if self.video_feature in available_features: 125 | if self.signature == 'youtube2text' or self.signature == 'ysvd' or self.signature == 'vtt16' or self.signature == 'vtt17' or self.signature == 'trecvid': 126 | y = self._filter_feature(vidID) 127 | elif self.signature == 'lsmdc' or self.signature == 'lsmdc16' or self.signature == 'mpii' or self.signature == 'mvad' or self.signature == 'tacos': 128 | y = self._load_feat_file(vidID) #this is for large datasets, needs to be fixed with something better. Mpii might need this.. 
129 | # elif self.signature == 'vtt': 130 | # y = self._load_c3d_feat_file(vidID) 131 | else: 132 | raise NotImplementedError() 133 | elif self.video_feature == 'c3d_resnet': 134 | y = self._filter_c3d_resnet(vidID) 135 | else: 136 | raise NotImplementedError() 137 | return y 138 | 139 | def pad_frames(self, frames, limit, jpegs): 140 | # pad frames with 0, compatible with both conv and fully connected layers 141 | last_frame = frames[-1] 142 | if jpegs: 143 | frames_padded = frames + [last_frame]*(limit-len(frames)) 144 | else: 145 | padding = numpy.asarray([last_frame * 0.]*(limit-len(frames))) 146 | frames_padded = numpy.concatenate([frames, padding], axis=0) 147 | return frames_padded 148 | 149 | def extract_frames_equally_spaced(self, frames, how_many): 150 | # chunk frames into 'how_many' segments and use the first frame 151 | # from each segment 152 | n_frames = len(frames) 153 | splits = numpy.array_split(range(n_frames), self.K) 154 | idx_taken = [s[0] for s in splits] 155 | sub_frames = frames[idx_taken] 156 | return sub_frames 157 | 158 | def add_end_of_video_frame(self, frames): 159 | if len(frames.shape) == 4: 160 | # feat from conv layer 161 | _,a,b,c = frames.shape 162 | eos = numpy.zeros((1,a,b,c),dtype='float32') - 1. 163 | elif len(frames.shape) == 2: 164 | # feat from full connected layer 165 | _,b = frames.shape 166 | eos = numpy.zeros((1,b),dtype='float32') - 1. 167 | else: 168 | import pdb; pdb.set_trace() 169 | raise NotImplementedError() 170 | frames = numpy.concatenate([frames, eos], axis=0) 171 | return frames 172 | 173 | def get_sub_frames(self, frames, jpegs=False): 174 | # from all frames, take K of them, then add end of video frame 175 | # jpegs: to be compatible with visualizations 176 | if self.OutOf: 177 | raise NotImplementedError('OutOf has to be None') 178 | frames_ = frames[:self.OutOf] 179 | if len(frames_) < self.OutOf: 180 | frames_ = self.pad_frames(frames_, self.OutOf, jpegs) 181 | else: 182 | if len(frames) < self.K: 183 | #frames_ = self.add_end_of_video_frame(frames) 184 | 185 | frames_ = self.pad_frames(frames, self.K, jpegs) 186 | 187 | else: 188 | 189 | frames_ = self.extract_frames_equally_spaced(frames, self.K) 190 | #frames_ = self.add_end_of_video_frame(frames_) 191 | if jpegs: 192 | frames_ = numpy.asarray(frames_) 193 | return frames_ 194 | 195 | def prepare_data_for_blue(self, whichset): 196 | # assume one-to-one mapping between ids and features 197 | feats = [] 198 | feats_mask = [] 199 | if whichset == 'valid': 200 | ids = self.valid_ids 201 | elif whichset == 'test': 202 | ids = self.test_ids 203 | elif whichset == 'train': 204 | ids = self.train_ids 205 | elif whichset == 'blind': 206 | ids = self.btest_ids 207 | 208 | for i, vidID in enumerate(ids): 209 | feat = self.get_video_features(vidID) 210 | feats.append(feat) 211 | feat_mask = self.get_ctx_mask(feat) 212 | feats_mask.append(feat_mask) 213 | # print i, vidID 214 | return feats, feats_mask 215 | 216 | def get_ctx_mask(self, ctx): 217 | if ctx.ndim == 3: 218 | rval = (ctx[:,:,:self.ctx_dim].sum(axis=-1) != 0).astype('int32').astype('float32') 219 | elif ctx.ndim == 2: 220 | rval = (ctx[:,:self.ctx_dim].sum(axis=-1) != 0).astype('int32').astype('float32') 221 | elif ctx.ndim == 5 or ctx.ndim == 4: 222 | assert self.video_feature == 'oxfordnet_conv3_512' 223 | # in case of oxfordnet features 224 | # (m, 26, 512, 14, 14) 225 | rval = (ctx.sum(-1).sum(-1).sum(-1) != 0).astype('int32').astype('float32') 226 | else: 227 | import pdb; pdb.set_trace() 228 | raise 
NotImplementedError() 229 | 230 | return rval 231 | 232 | def load_feats(self,dataset_path): 233 | if self.video_feature=='c3d': 234 | if self.proc=='pca': 235 | self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEATS_c3d_'+self.proc+'.pkl')) 236 | elif self.proc=='pca512': 237 | self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEATS_c3d_'+self.proc+'.pkl')) 238 | elif self.proc=='pca_c3d': 239 | self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEATS_c3d_pca.pkl')) 240 | else: 241 | self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEATS_c3d.pkl')) 242 | 243 | elif self.video_feature=='c3d_resnet': 244 | if self.proc=='pca': 245 | self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEATS_c3d_'+self.proc+'.pkl')) 246 | self.FEAT2 = common.load_pkl(os.path.join(dataset_path , 'FEATS_resnet_'+self.proc+'.pkl')) 247 | elif self.proc=='pca512': 248 | self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEATS_c3d_'+self.proc+'.pkl')) 249 | self.FEAT2 = common.load_pkl(os.path.join(dataset_path ,'FEATS_resnet_'+self.proc+'.pkl')) 250 | elif self.proc=='pca_c3d': 251 | self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEATS_c3d_pca.pkl')) 252 | self.FEAT2 = common.load_pkl(os.path.join(dataset_path ,'FEATS_resnet_nostd.pkl')) 253 | else: 254 | self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEATS_c3d.pkl')) 255 | self.FEAT2 = common.load_pkl(os.path.join(dataset_path ,'FEATS_resnet.pkl')) 256 | 257 | elif self.video_feature == 'googlenet': 258 | self.FEAT = common.load_pkl(os.path.join(dataset_path, 'FEATS_googlenet.pkl')) 259 | elif self.video_feature == 'resnet': 260 | if self.proc=='pca': 261 | self.FEAT = common.load_pkl(os.path.join(dataset_path, 'FEATS_resnet_'+self.proc+'.pkl')) 262 | else: 263 | self.FEAT = common.load_pkl(os.path.join(dataset_path, 'FEATS_resnet.pkl')) 264 | elif self.video_feature == 'nasnetalarge': 265 | self.FEAT = common.load_pkl(os.path.join(dataset_path, 'FEATS_nasnetalarge.pkl')) 266 | elif self.video_feature == 'resnet152': 267 | self.FEAT = common.load_pkl(os.path.join(dataset_path, 'FEATS_resnet152.pkl')) 268 | elif self.video_feature == 'pnasnet5large': 269 | self.FEAT = common.load_pkl(os.path.join(dataset_path, 'FEATS_pnasnet5large.pkl')) 270 | elif self.video_feature == 'polynet': 271 | self.FEAT = common.load_pkl(os.path.join(dataset_path, 'FEATS_polynet.pkl')) 272 | elif self.video_feature == 'senet154': 273 | self.FEAT = common.load_pkl(os.path.join(dataset_path, 'FEATS_senet154.pkl')) 274 | else: 275 | self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEATS_'+self.proc+'.pkl')) 276 | return self 277 | 278 | def load_data(self): 279 | 280 | 281 | if self.signature == 'youtube2text' or self.signature == 'trecvid': 282 | print 'loading {} {} features'.format(self.signature, self.video_feature) 283 | if self.data_dir=='': 284 | dataset_path = common.get_rab_dataset_base_path()+'youtube2text/'+self.video_feature 285 | else: 286 | dataset_path = self.data_dir 287 | 288 | # dataset_path = common.get_rab_dataset_base_path() 289 | self.train = common.load_pkl(os.path.join(dataset_path ,'train.pkl')) 290 | self.valid = common.load_pkl(os.path.join(dataset_path ,'valid.pkl')) 291 | self.test = common.load_pkl(os.path.join(dataset_path ,'test.pkl')) 292 | self.CAP = common.load_pkl(os.path.join(dataset_path , 'CAP.pkl')) 293 | 294 | 295 | # self.FEAT = common.load_pkl(os.path.join(dataset_path , 'FEAT_key_vidID_value_features_'+self.proc+'.pkl')) 296 | self.load_feats(dataset_path) 297 | 298 | 
self.train_ids = list(set(self.train[i].split('_')[0] for i in range(len(self.train)))) 299 | self.valid_ids = list(set(self.valid[i].split('_')[0] for i in range(len(self.valid)))) 300 | self.test_ids = list(set(self.test[i].split('_')[0] for i in range(len(self.test)))) 301 | 302 | 303 | elif self.signature == 'lsmdc' or self.signature == 'lsmdc16' or self.signature == 'mvad' or self.signature == 'mpii' or self.signature == 'tacos': 304 | print 'loading {} {} features'.format(self.signature, self.video_feature) 305 | dataset_path = self.data_dir 306 | self.train = common.load_pkl(os.path.join(dataset_path, 'train.pkl')) 307 | self.valid = common.load_pkl(os.path.join(dataset_path, 'valid.pkl')) 308 | self.test = common.load_pkl(os.path.join(dataset_path, 'test.pkl')) 309 | self.CAP = common.load_pkl(os.path.join(dataset_path, 'CAP.pkl')) 310 | 311 | self.train_ids = self.train 312 | self.valid_ids = self.valid 313 | self.test_ids = self.test 314 | 315 | if self.signature == 'lsmdc16': 316 | self.btest = common.load_pkl(os.path.join(dataset_path, 'blindtest.pkl')) 317 | self.btest_ids = self.btest 318 | 319 | 320 | elif self.signature == 'ysvd': 321 | print 'loading ysvd %s features'%self.video_feature 322 | dataset_path = common.get_rab_dataset_base_path()+'ysvd/' 323 | 324 | self.all = common.load_pkl(os.path.join(dataset_path, 'all_vids.pkl')) 325 | self.CAP = common.load_pkl(os.path.join(dataset_path, 'CAP.pkl')) 326 | self.FEAT = common.load_pkl(os.path.join(dataset_path, 'FEAT_key_vidID_value_features.pkl')) 327 | 328 | self.train = self.all[0:500] 329 | self.valid = self.all[501:750] 330 | self.test = self.all[751:1000] 331 | 332 | self.train_ids = self.train 333 | self.valid_ids = self.valid 334 | self.test_ids = self.test 335 | 336 | elif self.signature == 'vtt16' or self.signature == 'vtt17': 337 | print 'loading {} {} features'.format(self.signature, self.video_feature) 338 | 339 | if self.data_dir=='': 340 | dataset_path = common.get_rab_dataset_base_path()+'vtt/'+self.video_feature 341 | else: 342 | dataset_path = self.data_dir 343 | 344 | self.train = common.load_pkl(os.path.join(dataset_path, 'train.pkl')) 345 | self.valid = common.load_pkl(os.path.join(dataset_path, 'valid.pkl')) 346 | self.test = common.load_pkl(os.path.join(dataset_path, 'test.pkl')) 347 | self.CAP = common.load_pkl(os.path.join(dataset_path, 'CAP.pkl')) 348 | 349 | 350 | self.load_feats(dataset_path) 351 | 352 | # Get list of just the videoID, instead of videoID_CapID. 
Use set to ignore duplicates, then recast to list 353 | self.train_ids = list(set(self.train[i].split('_')[0] for i in range(len(self.train)))) 354 | self.valid_ids = list(set(self.valid[i].split('_')[0] for i in range(len(self.valid)))) 355 | self.test_ids = list(set(self.test[i].split('_')[0] for i in range(len(self.test)))) 356 | 357 | self.test_ids = self.test_ids #only for testing 358 | 359 | else: 360 | raise NotImplementedError() 361 | 362 | self.worddict = common.load_pkl(os.path.join(dataset_path ,'worddict.pkl')) 363 | self.word_idict = dict() 364 | # wordict start with index 2 365 | for kk, vv in self.worddict.iteritems(): 366 | self.word_idict[vv] = kk 367 | self.word_idict[0] = '' 368 | self.word_idict[1] = 'UNK' 369 | 370 | if self.video_feature == 'googlenet': 371 | self.ctx_dim = 1024 372 | elif self.video_feature == 'resnet' or self.video_feature == 'resnet152': 373 | if self.proc=='nostd': 374 | self.ctx_dim = 2048 375 | elif self.proc=='pca': 376 | self.ctx_dim=1024 377 | elif self.video_feature == 'nasnetalarge': 378 | self.ctx_dim = 4032 379 | elif self.video_feature == 'pnasnet5large': 380 | self.ctx_dim = 4320 381 | elif self.video_feature == 'polynet': 382 | self.ctx_dim = 2048 383 | elif self.video_feature == 'senet154': 384 | self.ctx_dim = 2048 385 | elif self.video_feature == 'densenet121': 386 | raise NotImplementedError() 387 | elif self.video_feature == 'c3d': 388 | if self.proc=='nostd': 389 | self.ctx_dim = 4101 390 | elif self.proc=='pca': 391 | self.ctx_dim=1024 392 | elif self.video_feature == 'c3d_resnet': 393 | if self.proc=='nostd': 394 | self.ctx_dim = 6149 395 | elif self.proc=='pca': 396 | self.ctx_dim=2048 397 | elif self.proc=='pca512': 398 | self.ctx_dim=1024 399 | elif self.proc=='pca_c3d': 400 | self.ctx_dim=3072 401 | else: 402 | raise NotImplementedError() 403 | 404 | print "ctx_dim: "+str(self.ctx_dim) 405 | self.kf_train = common.generate_minibatch_idx( 406 | len(self.train), self.mb_size_train) 407 | self.kf_valid = common.generate_minibatch_idx( 408 | len(self.valid), self.mb_size_test) 409 | self.kf_test = common.generate_minibatch_idx( 410 | len(self.test), self.mb_size_test) 411 | 412 | if self.dec == 'multi-stdist': 413 | self.skip_vectors = common.load_pkl(os.path.join(dataset_path,'skip_vectors.pkl')) 414 | 415 | 416 | def prepare_data(engine, IDs): 417 | # print "Preparing engine "+engine.dec 418 | seqs = [] 419 | z_seqs = [] 420 | feat_list = [] 421 | 422 | def get_words(vidID, capID): 423 | rval = None 424 | if engine.signature == 'youtube2text' or engine.signature == 'vtt16' or engine.signature == 'vtt17' or engine.signature == 'trecvid': 425 | caps = engine.CAP[vidID] 426 | for cap in caps: 427 | if cap['cap_id'] == capID: 428 | rval = cap['tokenized'].split(' ') 429 | break 430 | elif engine.signature == 'lsmdc' or engine.signature == 'lsmdc16': 431 | cap = engine.CAP[vidID][0] 432 | rval = cap['tokenized'].split() 433 | elif engine.signature == 'mvad' or engine.signature == 'tacos': 434 | cap = engine.CAP[vidID][0] 435 | rval = cap['tokenized'].split() 436 | elif engine.signature == 'mpii': 437 | cap = engine.CAP[vidID][0] 438 | rval = cap['tokenized'].split() 439 | elif engine.signature == 'ysvd': 440 | cap = engine.CAP[vidID][capID] 441 | rval = cap['tokenized'].split() 442 | 443 | assert rval is not None 444 | return rval 445 | 446 | def get_z_seq(): 447 | caps = engine.CAP[vidID] 448 | num_caps = len(caps) 449 | #print vidID+" "+str(num_caps) 450 | 451 | if engine.dec == 'multi-stdist': #'stdist' 452 | 453 | # 
common.dump_pkl(caps,'/media/onina/SSD/projects/skip-thoughts/caps') 454 | 455 | if not engine.cap_distances.has_key(vidID): 456 | 457 | captions = [ caps[0]['caption'] for x in range(num_caps)] #initialized all with the firs caption 458 | for i in range(0,num_caps): 459 | cap = caps[i] 460 | 461 | if engine.signature != 'vtt16' or engine.signature != 'vtt17': 462 | id = int(cap['cap_id']) 463 | 464 | caption = cap['caption'] 465 | # print str(id)+" "+caption 466 | # print len(captions) 467 | # print vidID 468 | udata=caption.decode("utf-8") 469 | 470 | # if id>=num_caps: 471 | # continue 472 | captions[id] = udata.encode("ascii","ignore") 473 | 474 | if captions[id].isspace(): 475 | captions[id] = captions[0] 476 | else: 477 | captions[i] = cap['tokenized'] 478 | # print captions[id] 479 | 480 | # common.dump_pkl(captions,'captions') 481 | # vectors = skipthoughts.encode(engine.st_model,captions) #refactoring this line 482 | vectors = engine.skip_vectors[vidID] 483 | caps_dist = spatial.distance.cdist(vectors, vectors, 'cosine') 484 | engine.cap_distances[vidID] = caps_dist 485 | 486 | caps_dist = engine.cap_distances[vidID] 487 | query_id = int(capID) 488 | js =range(0, query_id) + range(query_id+1,num_caps) 489 | 490 | 491 | if len(js)>0 and engine.signature != 'mvad': 492 | # print js,query_id 493 | most_distant = np.argmax(caps_dist[query_id,js]) 494 | else: 495 | most_distant = 0 496 | 497 | z_words = get_words(vidID, str(most_distant)) 498 | z_seq = [engine.worddict[w] if engine.worddict[w] < engine.n_words else 1 for w in z_words] 499 | 500 | 501 | elif engine.dec == 'generative': 502 | z_words = get_words(vidID, str(1)) 503 | z_words = [word for word in z_words if word not in stopwords.words('english')] 504 | z_seq = [engine.worddict[w] if engine.worddict[w] < engine.n_words else 1 for w in z_words] 505 | 506 | elif engine.dec == 'generative.2': 507 | 508 | z_words = get_words(vidID, str(1)) 509 | z_words = [word for word in z_words if word not in stopwords.words('english')] 510 | # print z_words 511 | 512 | def get_hypernyms(z_words): 513 | 514 | from nltk.corpus import wordnet 515 | new_z_words = [] 516 | for word in z_words: 517 | hypernyms = wordnet.synsets(word) 518 | if len(hypernyms) > 1 : 519 | h = hypernyms[0].hypernyms() 520 | if len(h) >0: 521 | nwords = h[0].lemma_names() 522 | nword = str(nwords[0]) 523 | if '_' not in nword and '-' not in nword and engine.worddict.has_key(nword): 524 | new_z_words.append(nword) 525 | # print word+' replaced with '+ nword 526 | else: 527 | new_z_words.append(word) 528 | else: 529 | new_z_words.append(word) 530 | else: 531 | new_z_words.append(word) 532 | 533 | return new_z_words 534 | 535 | import random 536 | if random.randint(0,1): #only change to hypernyms every .5 percent the time 537 | z_words = get_hypernyms(z_words) 538 | # print z_words 539 | 540 | z_seq = [engine.worddict[w] if engine.worddict[w] < engine.n_words else 1 for w in z_words] 541 | 542 | 543 | # print new_z_words 544 | 545 | return z_seq 546 | 547 | def clean_sequences(seqs,z_seqs,feat_list): 548 | 549 | if engine.dec=="standard": 550 | 551 | lengths = [len(s) for s in seqs] 552 | if engine.maxlen != None: 553 | new_seqs = [] 554 | new_feat_list = [] 555 | new_lengths = [] 556 | new_caps = [] 557 | for l, s, y, c in zip(lengths, seqs, feat_list, IDs): 558 | # sequences that have length >= maxlen will be thrown away 559 | if l < engine.maxlen: 560 | new_seqs.append(s) 561 | new_feat_list.append(y) 562 | new_lengths.append(l) 563 | new_caps.append(c) 564 | lengths = 
new_lengths 565 | feat_list = new_feat_list 566 | seqs = new_seqs 567 | 568 | return seqs,None,feat_list,lengths 569 | 570 | else: 571 | lengths = [len(s) for s in seqs] 572 | z_lengths = [len(s) for s in z_seqs] 573 | if engine.maxlen != None: 574 | new_seqs = [] 575 | new_zseqs = [] 576 | new_feat_list = [] 577 | new_lengths = [] 578 | new_caps = [] 579 | new_zlengths = [] 580 | for l,z_l, s, y, c in zip(lengths,z_lengths, seqs, feat_list, IDs): 581 | # sequences that have length >= maxlen will be thrown away 582 | if l < engine.maxlen and z_l < engine.maxlen : 583 | new_seqs.append(s) 584 | new_zseqs.append(s) 585 | new_feat_list.append(y) 586 | new_lengths.append(l) 587 | new_caps.append(c) 588 | lengths = new_lengths 589 | feat_list = new_feat_list 590 | seqs = new_seqs 591 | z_seqs = new_zseqs 592 | 593 | return seqs,z_seqs,feat_list,lengths 594 | 595 | for i, ID in enumerate(IDs): 596 | #print 'processed %d/%d caps'%(i,len(IDs)) 597 | # print ID 598 | if engine.signature == 'youtube2text' or engine.signature == 'vtt16' or engine.signature == 'vtt17' or engine.signature == 'trecvid': 599 | # load GNet feature 600 | vidID, capID = ID.split('_') 601 | elif engine.signature == 'tacos': 602 | vidID = ID 603 | capID = 0 604 | elif engine.signature == 'lsmdc' or engine.signature == 'lsmdc16': 605 | # t = ID.split('_') 606 | # vidID = '_'.join(t[:-1]) 607 | # capID = t[-1] 608 | vidID = ID 609 | capID = 1 610 | elif engine.signature == 'mvad': 611 | # t = ID.split('_') 612 | # vidID = '_'.join(t[:-1]) 613 | # capID = t[-1] 614 | vidID = ID 615 | capID = 1 616 | elif engine.signature == 'ysvd': 617 | # t = ID.split('_') 618 | # vidID = '_'.join(t[:-1]) 619 | # capID = t[-1] 620 | vidID = ID 621 | capID = 0 622 | elif engine.signature == 'mpii': 623 | vidID = ID 624 | capID = 1 625 | else: 626 | raise NotImplementedError() 627 | 628 | feat = engine.get_video_features(vidID) 629 | 630 | # if len(feat[0])!= engine.ctx_dim: 631 | # print 'dim error on '+vidID 632 | # sys.exit(0) 633 | 634 | feat_list.append(feat) 635 | words = get_words(vidID, capID) 636 | # print words 637 | seqs.append([engine.worddict[w] if engine.worddict[w] < engine.n_words else 1 for w in words]) 638 | 639 | # print engine.dec 640 | if engine.dec != "standard": 641 | z_seq = get_z_seq() 642 | z_seqs.append(z_seq) 643 | 644 | 645 | seqs,z_seqs,feat_list,lengths = clean_sequences(seqs,z_seqs,feat_list) 646 | 647 | if len(lengths) < 1: 648 | return None, None, None, None 649 | 650 | y = numpy.asarray(feat_list) 651 | # print len(y[1,1]) 652 | y_mask = engine.get_ctx_mask(y) 653 | 654 | n_samples = len(seqs) 655 | maxlen = numpy.max(lengths)+1 656 | 657 | x = numpy.zeros((maxlen, n_samples)).astype('int64') 658 | x_mask = numpy.zeros((maxlen, n_samples)).astype('float32') 659 | for idx, s in enumerate(seqs): 660 | x[:lengths[idx],idx] = s 661 | x_mask[:lengths[idx]+1,idx] = 1. 662 | 663 | if engine.dec=="standard": 664 | return x, x_mask, y, y_mask 665 | else: 666 | z = numpy.zeros((maxlen, n_samples)).astype('int64') #This is the other label 667 | z_mask = numpy.zeros((maxlen, n_samples)).astype('float32') 668 | for idx, s in enumerate(z_seqs): 669 | z[:lengths[idx],idx] = s 670 | z_mask[:lengths[idx]+1,idx] = 1. 
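# Illustrative sketch (not part of the pipeline): the layout produced above.
# Column j of the (maxlen, n_samples) matrix holds caption j padded with
# zeros, and its mask stays 1 for one extra step so the model sees an
# explicit end-of-sequence position.
import numpy as np

def pad_sequences(seqs):
    lengths = [len(s) for s in seqs]
    maxlen = max(lengths) + 1
    x = np.zeros((maxlen, len(seqs)), dtype='int64')
    x_mask = np.zeros((maxlen, len(seqs)), dtype='float32')
    for j, s in enumerate(seqs):
        x[:lengths[j], j] = s
        x_mask[:lengths[j] + 1, j] = 1.
    return x, x_mask

# pad_sequences([[4, 9, 7], [5, 2]]) returns a 4x2 matrix whose second column
# is [5, 2, 0, 0] with mask [1, 1, 1, 0].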
671 | 672 | return x, x_mask, y, y_mask,z,z_mask 673 | 674 | 675 | def test_data_engine(): 676 | video_feature = 'googlenet' 677 | out_of = None 678 | maxlen = 100 679 | mb_size_train = 64 680 | mb_size_test = 128 681 | maxlen = 50 682 | n_words = 30000 # 25770 683 | signature = 'youtube2text' #'youtube2text' 684 | engine = Movie2Caption('attention', signature, video_feature, 685 | mb_size_train, mb_size_test, maxlen, 686 | n_words,'standard','nostd', 687 | n_frames=26, 688 | outof=out_of) 689 | i = 0 690 | t = time.time() 691 | for idx in engine.kf_train: 692 | t0 = time.time() 693 | i += 1 694 | ids = [engine.train[index] for index in idx] 695 | x, mask, ctx, ctx_mask = prepare_data(engine, ids) 696 | print 'seen %d minibatches, used time %.2f '%(i,time.time()-t0) 697 | if i == 10: 698 | break 699 | 700 | print 'used time %.2f'%(time.time()-t) 701 | 702 | 703 | if __name__ == '__main__': 704 | test_data_engine() 705 | 706 | 707 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup as Soup, SoupStrainer 2 | import urllib 3 | import os 4 | import shutil 5 | import json 6 | import argparse 7 | import sys 8 | from multiprocessing import Pool 9 | 10 | 11 | def download_mvad(command): 12 | os.system(command) 13 | 14 | 15 | def video_mvad(args): 16 | dst_dir = args.dst_dir 17 | json_dir = args.json_path 18 | start = int(args.start) 19 | end = int(args.end) 20 | 21 | base_url = 'http://courvila_contact:59db938f6d@lisaweb.iro.umontreal.ca/transfert/lisa/users/courvila' 22 | 23 | with open(os.path.join(json_dir, 'TrainList.txt'), 'r') as f: 24 | train_list = [i.replace('\n', '') for i in f] 25 | with open(os.path.join(json_dir, 'TestList.txt'), 'r') as f: 26 | test_list = [i.replace('\n', '') for i in f] 27 | with open(os.path.join(json_dir, 'ValidList.txt'), 'r') as f: 28 | valid_list = [i.replace('\n', '') for i in f] 29 | 30 | big_list = train_list + test_list + valid_list 31 | big_list = big_list[start:end] 32 | print "There are {} videos to get.".format(len(big_list)) 33 | 34 | if not os.path.exists(dst_dir): 35 | os.mkdir(dst_dir) 36 | 37 | present_vids = os.listdir(dst_dir) 38 | print "There are currently {} videos in dst_dir.".format(len(present_vids)) 39 | 40 | count = 0 41 | 42 | if int(args.filter): 43 | print "FILTER: ON" 44 | filter_dir = os.path.join(dst_dir, '../trash/') 45 | if not os.path.isdir(filter_dir): 46 | os.makedirs(filter_dir) 47 | big_list_names = [i.split('/')[-1] for i in big_list] 48 | vids_to_move = [] 49 | for i in present_vids: 50 | if i not in big_list_names: 51 | vids_to_move.append(i) 52 | 53 | for v in vids_to_move: 54 | print "Move {} -> {}".format(v, filter_dir) 55 | shutil.move(os.path.join(dst_dir, v), os.path.join(filter_dir, v)) 56 | 57 | present_vids = os.listdir(dst_dir) 58 | print "There are now {} videos in dst_dir.".format(len(present_vids)) 59 | 60 | command_list = [] 61 | for i in big_list: 62 | video_name = i.split('/')[-1] 63 | if video_name not in present_vids: 64 | count += 1 65 | dst_path = os.path.join(dst_dir, video_name) 66 | #print video_name 67 | command_list.append('wget -O {} {}'.format(dst_path, base_url + i)) 68 | 69 | threadPool = Pool() 70 | 71 | try: 72 | threadPool.map(download_mvad, command_list) 73 | threadPool.close() 74 | threadPool.join() 75 | except Exception: 76 | threadPool.close() 77 | threadPool.join() 78 | raise Exception 79 | 80 | 81 | def 
video_mpii(video_dir,video_name,video_clip): 82 | 83 | 84 | # url='http://courvila_contact:59db938f6d@lisaweb.iro.umontreal.ca/transfert/lisa/users/courvila/data/lisatmp2/torabi/DVDtranscription/'+video_name+'/video/'+video_clip 85 | url='http://97H5:thoNohyee7@datasets.d2.mpi-inf.mpg.de/movieDescription/protected/avi/'+video_name+'/'+video_clip 86 | 87 | 88 | 89 | u2 = urllib.urlopen(url) 90 | video_dir_dst = os.path.join(video_dir,video_name) 91 | if not os.path.exists(video_dir_dst): 92 | os.mkdir(video_dir_dst) 93 | 94 | f = open(video_dir_dst+'/'+video_clip, 'wb') 95 | meta = u2.info() 96 | file_size = int(meta.getheaders("Content-Length")[0]) 97 | print "Downloading: %s Bytes: %s" % (video_name, file_size) 98 | 99 | file_size_dl = 0 100 | block_sz = 8192 101 | while True: 102 | buffer = u2.read(block_sz) 103 | if not buffer: 104 | break 105 | 106 | file_size_dl += len(buffer) 107 | f.write(buffer) 108 | status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size) 109 | status = status + chr(8)*(len(status)+1) 110 | print status, 111 | 112 | f.close() 113 | 114 | 115 | def download_video((video_id, video_url)): 116 | dst_dir = args.dst_dir 117 | mp4_dst_path = "{}/{}.mp4".format(dst_dir, video_id) 118 | webm_dst_path = "{}/{}.webm".format(dst_dir, video_id) 119 | mkv_dst_path = "{}/{}.mkv".format(dst_dir, video_id) 120 | 121 | # Don't know the extension beforehand so check all of them 122 | if os.path.isfile(mp4_dst_path) or os.path.isfile(webm_dst_path) or os.path.isfile(mkv_dst_path): 123 | print 'File already downloaded!' 124 | return 125 | 126 | dst_path = "\'{}/{}.%(ext)s\'".format(dst_dir, video_id) 127 | cmd = "youtube-dl " + video_url + " -o {}".format(dst_path) 128 | os.system(cmd) 129 | 130 | 131 | def video_vtt(args): 132 | 133 | def fill_info_list(videoID_to_info_tuple_list): 134 | if args.json_path.endswith('.json'): 135 | # Load user-specified json file 136 | json_file = open(args.json_path) 137 | else: 138 | json_file = open(os.path.join(args.json_path, 'videodatainfo_2017.json')) 139 | 140 | json_str = json_file.read() 141 | json_data = json.loads(json_str) 142 | 143 | start = int(args.start) 144 | end = int(args.end) # Max vids to do 145 | 146 | for vid_meta in json_data['videos'][start:end]: 147 | video_id = vid_meta['video_id'] 148 | video_url = vid_meta['url'] 149 | 150 | videoID_to_info_tuple_list.append((video_id, video_url)) 151 | 152 | dst_dir = args.dst_dir 153 | 154 | videoID_to_info_tuple_list = [] 155 | 156 | fill_info_list(videoID_to_info_tuple_list) 157 | 158 | if not os.path.isdir(dst_dir): 159 | os.mkdir(dst_dir) 160 | 161 | threadPool = Pool(1) # Bottlenecked by network. Change to blank if otherwise 162 | threadPool.map(download_video, videoID_to_info_tuple_list) 163 | threadPool.close() 164 | threadPool.join() 165 | 166 | 167 | def download_vine(command): 168 | print command 169 | os.system(command) 170 | 171 | 172 | def video_trecvid(args): 173 | def fill_info_list(command_list): 174 | f = open(os.path.join(args.json_path, 'vines.url.testingSet')) 175 | 176 | start = int(args.start) 177 | end = int(args.end) # Max vids to do 178 | dst_dir = args.dst_dir 179 | 180 | f = [l for l in f][start:end] 181 | 182 | for line in f: 183 | id, url = line.replace('\n', '').split(' ') 184 | dst_path = os.path.join(dst_dir, id + '.mp4') 185 | if not os.path.isfile(dst_path): 186 | command_list.append('wget -O {} {}'.format(dst_path, url)) 187 | else: 188 | print "File already found! 
{}".format(dst_path) 189 | dst_dir = args.dst_dir 190 | 191 | command_list = [] 192 | 193 | fill_info_list(command_list) 194 | 195 | if not os.path.isdir(dst_dir): 196 | os.mkdir(dst_dir) 197 | 198 | threadPool = Pool() 199 | threadPool.map(download_vine, command_list) 200 | threadPool.close() 201 | threadPool.join() 202 | 203 | 204 | if __name__== '__main__': 205 | arg_parser = argparse.ArgumentParser() 206 | arg_parser.add_argument('dst_dir',help = 'directory where to store videos') 207 | arg_parser.add_argument('json_path', help='directory where json file is stored') 208 | arg_parser.add_argument('start',help = 'start video index') 209 | arg_parser.add_argument('end',help = 'end video index') 210 | arg_parser.add_argument('dataset', help = 'Which dataset to download. ' 211 | 'Options: vtt | trecvid | mvad') 212 | arg_parser.add_argument('--filter', help = 'Special mode which will filter out videos present in dst_dir but not in json file to dst_dir/../trash' 213 | 'Options: 0 or 1', default=0) 214 | args = arg_parser.parse_args() 215 | 216 | if not len(sys.argv) > 1: 217 | print arg_parser.print_help() 218 | sys.exit(0) 219 | 220 | try: 221 | if args.dataset == 'vtt': 222 | video_vtt(args) 223 | elif args.dataset == 'trecvid': 224 | video_trecvid(args) 225 | elif args.dataset == 'mvad': 226 | video_mvad(args) 227 | except KeyboardInterrupt: 228 | print 'Interrupted' 229 | try: 230 | sys.exit(0) 231 | except SystemExit: 232 | os._exit(0) 233 | 234 | -------------------------------------------------------------------------------- /hyperband.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(1,'jobman') 3 | sys.path.insert(1,'coco-caption') 4 | 5 | import os 6 | import random 7 | import copy 8 | import subprocess 9 | import numpy as np 10 | 11 | from math import * 12 | from numpy import argsort 13 | from multiprocessing import Pool 14 | 15 | 16 | def args_as_typed(args): 17 | result = "" 18 | for key in args: 19 | result += key 20 | result += "=" 21 | result += str(args[key]) 22 | result += " " 23 | 24 | return result 25 | 26 | 27 | def get_random_hyperparameter_configuration(): 28 | hp_dict = {'dim_word': int(random.uniform(100, 1000)), 29 | 'dim': int(random.uniform(100, 5000)), 30 | 'encoder_dim': int(random.uniform(100, 900)), 31 | 'cost_type': np.random.choice(['v1', 'v3', 'v4', 'v5', 'v6'])} 32 | 33 | return hp_dict 34 | 35 | 36 | def run_then_return_val_loss(args, num_iters, hyperparameters, gpu_id): 37 | # -7: BLEU1 38 | # -6: BLEU2 39 | # -5: BLEU3 40 | # -4: BLEU4 41 | # -3: Meteor 42 | # -2: Rouge 43 | # -1: Cider 44 | colnum = -4 45 | 46 | # Parse through arguments and replace as necessary 47 | model = args['model'].replace('\'', '') 48 | 49 | # Do save_model_dir and logging for this run 50 | save_model_key = model + '.save_model_dir' 51 | save_model_dir = args[save_model_key].replace('\'', '') 52 | 53 | run_name = model + '_' 54 | run_name += 'HYPERBAND_{}-iters-{}'\ 55 | .format('_'.join(['{}-{}'.format(k, hyperparameters[k]) for k in hyperparameters]), num_iters) 56 | 57 | logging_dir = os.path.join(save_model_dir, 'logs', run_name) 58 | if not os.path.isdir(logging_dir): 59 | os.makedirs(logging_dir) 60 | 61 | save_model_dir = os.path.join(save_model_dir, run_name) 62 | if not os.path.isdir(save_model_dir): 63 | os.makedirs(save_model_dir) 64 | 65 | args[save_model_key] = '\'' + save_model_dir + '\'' 66 | 67 | # Do Epochs 68 | num_epochs_key = model + '.max_epochs' 69 | args[num_epochs_key] = num_iters 70 | 
71 | # Set hyper-parameters 72 | for k in hyperparameters: 73 | args[model + '.' + k] = hyperparameters[k] 74 | 75 | theano_flag = "THEANO_FLAGS=\'device=gpu{}\'".format(gpu_id) 76 | # "/dev/null 2>&1" 77 | command = "{} {} {} > {} 2>&1".format(theano_flag, "python train_model.py", args_as_typed(args), os.path.join(logging_dir, 'record.txt')) 78 | print " ----- \n{}".format(command) 79 | 80 | os.system(command) 81 | 82 | print " %%%%% Job finished! \n{}".format(args_as_typed(args)) 83 | train_loss_path = os.path.join(save_model_dir, 'train_valid_test.txt') 84 | if os.path.isfile(train_loss_path): 85 | train_loss_file = open(train_loss_path) 86 | lines = [i.replace('\n', '').split(' ') for i in train_loss_file] 87 | return float(lines[-1][colnum]) 88 | else: 89 | print "Validation results were not found for this run! validFreq value must be lowered, or the training crashed." 90 | return 0.000 91 | 92 | 93 | def HYPERBAND(args): 94 | """ 95 | Adapted from: 96 | https://people.eecs.berkeley.edu/~kjamieson/hyperband.html 97 | 98 | Performs HYPERBAND across available GPUs using Theano flags. 99 | This version uses BLEU4 as the score. 100 | :param args: 101 | :return: 102 | """ 103 | max_iter = 81 # maximum iterations/epochs per configuration 104 | eta = 3 # defines downsampling rate (default=3) 105 | logeta = lambda x: log(x) / log(eta) 106 | s_max = int(logeta(max_iter)) # number of unique executions of Successive Halving (minus one) 107 | B = (s_max + 1) * max_iter # total number of iterations (without reuse) per execution of Succesive Halving (n,r) 108 | 109 | # Modify this for your needs 110 | models_per_gpu = 2 111 | avail_gpus = [0, 1] 112 | #avail_gpus = range(num_gpu) 113 | 114 | num_gpu = len(avail_gpus) 115 | 116 | #### Begin Finite Horizon Hyperband outlerloop. Repeat indefinetely. 
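# Illustrative sketch (not part of the original file): a standalone view of the
# standard Hyperband bracket schedule for max_iter=81, eta=3, which is what the
# loop below walks through (before its "+3 epochs" and "skip r_i > 60" tweaks).
# Each bracket s starts n random configurations at r epochs and keeps the best
# 1/eta of them every successive-halving round.
from math import ceil, log

def hyperband_schedule(max_iter=81, eta=3):
    s_max = int(round(log(max_iter) / log(eta)))   # rounded to avoid float truncation
    B = (s_max + 1) * max_iter
    schedule = []
    for s in reversed(range(s_max + 1)):
        n = int(ceil(float(B) / max_iter / (s + 1) * eta ** s))  # initial configs
        r = max_iter // (eta ** s)                               # initial epochs
        rounds = [(n // (eta ** i), r * (eta ** i)) for i in range(s + 1)]
        schedule.append((s, rounds))
    return schedule

# hyperband_schedule()[0] == (4, [(81, 1), (27, 3), (9, 9), (3, 27), (1, 81)])
# i.e. the most aggressive bracket tries 81 configurations for 1 epoch first.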
117 | for s in reversed(range(s_max + 1)): 118 | n = int(ceil(B / max_iter / (s + 1) * eta ** s)) # initial number of configurations 119 | r = max_iter * eta ** (-s) # initial number of iterations to run configurations for 120 | 121 | #### Begin Finite Horizon Successive Halving with (n,r) 122 | T = [get_random_hyperparameter_configuration() for _ in range(n)] 123 | 124 | for i in range(s + 1): 125 | val_losses = [] 126 | 127 | # Run each of the n_i configs for r_i iterations and keep best n_i/eta 128 | n_i = n * eta ** (-i) 129 | r_i = int(floor(int(r * eta ** (i)))) 130 | r_i += 3 # Add 3 iterations since only see results after 4-8 epochs 131 | if r_i > 60: 132 | continue 133 | 134 | print ' ---- \nAt s: {}, i: {}, r_i: {}, T is: {}'.format(s, i, r_i, T) 135 | #val_losses = [run_then_return_val_loss(args=copy.deepcopy(args), num_iters=r_i, hyperparameters=t) for t in T] 136 | # First figure out what runs must be done 137 | runs = [(copy.deepcopy(args), r_i, t) for t in T] 138 | 139 | # Now tag runs with a GPU id and add to pending jobs, until no more runs 140 | while len(runs) > 0: 141 | gpuPool = Pool(num_gpu * models_per_gpu) 142 | gpu_subprocess_params_list = [] 143 | 144 | for gpu_id in avail_gpus: 145 | # First build the params by tagging on correct gpu_id 146 | model_params_per_gpu = [runs.pop() + (gpu_id,) 147 | for i in range(models_per_gpu) if len(runs) != 0] 148 | # Use params to build list of async functions on new threads 149 | model_params_per_gpu = [gpuPool.apply_async(run_then_return_val_loss, i) 150 | for i in model_params_per_gpu] 151 | 152 | gpu_subprocess_params_list.extend(model_params_per_gpu) 153 | 154 | # Execute all pending jobs, getting results as jobs finish 155 | val_losses = map(lambda x: x.get(), gpu_subprocess_params_list) 156 | gpuPool.close() 157 | gpuPool.join() 158 | 159 | print 'val_losses was: {}'.format(val_losses) 160 | T = [T[i] for i in argsort(val_losses)[0:int(n_i / eta)]] 161 | 162 | 163 | #### End Finite Horizon Successive Halving with (n,r) 164 | 165 | if __name__ == '__main__': 166 | args = {} 167 | try: 168 | for arg in sys.argv[1:]: 169 | k, v = arg.split('=') 170 | args[k] = v 171 | except: 172 | print 'args must be like a=X b.c=X' 173 | exit(1) 174 | 175 | HYPERBAND(args) 176 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import argparse, os, pdb, sys, time 2 | import numpy 3 | import cPickle as pkl 4 | import copy 5 | import glob 6 | import subprocess 7 | from multiprocessing import Process, Queue, Manager 8 | from collections import OrderedDict 9 | 10 | import data_engine 11 | from cocoeval import COCOScorer 12 | import common 13 | 14 | MAXLEN = 50 15 | 16 | 17 | def gen_model(queue, rqueue, pid, model, options, beam, 18 | model_params, shared_params): 19 | import theano 20 | from theano import tensor 21 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 22 | 23 | trng = RandomStreams(1234, use_cuda=False) 24 | # this makes sure it allocates on CPU 25 | use_noise = theano.tensor._shared(numpy.asarray(numpy.float32(0.)), 26 | name='use_noise') 27 | 28 | params = model.init_params(options) 29 | for kk, vv in params.iteritems(): 30 | if kk not in model_params: 31 | raise Exception('%s is not in the archive' % kk) 32 | assert params[kk].shape == model_params[kk].shape 33 | params[kk] = model_params[kk] 34 | if params[kk].shape == (): 35 | # theano.tensor._shared only takes ndarray 36 | # 
thus, converting numpy.float32 to numpy.adarray first 37 | params[kk] = numpy.asarray(params[kk]) 38 | tparams = model.init_tparams(params, force_cpu=True) 39 | mode = theano.compile.get_default_mode().excluding('gpu') 40 | f_init, f_next = model.build_sampler(tparams, options, use_noise, trng, mode=mode) 41 | 42 | curridx = shared_params['id'] 43 | 44 | def _gencap(ctx, ctx_mask): 45 | sample, score, next_state, next_memory = model.gen_sample( 46 | tparams, f_init, f_next, ctx, ctx_mask, 47 | options, 48 | trng=trng, k=k, maxlen=MAXLEN, stochastic=False) 49 | 50 | sidx = numpy.argmin(score) 51 | return sample[sidx], next_state, next_memory 52 | 53 | while True: 54 | req = queue.get() 55 | if req == None: 56 | break 57 | idx, context, context_mask = req[0], req[1], req[2] 58 | if curridx < shared_params['id']: 59 | print 'Updating parameters...' 60 | for kk in shared_params.keys(): 61 | if kk in tparams: 62 | tparams[kk].set_value(shared_params[kk]) 63 | curridx = shared_params['id'] 64 | 65 | print pid, '-', idx 66 | seq, next_state, next_memory = _gencap(context, context_mask) 67 | 68 | rqueue.put((idx, seq, next_state, next_memory)) 69 | 70 | return 71 | 72 | 73 | manager = Manager() 74 | 75 | 76 | def update_params(shared_params, model_params): 77 | for kk, vv in model_params.iteritems(): 78 | shared_params[kk] = vv 79 | shared_params['id'] = shared_params['id'] + 1 80 | 81 | 82 | def build_sample_pairs(samples, vidIDs): 83 | D = OrderedDict() 84 | for sample, vidID in zip(samples, vidIDs): 85 | D[vidID] = [{'image_id': vidID, 'caption': sample}] 86 | return D 87 | 88 | def save_test_samples_youtube2text(samples_test, engine): 89 | 90 | out_dir = 'predictions/' + engine.signature + '_' + engine.video_feature + '_' + engine.model_type + '/' 91 | 92 | if not os.path.exists('predictions/'): 93 | os.mkdir('predictions/') 94 | if not os.path.exists(out_dir): 95 | os.mkdir(out_dir) 96 | 97 | f = open(out_dir + 'samplestest.csv', 'wr') 98 | 99 | gts_test = OrderedDict() 100 | 101 | results = OrderedDict() 102 | results['version'] = "1.2" 103 | D = None 104 | 105 | if engine.signature == 'youtube2text': 106 | import cPickle 107 | d = open(os.path.join(engine.data_dir,'dict_youtube_mapping.pkl'), 'rb') 108 | D = cPickle.load(d) 109 | D = dict((y, x) for x, y in D.iteritems()) 110 | 111 | samples = [] 112 | for vidID in sorted(engine.test_ids): 113 | gts_test[vidID] = engine.CAP[vidID] 114 | # print samples_test[vidID] 115 | sample = OrderedDict() 116 | sample['video_id'] = vidID 117 | sample['caption'] = samples_test[vidID][0]['caption'] 118 | samples.append(sample) 119 | 120 | if engine.signature == 'youtube2text': 121 | f.write(D[vidID] + ',' + samples_test[vidID][0]['caption'] + ',' + gts_test[vidID][0]['caption'] + '\n') 122 | # elif engine.signature == 'trecvid': 123 | # f.write(vidID + ' ' + samples_test[vidID][0]['caption'] + '\n') 124 | else: 125 | f.write(vidID + ',' + samples_test[vidID][0]['caption'] + ',' + gts_test[vidID][0]['caption'] + '\n') 126 | 127 | f.close() 128 | 129 | results['result'] = samples 130 | results['external_data'] = {'used': 'true', 'details': 'Resnet trained on Imagenet.'} 131 | 132 | import json 133 | with open(out_dir + 'prediction.json', 'w') as outfile: 134 | json.dump(results, outfile, indent=4) 135 | 136 | 137 | def save_test_samples_acm_trecvid_y2t(samples_test, engine): # for acm/trecvid/y2t challenge 138 | 139 | out_dir = 'predictions/' + engine.signature + '_' + engine.video_feature + '_' + engine.model_type + '/' 140 | 141 | if not 
os.path.exists('predictions/'): 142 | os.mkdir('predictions/') 143 | if not os.path.exists(out_dir): 144 | os.mkdir(out_dir) 145 | 146 | if engine.signature == 'trecvid': 147 | f = open(out_dir + 'trecvid.txt', 'wr') 148 | else: 149 | f = open(out_dir + 'samplestest.csv', 'wr') 150 | 151 | gts_test = OrderedDict() 152 | 153 | results = OrderedDict() 154 | results['version'] = "1.2" 155 | # D = None 156 | # if engine.signature == 'youtube2text': 157 | # import cPickle 158 | # d = open('data/youtube2text_iccv15/original/dict_youtube_mapping.pkl', 'rb') 159 | # D = cPickle.load(d) 160 | # D = dict((y, x) for x, y in D.iteritems()) 161 | 162 | samples = [] 163 | for vidID in sorted(engine.test_ids): 164 | gts_test[vidID] = engine.CAP[vidID] 165 | # print samples_test[vidID] 166 | sample = OrderedDict() 167 | sample['video_id'] = vidID 168 | sample['caption'] = samples_test[vidID][0]['caption'] 169 | samples.append(sample) 170 | 171 | # if engine.signature == 'youtube2text': 172 | # f.write(D[vidID] + ',' + samples_test[vidID][0]['caption'] + ',' + gts_test[vidID][0]['caption'] + '\n') 173 | # if engine.signature == 'trecvid': 174 | # f.write(vidID + ' ' + samples_test[vidID][0]['caption'] + '\n') 175 | # else: 176 | f.write(vidID + ',' + samples_test[vidID][0]['caption'] + ',' + gts_test[vidID][0]['caption'] + '\n') 177 | 178 | f.close() 179 | 180 | results['result'] = samples 181 | results['external_data'] = {'used': 'true', 'details': 'Resnet trained on Imagenet.'} 182 | 183 | import json 184 | with open(out_dir + 'submission.json', 'w') as outfile: 185 | json.dump(results, outfile, indent=4) 186 | 187 | def save_test_samples_vtt(samples_test, engine): # for acm/trecvid/y2t challenge 188 | 189 | out_dir = 'predictions/' + engine.signature + '_' + engine.video_feature + '_' + engine.model_type + '/' 190 | 191 | if not os.path.exists('predictions/'): 192 | os.mkdir('predictions/') 193 | if not os.path.exists(out_dir): 194 | os.mkdir(out_dir) 195 | 196 | # if engine.signature == 'trecvid': 197 | # f = open(out_dir + 'trecvid.txt', 'wr') 198 | # else: 199 | f = open(out_dir + 'samplestest.csv', 'wr') 200 | 201 | gts_test = OrderedDict() 202 | 203 | results = OrderedDict() 204 | results['version'] = "1.2" 205 | # D = None 206 | # if engine.signature == 'youtube2text': 207 | # import cPickle 208 | # d = open('data/youtube2text_iccv15/original/dict_youtube_mapping.pkl', 'rb') 209 | # D = cPickle.load(d) 210 | # D = dict((y, x) for x, y in D.iteritems()) 211 | 212 | samples = [] 213 | for vidID in sorted(engine.test_ids): 214 | gts_test[vidID] = engine.CAP[vidID] 215 | # print samples_test[vidID] 216 | sample = OrderedDict() 217 | sample['video_id'] = vidID 218 | sample['caption'] = samples_test[vidID][0]['caption'] 219 | samples.append(sample) 220 | 221 | # if engine.signature == 'youtube2text': 222 | # f.write(D[vidID] + ',' + samples_test[vidID][0]['caption'] + ',' + gts_test[vidID][0]['caption'] + '\n') 223 | # if engine.signature == 'trecvid': 224 | # f.write(vidID + ' ' + samples_test[vidID][0]['caption'] + '\n') 225 | # else: 226 | f.write(vidID + ',' + samples_test[vidID][0]['caption'] + ',' + gts_test[vidID][0]['caption'] + '\n') 227 | 228 | f.close() 229 | 230 | results['result'] = samples 231 | results['external_data'] = {'used': 'true', 'details': 'Resnet trained on Imagenet.'} 232 | 233 | import json 234 | with open(out_dir + 'submission.json', 'w') as outfile: 235 | json.dump(results, outfile, indent=4) 236 | 237 | def save_test_samples_lsmdc(samples_test, engine): # for lsmdc16 
challenge 238 | 239 | out_dir = 'predictions/' + engine.signature + '_' + engine.video_feature + '_' + engine.model_type + '/' 240 | 241 | if not os.path.exists('predictions/'): 242 | os.mkdir('predictions/') 243 | if not os.path.exists(out_dir): 244 | os.mkdir(out_dir) 245 | 246 | f = open(out_dir + 'samplestest.csv', 'wr') 247 | 248 | gts_test = OrderedDict() 249 | 250 | results = OrderedDict() 251 | results['version'] = "1" 252 | 253 | dict_path = os.path.join('/PATH/TO/lsmdc16/pkls16', 'dict_vids_mapping.pkl') 254 | vids_names = common.load_pkl(dict_path) 255 | # D= None 256 | # if engine.signature=='youtube2text': 257 | # import cPickle 258 | # d= open('data/youtube2text_iccv15/original/dict_youtube_mapping.pkl','rb') 259 | # D = cPickle.load(d) 260 | # D = dict((y,x) for x,y in D.iteritems()) 261 | 262 | samples = [] 263 | # for vidID in engine.test_ids: 264 | for vidID in samples_test.keys(): 265 | gts_test[vidID] = engine.CAP[vidID] 266 | # print samples_test[vidID] 267 | sample = OrderedDict() 268 | sample['video_id'] = vids_names[vidID] 269 | # sample['ovid_id']=vidID 270 | sample['caption'] = samples_test[vidID][0]['caption'] 271 | # sample['ocaption']=gts_test[vidID][0]['caption'] 272 | samples.append(sample) 273 | 274 | # if engine.signature=='youtube2text': 275 | # f.write(D[vidID]+','+ samples_test[vidID][0]['caption']+','+gts_test[vidID][0]['caption']+'\n') 276 | # else: 277 | f.write(vidID + ',' + samples_test[vidID][0]['caption'] + ',' + gts_test[vidID][0]['caption'] + '\n') 278 | 279 | f.close() 280 | 281 | # results['result']= samples 282 | # results['external_data']={'used': 'true','details':'First fully connected of C3D pretrained on Sports1M'} 283 | 284 | samples = sorted(samples, key=lambda x: x['video_id']) 285 | 286 | import json 287 | with open(out_dir + 'publictest_burka_results.json', 'w') as outfile: 288 | json.dump(samples, outfile, indent=4) 289 | 290 | 291 | def save_blind_test_samples(samples_test, engine): # for lsmdc16 challenge 292 | 293 | out_dir = 'submissions/' + engine.signature + '_' + engine.video_feature + '_' + engine.model_type + '/' 294 | 295 | if not os.path.exists('submissions/'): 296 | os.mkdir('submissions/') 297 | if not os.path.exists(out_dir): 298 | os.mkdir(out_dir) 299 | 300 | # f=open(out_dir+'samplesbtest.csv','wr') 301 | 302 | gts_test = OrderedDict() 303 | 304 | results = OrderedDict() 305 | results['version'] = "1" 306 | 307 | dict_path = os.path.join('data/lsmdc16/', 'dict_bvids_mapping.pkl') 308 | vids_names = common.load_pkl(dict_path) 309 | 310 | samples = [] 311 | # for vidID in engine.test_ids: 312 | for vidID in samples_test.keys(): 313 | # gts_test[vidID] = engine.CAP[vidID] 314 | sample = OrderedDict() 315 | sample['video_id'] = vids_names[vidID] 316 | sample['caption'] = samples_test[vidID][0]['caption'] 317 | samples.append(sample) 318 | # f.write(vidID+','+ samples_test[vidID][0]['caption']+','+gts_test[vidID][0]['caption']+'\n') 319 | 320 | # f.close() 321 | 322 | samples = sorted(samples, key=lambda x: x['video_id']) 323 | 324 | import json 325 | with open(out_dir + 'blindtest_burka_results.json', 'w') as outfile: 326 | json.dump(samples, outfile, indent=4) 327 | 328 | 329 | def score_with_cocoeval(samples_valid, samples_test, engine): 330 | scorer = COCOScorer() 331 | if samples_valid: 332 | gts_valid = OrderedDict() 333 | for vidID in engine.valid_ids: 334 | # TODO(WG) Check for sampling type 335 | gts_valid[vidID] = engine.CAP[vidID] 336 | valid_score = scorer.score(gts_valid, samples_valid, engine.valid_ids) 337 
| else: 338 | valid_score = None 339 | 340 | if samples_test: 341 | gts_test = OrderedDict() 342 | for vidID in engine.test_ids: 343 | gts_test[vidID] = engine.CAP[vidID] 344 | test_score = scorer.score(gts_test, samples_test, engine.test_ids) 345 | 346 | else: 347 | test_score = None 348 | return valid_score, test_score 349 | 350 | 351 | def generate_sample_gpu_single_process( 352 | model_type, model_archive, options, engine, model, 353 | f_init, f_next, 354 | save_dir='./samples', beam=5, 355 | whichset='both'): 356 | def _seqs2words(caps): 357 | capsw = [] 358 | for cc in caps: 359 | ww = [] 360 | for w in cc: 361 | if w == 0: 362 | break 363 | ww.append(engine.word_idict[1] 364 | if w > len(engine.word_idict) else engine.word_idict[w]) 365 | capsw.append(' '.join(ww)) 366 | return capsw 367 | 368 | def sample(whichset): 369 | samples = [] 370 | ctxs, ctx_masks = engine.prepare_data_for_blue(whichset) 371 | # i = 0 372 | for i, ctx, ctx_mask in zip(range(len(ctxs)), ctxs, ctx_masks): 373 | print 'sampling %d/%d' % (i, len(ctxs)) 374 | sample, score, _, _ = model.gen_sample( 375 | None, f_init, f_next, ctx, ctx_mask, options, 376 | None, beam, maxlen=MAXLEN) 377 | 378 | sidx = numpy.argmin(score) 379 | sample = sample[sidx] 380 | # print _seqs2words([sample])[0] 381 | samples.append(sample) 382 | 383 | # if i>10: # hack to test it is working OK 384 | # samples = _seqs2words(samples) 385 | # return samples 386 | # i+=1 387 | 388 | # print "finished sampling" 389 | samples = _seqs2words(samples) 390 | # print 'finished _seq2words' 391 | return samples 392 | 393 | samples_valid = None 394 | samples_test = None 395 | samples_btest = None 396 | 397 | if whichset == 'valid' or whichset == 'both': 398 | print 'Valid Set...', 399 | samples_valid = sample('valid') 400 | with open(save_dir + '/valid_samples.txt', 'w') as f: 401 | print >> f, '\n'.join(samples_valid) 402 | if whichset == 'test' or whichset == 'both': 403 | print 'Test Set...', 404 | samples_test = sample('test') 405 | with open(save_dir + '/test_samples.txt', 'w') as f: 406 | print >> f, '\n'.join(samples_test) 407 | if whichset == 'blind': 408 | print 'Blind Test Set...', 409 | samples_btest = sample('blind') 410 | with open(save_dir + '/blind_test_samples.txt', 'w') as f: 411 | print >> f, '\n'.join(samples_btest) 412 | 413 | if samples_valid != None: 414 | samples_valid = build_sample_pairs(samples_valid, engine.valid_ids) 415 | if samples_test != None: 416 | samples_test = build_sample_pairs(samples_test, engine.test_ids) 417 | if samples_btest != None: 418 | # print 'build sample pairs' 419 | samples_btest = build_sample_pairs(samples_btest, engine.btest_ids) 420 | 421 | return samples_valid, samples_test, samples_btest 422 | 423 | 424 | def compute_score( 425 | model_type, model_archive, options, engine, save_dir, 426 | beam, n_process, 427 | whichset='both', on_cpu=True, 428 | processes=None, queue=None, rqueue=None, shared_params=None, 429 | one_time=False, metric=None, 430 | f_init=None, f_next=None, model=None): 431 | assert metric != 'perplexity' 432 | if on_cpu: 433 | raise NotImplementedError() 434 | else: 435 | assert model is not None 436 | samples_valid, samples_test, samples_btest = generate_sample_gpu_single_process( 437 | model_type, model_archive, options, 438 | engine, model, f_init, f_next, 439 | save_dir=save_dir, 440 | beam=beam, 441 | whichset=whichset) 442 | 443 | valid_score, test_score = score_with_cocoeval(samples_valid, samples_test, engine) 444 | 445 | scores_final = {} 446 | scores_final['valid'] = 
valid_score 447 | scores_final['test'] = test_score 448 | 449 | if one_time: 450 | return scores_final 451 | 452 | return scores_final, processes, queue, rqueue, shared_params 453 | 454 | 455 | def save_samples( 456 | model_type, model_archive, options, engine, save_dir, 457 | beam, n_process, 458 | whichset='both', on_cpu=True, 459 | processes=None, queue=None, rqueue=None, shared_params=None, 460 | one_time=False, metric=None, 461 | f_init=None, f_next=None, model=None): 462 | assert metric != 'perplexity' 463 | if on_cpu: 464 | raise NotImplementedError() 465 | else: 466 | assert model is not None 467 | samples_valid, samples_test, samples_btest = generate_sample_gpu_single_process( 468 | model_type, model_archive, options, 469 | engine, model, f_init, f_next, 470 | save_dir=save_dir, 471 | beam=beam, 472 | whichset=whichset) 473 | print samples_test 474 | 475 | if whichset == 'test': 476 | if engine.signature == 'trecvid': 477 | save_test_samples_acm_trecvid_y2t(samples_test, engine) 478 | elif engine.signature == 'youtube2text': 479 | save_test_samples_youtube2text(samples_test, engine) 480 | elif engine.signature == 'vtt': 481 | save_test_samples_vtt(samples_test, engine) 482 | elif engine.signature == 'lsmdc16': 483 | save_test_samples_lsmdc(samples_test, engine) 484 | else: 485 | save_test_samples_acm_trecvid_y2t(samples_test, engine) 486 | elif whichset == 'blind': 487 | save_blind_test_samples(samples_btest, engine) 488 | 489 | 490 | def test_cocoeval(): 491 | engine = data_engine.Movie2Caption('attention', 'lsmdc16', 492 | video_feature='googlenet', 493 | mb_size_train=20, 494 | mb_size_test=20, 495 | maxlen=50, n_words=20000, 496 | dec='standard', proc='nostd', 497 | n_frames=20, outof=None) 498 | # samples_valid = common.load_txt_file('./test/valid_samples.txt') 499 | # samples_test = common.load_txt_file('./test/test_samples.txt') 500 | samples_valid = common.load_txt_file('/PATH/TO/valid_samples.txt') 501 | samples_test = common.load_txt_file('/PATH/TO/test_samples.txt') 502 | samples_valid = [sample.strip() for sample in samples_valid] 503 | samples_test = [sample.strip() for sample in samples_test] 504 | 505 | samples_valid = build_sample_pairs(samples_valid, engine.valid_ids) 506 | samples_test = build_sample_pairs(samples_test, engine.test_ids) 507 | valid_score, test_score = score_with_cocoeval(samples_valid, samples_test, engine) 508 | print valid_score, test_score 509 | 510 | 511 | def test_cocoeval_vtt(): 512 | engine = data_engine.Movie2Caption('attention', 'lsmdc16', 513 | video_feature='googlenet', 514 | mb_size_train=20, 515 | mb_size_test=20, 516 | maxlen=50, n_words=20000, 517 | dec='standard', proc='nostd', 518 | n_frames=20, outof=None, 519 | data_dir='/PATH/TO/data/lsmdc16/pkls/', 520 | feats_dir='/PATH/TO/lsmdc16/features_googlenet') 521 | samples_valid = common.load_txt_file( 522 | '/PATH/TO/valid_samples.txt') 523 | samples_test = common.load_txt_file( 524 | '/PATH/TO/test_samples.txt') 525 | samples_valid = [sample.strip() for sample in samples_valid] 526 | samples_test = [sample.strip() for sample in samples_test] 527 | 528 | samples_valid = build_sample_pairs(samples_valid, engine.valid_ids) 529 | samples_test = build_sample_pairs(samples_test, engine.test_ids) 530 | valid_score, test_score = score_with_cocoeval(samples_valid, samples_test, engine) 531 | print valid_score, test_score 532 | 533 | 534 | if __name__ == '__main__': 535 | test_cocoeval_vtt() 536 | --------------------------------------------------------------------------------
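Note: the prediction.json and submission.json files written by save_test_samples_youtube2text, save_test_samples_acm_trecvid_y2t, and save_test_samples_vtt in metrics.py above all share the structure sketched below (shown as the equivalent Python dict; the video IDs and captions are illustrative placeholders, not values from the repository):

    {
        'version': '1.2',
        'result': [
            {'video_id': 'vid1', 'caption': 'a man is playing a guitar'},
            {'video_id': 'vid2', 'caption': 'a woman is slicing an onion'},
        ],
        'external_data': {'used': 'true', 'details': 'Resnet trained on Imagenet.'}
    }

The LSMDC writers (save_test_samples_lsmdc and save_blind_test_samples) instead dump only the sorted 'result' list itself, without the 'version' and 'external_data' wrapper.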
/py2-vid-desc_requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | ca-certificates=2018.03.07=0 5 | certifi=2018.4.16=py27_0 6 | intel-openmp=2018.0.3=0 7 | libedit=3.1.20170329=h6b74fdf_2 8 | libffi=3.2.1=hd88cf55_4 9 | libgcc-ng=7.2.0=hdf63c60_3 10 | libgfortran-ng=7.2.0=hdf63c60_3 11 | libopenblas=0.2.20=h9ac9557_7 12 | libstdcxx-ng=7.2.0=hdf63c60_3 13 | mkl=2018.0.3=1 14 | mkl_fft=1.0.1=py27h3010b51_0 15 | mkl_random=1.0.1=py27h629b387_0 16 | ncurses=6.1=hf484d3e_0 17 | nltk=3.3.0=py27_0 18 | openssl=1.0.2o=h20670df_0 19 | pip=10.0.1=py27_0 20 | python=2.7.15=h1571d57_0 21 | readline=7.0=ha6073c6_4 22 | scikit-learn=0.19.1=py27h445a80a_0 23 | setuptools=39.2.0=py27_0 24 | six=1.11.0=py27h5f960f1_1 25 | sqlite=3.23.1=he433501_0 26 | tk=8.6.7=hc745277_3 27 | wheel=0.31.1=py27_0 28 | zlib=1.2.11=ha838bed_2 29 | -------------------------------------------------------------------------------- /py2_pip_freeze.txt: -------------------------------------------------------------------------------- 1 | backports.functools-lru-cache==1.5 2 | beautifulsoup4==4.6.0 3 | certifi==2018.4.16 4 | cloudpickle==0.5.3 5 | cycler==0.10.0 6 | Cython==0.28.3 7 | dask==0.17.5 8 | decorator==4.3.0 9 | kiwisolver==1.0.1 10 | matplotlib==2.2.2 11 | mkl-fft==1.0.0 12 | mkl-random==1.0.1 13 | networkx==2.1 14 | nltk==3.3 15 | numpy==1.14.4 16 | Pillow==5.1.0 17 | protobuf==3.5.2.post1 18 | pyparsing==2.2.0 19 | python-dateutil==2.7.3 20 | pytz==2018.4 21 | PyWavelets==0.5.2 22 | scikit-image==0.14.0 23 | scikit-learn==0.19.1 24 | scipy==1.1.0 25 | six==1.11.0 26 | subprocess32==3.5.2 27 | Theano==0.8.1 28 | toolz==0.9.0 29 | -------------------------------------------------------------------------------- /py3-vid-desc_requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | blas=1.0=mkl 5 | ca-certificates=2018.03.07=0 6 | certifi=2018.4.16=py36_0 7 | cffi=1.11.5=py36h9745a5d_0 8 | coverage=4.5.1=py36h14c3975_0 9 | cudatoolkit=8.0=3 10 | cudnn=7.0.5=cuda8.0_0 11 | freetype=2.8=hab7d2ae_1 12 | intel-openmp=2018.0.0=8 13 | java-jre=8.45.14=0 14 | jpeg=9b=h024ee3a_2 15 | libedit=3.1.20170329=h6b74fdf_2 16 | libffi=3.2.1=hd88cf55_4 17 | libgcc-ng=7.2.0=hdf63c60_3 18 | libgfortran-ng=7.2.0=hdf63c60_3 19 | libpng=1.6.34=hb9fc6fc_0 20 | libstdcxx-ng=7.2.0=hdf63c60_3 21 | libtiff=4.0.9=he85c1e1_1 22 | mkl=2018.0.2=1 23 | mkl_fft=1.0.1=py36h3010b51_0 24 | mkl_random=1.0.1=py36h629b387_0 25 | nccl=1.3.4=cuda8.0_1 26 | ncurses=6.1=hf484d3e_0 27 | ninja=1.8.2=py36h6bb024c_1 28 | numpy=1.14.3=py36hcd700cb_1 29 | numpy-base=1.14.3=py36h9be14a7_1 30 | olefile=0.45.1=py36_0 31 | openssl=1.0.2o=h20670df_0 32 | pep8=1.7.1=py36_0 33 | pillow=5.1.0=py36h3deb7b8_0 34 | pip=10.0.1=py36_0 35 | pycparser=2.18=py36hf9f622e_1 36 | python=3.6.5=hc3d631a_2 37 | pytorch=0.4.0=py36_cuda8.0.61_cudnn7.1.2_1 38 | pyyaml=3.12=py36hafb9ca4_1 39 | readline=7.0=ha6073c6_4 40 | setuptools=39.1.0=py36_0 41 | six=1.11.0=py36h372c433_1 42 | sqlite=3.23.1=he433501_0 43 | tk=8.6.7=hc745277_3 44 | torchvision=0.2.1=py36_1 45 | wheel=0.31.1=py36_0 46 | xz=5.2.4=h14c3975_4 47 | yaml=0.1.7=had09818_2 48 | zlib=1.2.11=ha838bed_2 49 | -------------------------------------------------------------------------------- /py3_pip_freeze.txt: 
-------------------------------------------------------------------------------- 1 | backcall==0.1.0 2 | certifi==2018.4.16 3 | cffi==1.11.5 4 | chardet==3.0.4 5 | coverage==4.5.1 6 | Cython==0.28.2 7 | decorator==4.3.0 8 | easydict==1.7 9 | idna==2.6 10 | ipaddress==1.0.22 11 | ipython==6.4.0 12 | ipython-genutils==0.2.0 13 | jedi==0.12.0 14 | mkl-fft==1.0.0 15 | mkl-random==1.0.1 16 | munch==2.3.2 17 | numpy==1.14.3 18 | olefile==0.45.1 19 | parso==0.2.1 20 | pep8==1.7.1 21 | pexpect==4.6.0 22 | pickleshare==0.7.4 23 | Pillow==5.1.0 24 | pretrainedmodels==0.7.0 25 | prompt-toolkit==1.0.15 26 | protobuf==3.5.2.post1 27 | ptyprocess==0.5.2 28 | pycparser==2.18 29 | Pygments==2.2.0 30 | pyre==0.3.2 31 | PyYAML==3.12 32 | pyzmq==17.0.0 33 | requests==2.18.4 34 | simplegeneric==0.8.1 35 | six==1.11.0 36 | skipthoughts==0.0.0 37 | torch==0.4.0 38 | torchtext==0.2.3 39 | torchvision==0.2.1 40 | tqdm==4.23.4 41 | traitlets==4.3.2 42 | urllib3==1.22 43 | wcwidth==0.1.7 44 | xmlrunner==1.7.7 45 | -------------------------------------------------------------------------------- /train_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.insert(1,'jobman') 4 | sys.path.insert(1,'coco-caption') 5 | 6 | import numpy 7 | import os, sys, socket 8 | import time 9 | import logging 10 | from config import config 11 | from jobman import DD, expand 12 | import common 13 | import numpy as np 14 | 15 | import model_attention 16 | import model_lstmdd 17 | import model_mtle 18 | 19 | logging.basicConfig() 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def set_config(conf, args, add_new_key=False): 24 | # add_new_key: if conf does not contain the key, creates it 25 | for key in args: 26 | if key != 'jobman': 27 | v = args[key] 28 | if isinstance(v, DD): 29 | set_config(conf[key], v) 30 | else: 31 | if conf.has_key(key): 32 | conf[key] = convert_from_string(v) 33 | elif add_new_key: 34 | # create a new key in conf 35 | conf[key] = convert_from_string(v) 36 | else: 37 | raise KeyError(key) 38 | 39 | def convert_from_string(x): 40 | """ 41 | Convert a string that may represent a Python item to its proper data type. 42 | It consists in running `eval` on x, and if an error occurs, returning the 43 | string itself. 
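For example, '1e-4' becomes the float 0.0001 and '[300, 512]' becomes a list, while a value such as 'adam' that eval cannot resolve is returned unchanged as a string.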
44 | """ 45 | try: 46 | return eval(x, {}, {}) 47 | except Exception: 48 | return x 49 | 50 | def train_from_scratch(config, state, channel): 51 | # Model options 52 | save_model_dir = config[config.model].save_model_dir 53 | 54 | np.random.seed(int(config.random_seed)) 55 | 56 | if save_model_dir == 'current': 57 | config[config.model].save_model_dir = './' 58 | save_model_dir = './' 59 | # to facilitate the use of cluster for multiple jobs 60 | save_path = './model_config.pkl' 61 | else: 62 | # run locally, save locally 63 | save_path = os.path.join(save_model_dir ,'model_config.pkl') 64 | print 'current save dir ',save_model_dir 65 | common.create_dir_if_not_exist(save_model_dir) 66 | 67 | reload_ = config[config.model].reload_ 68 | if reload_: 69 | print 'preparing reload' 70 | save_dir_backup = config[config.model].save_model_dir 71 | from_dir_backup = config[config.model].from_dir 72 | # never start retrain in the same folder 73 | assert save_dir_backup != from_dir_backup 74 | print 'save dir ',save_dir_backup 75 | print 'from_dir ',from_dir_backup 76 | print 'setting current model config with the old one' 77 | 78 | 79 | if config[config.model].mode=='train': 80 | model_config_old = common.load_pkl(from_dir_backup+'/model_config.pkl') 81 | set_config(config, model_config_old) 82 | config[config.model].save_model_dir = save_dir_backup 83 | config[config.model].from_dir = from_dir_backup 84 | config[config.model].reload_ = True 85 | if config.erase_history: 86 | print 'erasing everything in ',save_model_dir 87 | os.system('rm %s/*'%save_model_dir) 88 | 89 | 90 | 91 | # for stdout file logging 92 | #sys.stdout = Unbuffered(sys.stdout, state.save_model_path + 'stdout.log') 93 | print 'saving model config into %s'%save_path 94 | common.dump_pkl(config, save_path) 95 | # Also copy back from config into state. 96 | for key in config: 97 | setattr(state, key, config[key]) 98 | 99 | 100 | model_type = config.model 101 | print 'Model Type: %s'%model_type 102 | print 'Host: %s' % socket.gethostname() 103 | print 'Command: %s' % ' '.join(sys.argv) 104 | 105 | if config.model == 'attention': 106 | model_attention.train_from_scratch(state, channel) 107 | elif config.model == 'lstmdd': 108 | model_lstmdd.train_from_scratch(state, channel) 109 | elif config.model == 'mtle': 110 | model_mtle.train_from_scratch(state, channel) 111 | else: 112 | raise NotImplementedError() 113 | 114 | 115 | def main(state, channel=None): 116 | set_config(config, state) 117 | train_from_scratch(config, state, channel) 118 | 119 | 120 | if __name__ == '__main__': 121 | args = {} 122 | try: 123 | for arg in sys.argv[1:]: 124 | k, v = arg.split('=') 125 | args[k] = v 126 | except: 127 | print 'args must be like a=X b.c=X' 128 | exit(1) 129 | 130 | state = expand(args) 131 | 132 | try: 133 | main(state) 134 | except Exception as e: 135 | logger.exception(e) 136 | --------------------------------------------------------------------------------