├── GCN_lib ├── Rs_GCN.py └── __init__.py ├── README.md ├── __init__.py ├── coco-caption ├── LICENSE ├── pyciderevalcap │ ├── __init__.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── ciderD │ │ ├── __init__.py │ │ ├── ciderD.py │ │ └── ciderD_scorer.py │ ├── eval.py │ └── tokenizer │ │ ├── __init__.py │ │ ├── ptbtokenizer.py │ │ └── stanford-corenlp-3.4.1.jar ├── pycocoevalcap │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── __init__.cpython-36.pyc │ ├── bleu │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── bleu.cpython-35.pyc │ │ │ ├── bleu.cpython-36.pyc │ │ │ ├── bleu_scorer.cpython-35.pyc │ │ │ └── bleu_scorer.cpython-36.pyc │ │ ├── bleu.py │ │ └── bleu_scorer.py │ ├── cider │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── cider.cpython-35.pyc │ │ │ ├── cider.cpython-36.pyc │ │ │ ├── cider_scorer.cpython-35.pyc │ │ │ └── cider_scorer.cpython-36.pyc │ │ └── cider.py │ ├── eval.py │ ├── meteor │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── meteor.cpython-35.pyc │ │ │ └── meteor.cpython-36.pyc │ │ ├── meteor-1.5.jar │ │ └── meteor.py │ ├── rouge │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── rouge.cpython-35.pyc │ │ │ └── rouge.cpython-36.pyc │ │ └── rouge.py │ └── tokenizer │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── ptbtokenizer.cpython-35.pyc │ │ └── ptbtokenizer.cpython-36.pyc │ │ ├── ptbtokenizer.py │ │ └── stanford-corenlp-3.4.1.jar └── pycocotools │ ├── __init__.py │ ├── _mask.c │ ├── _mask.pyx │ ├── coco.py │ ├── cocoeval.py │ └── mask.py ├── cocoapi-master ├── LuaAPI │ ├── CocoApi.lua │ ├── MaskApi.lua │ ├── cocoDemo.lua │ ├── env.lua │ ├── init.lua │ └── rocks │ │ └── coco-scm-1.rockspec ├── MatlabAPI │ ├── CocoApi.m │ ├── CocoEval.m │ ├── CocoUtils.m │ ├── MaskApi.m │ ├── cocoDemo.m │ ├── evalDemo.m │ ├── gason.m │ └── private │ │ ├── gasonMex.cpp │ │ ├── gasonMex.mexa64 │ │ ├── gasonMex.mexmaci64 │ │ ├── getPrmDflt.m │ │ └── maskApiMex.c ├── PythonAPI │ ├── Makefile │ ├── build │ │ ├── common │ │ │ └── maskApi.o │ │ ├── lib.linux-x86_64-2.7 │ │ │ └── pycocotools │ │ │ │ ├── __init__.py │ │ │ │ ├── _mask.so │ │ │ │ ├── coco.py │ │ │ │ ├── cocoeval.py │ │ │ │ └── mask.py │ │ └── temp.linux-x86_64-2.7 │ │ │ └── pycocotools │ │ │ └── _mask.o │ ├── dist │ │ └── pycocotools-2.0-py2.7-linux-x86_64.egg │ ├── pycocotools.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── requires.txt │ │ └── top_level.txt │ ├── pycocotools │ │ ├── __init__.py │ │ ├── _mask.c │ │ ├── _mask.pyx │ │ ├── _mask.so │ │ ├── coco.py │ │ ├── cocoeval.py │ │ └── mask.py │ └── setup.py ├── README.txt ├── common │ ├── gason.cpp │ ├── gason.h │ ├── maskApi.c │ └── maskApi.h ├── license.txt └── results │ ├── captions_val2014_fakecap_results.json │ ├── instances_val2014_fakebbox100_results.json │ ├── instances_val2014_fakesegm100_results.json │ ├── person_keypoints_val2014_fakekeypoints100_results.json │ └── val2014_fake_eval_res.txt ├── data.py ├── evaluate_models.py ├── evaluation.py ├── evaluation_models.py ├── fig ├── Q_i2t.png ├── Q_t2i_2.png ├── model.png └── teaser.png ├── misc ├── __init__.py ├── cocoeval.py ├── rewards.py └── utils.py ├── model.py ├── models ├── Attention.py ├── 
DecoderRNN.py ├── EncoderRNN.py ├── S2VTAttModel.py ├── S2VTModel.py └── __init__.py ├── opts.py ├── requirement.txt ├── train.py ├── vocab.py └── vocab ├── 10crop_precomp_vocab.pkl ├── coco_precomp_vocab.pkl ├── coco_vocab.pkl ├── f30k_precomp_vocab.pkl ├── f30k_vocab.pkl ├── f8k_precomp_vocab.pkl └── f8k_vocab.pkl
--------------------------------------------------------------------------------
/GCN_lib/Rs_GCN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 | 
5 | 
6 | 
7 | class Rs_GCN(nn.Module):
8 | 
9 |     def __init__(self, in_channels, inter_channels, bn_layer=True):
10 |         super(Rs_GCN, self).__init__()
11 | 
12 |         self.in_channels = in_channels
13 |         self.inter_channels = inter_channels
14 | 
15 |         if self.inter_channels is None:
16 |             self.inter_channels = in_channels // 2
17 |             if self.inter_channels == 0:
18 |                 self.inter_channels = 1
19 | 
20 | 
21 |         conv_nd = nn.Conv1d
22 |         max_pool = nn.MaxPool1d
23 |         bn = nn.BatchNorm1d
24 | 
25 |         self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
26 |                          kernel_size=1, stride=1, padding=0)
27 | 
28 |         if bn_layer:
29 |             self.W = nn.Sequential(
30 |                 conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
31 |                         kernel_size=1, stride=1, padding=0),
32 |                 bn(self.in_channels)
33 |             )
34 |             nn.init.constant(self.W[1].weight, 0)  # zero init: the block starts as an identity mapping
35 |             nn.init.constant(self.W[1].bias, 0)
36 |         else:
37 |             self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
38 |                              kernel_size=1, stride=1, padding=0)
39 |             nn.init.constant(self.W.weight, 0)
40 |             nn.init.constant(self.W.bias, 0)
41 | 
42 |         # theta and phi are 1x1 convolutions whose outputs form the pairwise
43 |         # region affinity matrix in forward():
44 | 
45 | 
46 |         self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
47 |                              kernel_size=1, stride=1, padding=0)
48 |         self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
49 |                            kernel_size=1, stride=1, padding=0)
50 | 
51 | 
52 | 
53 | 
54 |     def forward(self, v):
55 |         '''
56 |         :param v: (B, D, N)
57 |         :return: v_star (B, D, N), the relationship-enhanced region features
58 |         '''
59 |         batch_size = v.size(0)
60 | 
61 |         g_v = self.g(v).view(batch_size, self.inter_channels, -1)
62 |         g_v = g_v.permute(0, 2, 1)
63 | 
64 |         theta_v = self.theta(v).view(batch_size, self.inter_channels, -1)
65 |         theta_v = theta_v.permute(0, 2, 1)
66 |         phi_v = self.phi(v).view(batch_size, self.inter_channels, -1)
67 |         R = torch.matmul(theta_v, phi_v)  # pairwise region affinity matrix, (B, N, N)
68 |         N = R.size(-1)
69 |         R_div_C = R / N  # normalize by the number of regions
70 | 
71 |         y = torch.matmul(R_div_C, g_v)  # propagate features over the relation graph
72 |         y = y.permute(0, 2, 1).contiguous()
73 |         y = y.view(batch_size, self.inter_channels, *v.size()[2:])
74 |         W_y = self.W(y)
75 |         v_star = W_y + v  # residual connection
76 | 
77 |         return v_star
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 
86 | 
87 | 
--------------------------------------------------------------------------------
/GCN_lib/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Visual Semantic Reasoning for Image-Text Matching (VSRN)
2 | PyTorch code for VSRN, described in the paper "Visual Semantic Reasoning for Image-Text Matching", which appears in ICCV 2019 as an oral presentation. It is built on top of [VSE++](https://github.com/fartashf/vsepp).
3 | 
4 | [Kunpeng Li](https://kunpengli1994.github.io/), [Yulun Zhang](http://yulunzhang.com/), [Kai Li](http://kailigo.github.io/), Yuanyuan Li and [Yun Fu](http://www1.ece.neu.edu/~yunfu/). "Visual Semantic Reasoning for Image-Text Matching", ICCV, 2019. [[pdf](https://arxiv.org/pdf/1909.02701.pdf)]
5 | 
6 | ## Introduction
7 | Image-text matching has been a hot research topic bridging the vision and language areas. It remains challenging because image representations usually lack the global semantic concepts that are present in the corresponding text captions. To address this issue, we propose a simple and interpretable reasoning model that generates visual representations capturing the key objects and semantic concepts of a scene. Specifically, we first build connections between image regions and perform reasoning with Graph Convolutional Networks (GCNs) to generate features with semantic relationships. We then use a gate and memory mechanism to perform global semantic reasoning on these relationship-enhanced features, selecting the discriminative information and gradually generating a representation for the whole scene.
8 | 
9 | Experiments validate that our method achieves a new state of the art for image-text matching on the MS-COCO and Flickr30K datasets. It outperforms the current best method, SCAN, by a relative 6.8% for image retrieval and a relative 4.8% for caption retrieval on MS-COCO (Recall@1, 1K test set). On Flickr30K, our model improves image retrieval by a relative 12.6% and caption retrieval by a relative 5.8% (Recall@1).
10 | 
11 | Moreover, since our method relies only on a simple inner product as the similarity function, it is very efficient at the inference stage: it runs around 30 times faster than the current best method, SCAN, when tested on the MS-COCO 1K test set.
12 | 
13 | ![model](/fig/model.png)
14 | 
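15 | The regional reasoning step described above is implemented by the `Rs_GCN` module in `GCN_lib/Rs_GCN.py`. Below is a minimal sketch of how a stack of these layers can be applied to precomputed region features; the layer count, the dimensions and the GRU standing in for the gate/memory step are illustrative assumptions here, not the exact wiring of `model.py`:
16 | 
17 | ```python
18 | import torch
19 | from torch import nn
20 | from GCN_lib.Rs_GCN import Rs_GCN
21 | 
22 | B, N, D = 32, 36, 2048                 # batch, regions per image, feature dim (assumed values)
23 | regions = torch.randn(B, N, D)         # stand-in for precomputed bottom-up region features
24 | 
25 | x = regions.permute(0, 2, 1)           # Rs_GCN expects (B, D, N)
26 | for gcn in [Rs_GCN(in_channels=D, inter_channels=D) for _ in range(4)]:
27 |     x = gcn(x)                         # relationship-enhanced features, still (B, D, N)
28 | 
29 | # Global semantic reasoning, sketched here as a GRU over the enhanced regions:
30 | gru = nn.GRU(D, 2048, batch_first=True)
31 | _, h = gru(x.permute(0, 2, 1))         # h: (1, B, 2048)
32 | image_embedding = h.squeeze(0)         # one vector per image, matched to captions by inner product
33 | ```
34 | 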
"Visual Semantic Reasoning for Image-Text Matching", ICCV, 2019. [[pdf](https://arxiv.org/pdf/1909.02701.pdf)] 5 | 6 | ## Introduction 7 | Image-text matching has been a hot research topic bridging the vision and language areas. It remains challenging because the current representation of image usually lacks global semantic concepts as in its corresponding text caption. To address this issue, we propose a simple and interpretable reasoning model to generate visual representation that captures key objects and semantic concepts of a scene. Specifically, we first build up connections between image regions and perform reasoning with Graph Convolutional Networks to generate features with semantic relationships. Then, we propose to use the gate and memory mechanism to perform global semantic reasoning on these relationship-enhanced features, select the discriminative information and gradually generate the representation for the whole scene. 8 | 9 | Experiments validate that our method achieves a new state-of-the-art for the image-text matching on MS-COCO and Flickr30K datasets. It outperforms the current best method SCAN by 6.8\% relatively for image retrieval and 4.8\% relatively for caption retrieval on MS-COCO (Recall@1 using 1K test set). On Flickr30K, our model improves image retrieval by 12.6\% relatively and caption retrieval by 5.8\% relatively (Recall@1). 10 | 11 | Besides, since our method only relies on the simple inner product as the similarity function, it is quite efficient at the inference stage. It is around 30 times faster than the current best method SCAN when tested on MS-COCO 1K dataset. 12 | 13 | ![model](/fig/model.png) 14 | 15 | ## Requirements 16 | We recommended the following dependencies. 17 | 18 | * Python 2.7 19 | * [PyTorch](http://pytorch.org/) (0.4.1) 20 | * [NumPy](http://www.numpy.org/) (>1.12.1) 21 | * [TensorBoard](https://github.com/TeamHG-Memex/tensorboard_logger) 22 | * [pycocotools](https://github.com/cocodataset/cocoapi) 23 | * [torchvision]() 24 | * [matplotlib]() 25 | 26 | 27 | * Punkt Sentence Tokenizer: 28 | ```python 29 | import nltk 30 | nltk.download() 31 | > d punkt 32 | ``` 33 | 34 | ## Download data 35 | 36 | Download the dataset files and pre-trained models. We use splits produced by [Andrej Karpathy](http://cs.stanford.edu/people/karpathy/deepimagesent/). 37 | 38 | We follow [bottom-up attention model](https://github.com/peteanderson80/bottom-up-attention) and [SCAN](https://github.com/kuanghuei/SCAN) to obtain image features for fair comparison. More details about data pre-processing (optional) can be found [here](https://github.com/kuanghuei/SCAN/blob/master/README.md#data-pre-processing-optional). All the data needed for reproducing the experiments in the paper, including image features and vocabularies, can be downloaded from [SCAN](https://github.com/kuanghuei/SCAN) by using: 39 | 40 | ```bash 41 | wget https://scanproject.blob.core.windows.net/scan-data/data.zip 42 | ``` 43 | 44 | You can also get the data from google drive: https://drive.google.com/drive/u/1/folders/1os1Kr7HeTbh8FajBNegW8rjJf6GIhFqC. We refer to the path of extracted files for `data.zip` as `$DATA_PATH`. 45 | 46 | ## Evaluate pre-trained models 47 | Modify the model_path and data_path in the evaluation_models.py file. Then Run `evaluation_models.py`: 48 | 49 | ```bash 50 | python evaluation_models.py 51 | ``` 52 | 53 | To do cross-validation on MSCOCO 1K test set (5 folders average), pass `fold5=True`. Pass `fold5=False` for evaluation on MSCOCO 5K test set. 
88 | 
89 | Pretrained models for MSCOCO and Flickr30K can be downloaded from https://drive.google.com/file/d/1y8Ywa2vrPB7m_Q_Ku69z7EdwsLB9gsJW/view?usp=sharing
90 | 
91 | You can also use the following code to evaluate each model on Flickr30K, MSCOCO 1K and MSCOCO 5K:
92 | 
93 | ```python
94 | from vocab import Vocabulary
95 | import evaluation
96 | evaluation.evalrank("pretrain_model/flickr/model_fliker_1.pth.tar", data_path="$DATA_PATH", split="test", fold5=False)
97 | evaluation.evalrank("pretrain_model/coco/model_coco_1.pth.tar", data_path="$DATA_PATH", split="testall", fold5=True)
98 | evaluation.evalrank("pretrain_model/coco/model_coco_1.pth.tar", data_path="$DATA_PATH", split="testall", fold5=False)
99 | ```
100 | 
101 | ## Training new models
102 | Run `train.py`:
103 | 
104 | For MSCOCO:
105 | 
106 | ```bash
107 | python train.py --data_path $DATA_PATH --data_name coco_precomp --logger_name runs/coco_VSRN --max_violation
108 | ```
109 | 
110 | For Flickr30K:
111 | 
112 | ```bash
113 | python train.py --data_path $DATA_PATH --data_name f30k_precomp --logger_name runs/flickr_VSRN --max_violation --lr_update 10 --max_len 60
114 | ```
115 | 
116 | 
117 | ## Reference
118 | 
119 | If you find this code useful, please cite the following paper:
120 | 
121 |     @inproceedings{li2019vsrn,
122 |       title={Visual semantic reasoning for image-text matching},
123 |       author={Li, Kunpeng and Zhang, Yulun and Li, Kai and Li, Yuanyuan and Fu, Yun},
124 |       booktitle={ICCV},
125 |       year={2019}
126 |     }
127 | 
128 | ## License
129 | 
130 | [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
131 | 
132 | 
133 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/coco-caption/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 DingXia
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # 4 | # Description: Describes the class to compute the CIDEr 5 | # (Consensus-Based Image Description Evaluation) Metric 6 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 7 | # 8 | # Creation Date: Sun Feb 8 14:16:54 2015 9 | # 10 | # Authors: Ramakrishna Vedantam and 11 | # Tsung-Yi Lin 12 | 13 | from cider_scorer import CiderScorer 14 | 15 | 16 | class Cider: 17 | """ 18 | Main Class to compute the CIDEr metric 19 | 20 | """ 21 | def __init__(self, n=4, df="corpus"): 22 | """ 23 | Initialize the CIDEr scoring function 24 | : param n (int): n-gram size 25 | : param df (string): specifies where to get the IDF values from 26 | takes values 'corpus', 'coco-train' 27 | : return: None 28 | """ 29 | # set cider to sum over 1 to 4-grams 30 | self._n = n 31 | self._df = df 32 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 33 | 34 | def compute_score(self, gts, res): 35 | """ 36 | Main function to compute CIDEr score 37 | : param gts (dict) : {image:tokenized reference sentence} 38 | : param res (dict) : {image:tokenized candidate sentence} 39 | : return: cider (float) : computed CIDEr score for the corpus 40 | """ 41 | 42 | # clear all the previous hypos and refs 43 | self.cider_scorer.clear() 44 | 45 | for res_id in res: 46 | 47 | hypo = res_id['caption'] 48 | ref = gts[res_id['image_id']] 49 | 50 | # Sanity check. 51 | assert(type(hypo) is list) 52 | assert(len(hypo) == 1) 53 | assert(type(ref) is list) 54 | assert(len(ref) > 0) 55 | self.cider_scorer += (hypo[0], ref) 56 | 57 | (score, scores) = self.cider_scorer.compute_score() 58 | 59 | return score, scores 60 | 61 | def method(self): 62 | return "CIDEr" 63 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | import pickle 7 | from collections import defaultdict 8 | import numpy as np 9 | import math 10 | import os 11 | 12 | def precook(s, n=4, out=False): 13 | """ 14 | Takes a string as input and returns an object that can be given to 15 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 16 | can take string arguments as well. 
17 |     :param s: string : sentence to be converted into ngrams
18 |     :param n: int : number of ngrams for which representation is calculated
19 |     :return: term frequency vector for occurring ngrams
20 |     """
21 |     words = s.split()
22 |     counts = defaultdict(int)
23 |     for k in xrange(1,n+1):
24 |         for i in xrange(len(words)-k+1):
25 |             ngram = tuple(words[i:i+k])
26 |             counts[ngram] += 1
27 |     return counts
28 | 
29 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
30 |     '''Takes a list of reference sentences for a single segment
31 |     and returns an object that encapsulates everything that BLEU
32 |     needs to know about them.
33 |     :param refs: list of string : reference sentences for some image
34 |     :param n: int : number of ngrams for which (ngram) representation is calculated
35 |     :return: result (list of dict)
36 |     '''
37 |     return [precook(ref, n) for ref in refs]
38 | 
39 | def cook_test(test, n=4):
40 |     '''Takes a test sentence and returns an object that
41 |     encapsulates everything that BLEU needs to know about it.
42 |     :param test: list of string : hypothesis sentence for some image
43 |     :param n: int : number of ngrams for which (ngram) representation is calculated
44 |     :return: result (dict)
45 |     '''
46 |     return precook(test, n, True)
47 | 
48 | class CiderScorer(object):
49 |     """CIDEr scorer.
50 |     """
51 | 
52 |     def copy(self):
53 |         ''' copy the refs.'''
54 |         new = CiderScorer(n=self.n)
55 |         new.ctest = copy.copy(self.ctest)
56 |         new.crefs = copy.copy(self.crefs)
57 |         return new
58 | 
59 |     def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0):
60 |         ''' singular instance '''
61 |         self.n = n
62 |         self.sigma = sigma
63 |         self.crefs = []
64 |         self.ctest = []
65 |         self.df_mode = df_mode
66 |         if self.df_mode != "corpus":
67 |             self.document_frequency = pickle.load(open(os.path.join('data', df_mode + '.p'),'rb'))
68 |         self.cook_append(test, refs)
69 |         self.ref_len = None
70 | 
71 |     def clear(self):
72 |         self.crefs = []
73 |         self.ctest = []
74 | 
75 |     def cook_append(self, test, refs):
76 |         '''called by constructor and __iadd__ to avoid creating new instances.'''
77 | 
78 |         if refs is not None:
79 |             self.crefs.append(cook_refs(refs))
80 |         if test is not None:
81 |             self.ctest.append(cook_test(test)) ## N.B.: -1
82 |         else:
83 |             self.ctest.append(None) # lens of crefs and ctest have to match
84 | 
85 |     def size(self):
86 |         assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
87 |         return len(self.crefs)
88 | 
89 |     def __iadd__(self, other):
90 |         '''add an instance (e.g., from another sentence).'''
91 | 
92 |         if type(other) is tuple:
93 |             ## avoid creating new CiderScorer instances
94 |             self.cook_append(other[0], other[1])
95 |         else:
96 |             self.ctest.extend(other.ctest)
97 |             self.crefs.extend(other.crefs)
98 | 
99 |         return self
100 |     def compute_doc_freq(self):
101 |         '''
102 |         Compute document frequency over the reference data.
103 |         This will be used to compute idf (inverse document frequency) later.
104 |         The document frequency is stored in the object.
105 |         :return: None
106 |         '''
107 |         for refs in self.crefs:
108 |             # refs, k ref captions of one image
109 |             for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
110 |                 self.document_frequency[ngram] += 1
111 |             # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
112 | 
113 |     def compute_cider(self):
114 |         def counts2vec(cnts):
115 |             """
116 |             Function maps counts of ngram to vector of tf-idf weights.
117 |             The function returns vec, an array of dictionaries that stores the mapping of n-grams to tf-idf weights.
118 |             The n-th entry of the array corresponds to n-grams of length n+1.
119 |             :param cnts:
120 |             :return: vec (array of dict), norm (array of float), length (int)
121 |             """
122 |             vec = [defaultdict(float) for _ in range(self.n)]
123 |             length = 0
124 |             norm = [0.0 for _ in range(self.n)]
125 |             for (ngram,term_freq) in cnts.iteritems():
126 |                 # treat document frequency as 1 for ngrams not seen in the reference corpus
127 |                 df = np.log(max(1.0, self.document_frequency[ngram]))
128 |                 # ngram index
129 |                 n = len(ngram)-1
130 |                 # tf (term_freq) * idf (precomputed idf) for n-grams
131 |                 vec[n][ngram] = float(term_freq)*(self.ref_len - df)
132 |                 # compute norm for the vector. the norm will be used for
133 |                 # computing similarity
134 |                 norm[n] += pow(vec[n][ngram], 2)
135 | 
136 |                 if n == 1:
137 |                     length += term_freq
138 |             norm = [np.sqrt(n) for n in norm]
139 |             return vec, norm, length
140 | 
141 |         def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
142 |             '''
143 |             Compute the cosine similarity of two vectors.
144 |             :param vec_hyp: array of dictionary for vector corresponding to hypothesis
145 |             :param vec_ref: array of dictionary for vector corresponding to reference
146 |             :param norm_hyp: array of float for vector corresponding to hypothesis
147 |             :param norm_ref: array of float for vector corresponding to reference
148 |             :param length_hyp: int containing length of hypothesis
149 |             :param length_ref: int containing length of reference
150 |             :return: array of score for each n-grams cosine similarity
151 |             '''
152 |             delta = float(length_hyp - length_ref)
153 |             # measure cosine similarity
154 |             val = np.array([0.0 for _ in range(self.n)])
155 |             for n in range(self.n):
156 |                 # ngram
157 |                 for (ngram,count) in vec_hyp[n].iteritems():
158 |                     val[n] += vec_hyp[n][ngram] * vec_ref[n][ngram]
159 | 
160 |                 if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
161 |                     val[n] /= (norm_hyp[n]*norm_ref[n])
162 | 
163 |                 assert(not math.isnan(val[n]))
164 |             return val
165 | 
166 |         # compute log reference length
167 |         if self.df_mode == "corpus":
168 |             self.ref_len = np.log(float(len(self.crefs)))
169 |         elif self.df_mode == "coco-val":
170 |             # if coco option selected, use length of coco-val set
171 |             self.ref_len = np.log(float(40504))
172 | 
173 |         scores = []
174 |         for test, refs in zip(self.ctest, self.crefs):
175 |             # compute vector for test captions
176 |             vec, norm, length = counts2vec(test)
177 |             # compute vector for ref captions
178 |             score = np.array([0.0 for _ in range(self.n)])
179 |             for ref in refs:
180 |                 vec_ref, norm_ref, length_ref = counts2vec(ref)
181 |                 score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
182 |             # change by vrama91 - mean of ngram scores, instead of sum
183 |             score_avg = np.mean(score)
184 |             # divide by number of references
185 |             score_avg /= len(refs)
186 |             # multiply score by 10
187 |             score_avg *= 10.0
188 |             # append score of an image to the score list
189 |             scores.append(score_avg)
190 |         return scores
191 | 
192 |     def compute_score(self, option=None, verbose=0):
193 |         # compute idf
194 |         if self.df_mode == "corpus":
195 |             self.document_frequency = defaultdict(float)
196 |             self.compute_doc_freq()
197 |             # assert to check document frequency
198 |             assert(len(self.ctest) >= max(self.document_frequency.values()))
199 |             # import json for now and write the corresponding files
200 |         # compute cider score
201 |         score = self.compute_cider()
202 |         # debug
203 |         # print score
204 |         return np.mean(np.array(score)), np.array(score)
205 | 
--------------------------------------------------------------------------------
/coco-caption/pyciderevalcap/ciderD/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 
--------------------------------------------------------------------------------
/coco-caption/pyciderevalcap/ciderD/ciderD.py:
--------------------------------------------------------------------------------
1 | # Filename: ciderD.py
2 | #
3 | # Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric
4 | #              by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
5 | #
6 | # Creation Date: Sun Feb 8 14:16:54 2015
7 | #
8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin
9 | 
10 | from .ciderD_scorer import CiderScorer
11 | 
12 | 
13 | class CiderD:
14 |     """
15 |     Main Class to compute the CIDEr-D metric
16 | 
17 |     """
18 |     def __init__(self, n=4, sigma=6.0, df="corpus"):
19 |         # set cider to sum over 1 to 4-grams
20 |         self._n = n
21 |         # set the standard deviation parameter for gaussian penalty
22 |         self._sigma = sigma
23 |         # set where to compute document frequencies from
24 |         self._df = df
25 |         self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df)
26 | 
27 |     def compute_score(self, gts, res):
28 |         """
29 |         Main function to compute CIDEr-D score
30 |         :param hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence>
31 |                ref_for_image (dict)  : dictionary with key <image> and value <tokenized reference sentence>
32 |         :return: cider (float) : computed CIDEr-D score for the corpus
33 |         """
34 | 
35 |         # clear all the previous hypos and refs
36 |         self.cider_scorer.clear()
37 |         for res_id in res:
38 | 
39 |             hypo = res_id['caption']
40 |             ref = gts[res_id['image_id']]
41 | 
42 |             # Sanity check.
43 |             assert(type(hypo) is list)
44 |             assert(len(hypo) == 1)
45 |             assert(type(ref) is list)
46 |             assert(len(ref) > 0)
47 |             self.cider_scorer += (hypo[0], ref)
48 | 
49 |         (score, scores) = self.cider_scorer.compute_score()
50 | 
51 |         return score, scores
52 | 
53 |     def method(self):
54 |         return "CIDEr-D"
55 | 
--------------------------------------------------------------------------------
/coco-caption/pyciderevalcap/ciderD/ciderD_scorer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Tsung-Yi Lin 
3 | # Ramakrishna Vedantam 
4 | 
5 | import copy
6 | from collections import defaultdict
7 | import numpy as np
8 | import pdb
9 | import math
10 | import pickle
11 | import os
12 | 
13 | def precook(s, n=4, out=False):
14 |     """
15 |     Takes a string as input and returns an object that can be given to
16 |     either cook_refs or cook_test. This is optional: cook_refs and cook_test
17 |     can take string arguments as well.
18 |     :param s: string : sentence to be converted into ngrams
19 |     :param n: int : number of ngrams for which representation is calculated
20 |     :return: term frequency vector for occurring ngrams
21 |     """
22 |     words = s.split()
23 |     counts = defaultdict(int)
24 |     for k in range(1,n+1):
25 |         for i in range(len(words)-k+1):
26 |             ngram = tuple(words[i:i+k])
27 |             counts[ngram] += 1
28 |     return counts
29 | 
30 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
31 |     '''Takes a list of reference sentences for a single segment
32 |     and returns an object that encapsulates everything that BLEU
33 |     needs to know about them.
34 |     :param refs: list of string : reference sentences for some image
35 |     :param n: int : number of ngrams for which (ngram) representation is calculated
36 |     :return: result (list of dict)
37 |     '''
38 |     return [precook(ref, n) for ref in refs]
39 | 
40 | def cook_test(test, n=4):
41 |     '''Takes a test sentence and returns an object that
42 |     encapsulates everything that BLEU needs to know about it.
43 |     :param test: list of string : hypothesis sentence for some image
44 |     :param n: int : number of ngrams for which (ngram) representation is calculated
45 |     :return: result (dict)
46 |     '''
47 |     return precook(test, n, True)
48 | 
49 | class CiderScorer(object):
50 |     """CIDEr scorer.
51 |     """
52 | 
53 |     def copy(self):
54 |         ''' copy the refs.'''
55 |         new = CiderScorer(n=self.n)
56 |         new.ctest = copy.copy(self.ctest)
57 |         new.crefs = copy.copy(self.crefs)
58 |         return new
59 | 
60 |     def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0):
61 |         ''' singular instance '''
62 |         self.n = n
63 |         self.sigma = sigma
64 |         self.crefs = []
65 |         self.ctest = []
66 |         self.df_mode = df_mode
67 |         self.ref_len = None
68 |         if self.df_mode != "corpus":
69 |             pkl_file = pickle.load(open(os.path.join('data', df_mode + '.p'),'rb'))
70 |             self.ref_len = pkl_file['ref_len']
71 |             self.document_frequency = pkl_file['document_frequency']
72 |         self.cook_append(test, refs)
73 | 
74 |     def clear(self):
75 |         self.crefs = []
76 |         self.ctest = []
77 | 
78 |     def cook_append(self, test, refs):
79 |         '''called by constructor and __iadd__ to avoid creating new instances.'''
80 | 
81 |         if refs is not None:
82 |             self.crefs.append(cook_refs(refs))
83 |         if test is not None:
84 |             self.ctest.append(cook_test(test)) ## N.B.: -1
85 |         else:
86 |             self.ctest.append(None) # lens of crefs and ctest have to match
87 | 
88 |     def size(self):
89 |         assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
90 |         return len(self.crefs)
91 | 
92 |     def __iadd__(self, other):
93 |         '''add an instance (e.g., from another sentence).'''
94 | 
95 |         if type(other) is tuple:
96 |             ## avoid creating new CiderScorer instances
97 |             self.cook_append(other[0], other[1])
98 |         else:
99 |             self.ctest.extend(other.ctest)
100 |             self.crefs.extend(other.crefs)
101 | 
102 |         return self
103 |     def compute_doc_freq(self):
104 |         '''
105 |         Compute document frequency over the reference data.
106 |         This will be used to compute idf (inverse document frequency) later.
107 |         The document frequency is stored in the object.
108 |         :return: None
109 |         '''
110 |         for refs in self.crefs:
111 |             # refs, k ref captions of one image
112 |             for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]):
113 |                 self.document_frequency[ngram] += 1
114 |             # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
115 | 
116 |     def compute_cider(self):
117 |         def counts2vec(cnts):
118 |             """
119 |             Function maps counts of ngram to vector of tf-idf weights.
120 |             The function returns vec, an array of dictionaries that stores the mapping of n-grams to tf-idf weights.
121 |             The n-th entry of the array corresponds to n-grams of length n+1.
122 |             :param cnts:
123 |             :return: vec (array of dict), norm (array of float), length (int)
124 |             """
125 |             vec = [defaultdict(float) for _ in range(self.n)]
126 |             length = 0
127 |             norm = [0.0 for _ in range(self.n)]
128 |             for (ngram,term_freq) in cnts.items():
129 |                 # treat document frequency as 1 for ngrams not seen in the reference corpus
130 |                 df = np.log(max(1.0, self.document_frequency[ngram]))
131 |                 # ngram index
132 |                 n = len(ngram)-1
133 |                 # tf (term_freq) * idf (precomputed idf) for n-grams
134 |                 vec[n][ngram] = float(term_freq)*(self.ref_len - df)
135 |                 # compute norm for the vector. the norm will be used for computing similarity
136 |                 norm[n] += pow(vec[n][ngram], 2)
137 | 
138 |                 if n == 1:
139 |                     length += term_freq
140 |             norm = [np.sqrt(n) for n in norm]
141 |             return vec, norm, length
142 | 
143 |         def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
144 |             '''
145 |             Compute the cosine similarity of two vectors.
146 |             :param vec_hyp: array of dictionary for vector corresponding to hypothesis
147 |             :param vec_ref: array of dictionary for vector corresponding to reference
148 |             :param norm_hyp: array of float for vector corresponding to hypothesis
149 |             :param norm_ref: array of float for vector corresponding to reference
150 |             :param length_hyp: int containing length of hypothesis
151 |             :param length_ref: int containing length of reference
152 |             :return: array of score for each n-grams cosine similarity
153 |             '''
154 |             delta = float(length_hyp - length_ref)
155 |             # measure cosine similarity
156 |             val = np.array([0.0 for _ in range(self.n)])
157 |             for n in range(self.n):
158 |                 # ngram
159 |                 for (ngram,count) in vec_hyp[n].items():
160 |                     # vrama91 : added clipping
161 |                     val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
162 | 
163 |                 if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
164 |                     val[n] /= (norm_hyp[n]*norm_ref[n])
165 | 
166 |                 assert(not math.isnan(val[n]))
167 |                 # vrama91: added a length based gaussian penalty
168 |                 val[n] *= np.e**(-(delta**2)/(2*self.sigma**2))
169 |             return val
170 | 
171 |         # compute log reference length
172 |         if self.df_mode == "corpus":
173 |             self.ref_len = np.log(float(len(self.crefs)))
174 |         #elif self.df_mode == "coco-val":
175 |             # if coco option selected, use length of coco-val set
176 |             # self.ref_len = np.log(float(40504))
177 | 
178 |         scores = []
179 |         for test, refs in zip(self.ctest, self.crefs):
180 |             # compute vector for test captions
181 |             vec, norm, length = counts2vec(test)
182 |             # compute vector for ref captions
183 |             score = np.array([0.0 for _ in range(self.n)])
184 |             for ref in refs:
185 |                 vec_ref, norm_ref, length_ref = counts2vec(ref)
186 |                 score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
187 |             # change by vrama91 - mean of ngram scores, instead of sum
188 |             score_avg = np.mean(score)
189 |             # divide by number of references
190 |             score_avg /= len(refs)
191 |             # multiply score by 10
192 |             score_avg *= 10.0
193 |             # append score of an image to the score list
194 |             scores.append(score_avg)
195 |         return scores
196 | 
197 |     def compute_score(self, option=None, verbose=0):
198 |         # compute idf
199 |         if self.df_mode == "corpus":
200 |             self.document_frequency = defaultdict(float)
201 |             self.compute_doc_freq()
202 |             # assert to check document frequency
203 |             assert(len(self.ctest) >= max(self.document_frequency.values()))
204 |             # import json for now and write the corresponding files
205 |         # compute cider score
206 |         score = self.compute_cider()
207 |         # debug
208 |         # print score
209 |         return np.mean(np.array(score)), 
np.array(score) 210 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rama' 2 | from tokenizer.ptbtokenizer import PTBTokenizer 3 | from cider.cider import Cider 4 | from ciderD.ciderD import CiderD 5 | 6 | 7 | class CIDErEvalCap: 8 | def __init__(self, gts, res, df): 9 | print 'tokenization...' 10 | tokenizer = PTBTokenizer('gts') 11 | _gts = tokenizer.tokenize(gts) 12 | print 'tokenized refs' 13 | tokenizer = PTBTokenizer('res') 14 | _res = tokenizer.tokenize(res) 15 | print 'tokenized cands' 16 | 17 | self.gts = _gts 18 | self.res = _res 19 | self.df = df 20 | 21 | def evaluate(self): 22 | # ================================================= 23 | # Set up scorers 24 | # ================================================= 25 | 26 | print 'setting up scorers...' 27 | scorers = [ 28 | (Cider(df=self.df), "CIDEr"), (CiderD(df=self.df), "CIDErD") 29 | ] 30 | 31 | # ================================================= 32 | # Compute scores 33 | # ================================================= 34 | metric_scores = {} 35 | for scorer, method in scorers: 36 | print 'computing %s score...' % (scorer.method()) 37 | score, scores = scorer.compute_score(self.gts, self.res) 38 | print "Mean %s score: %0.3f" % (method, score) 39 | metric_scores[method] = list(scores) 40 | return metric_scores 41 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 
6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import pdb # python debugger 13 | import sys 14 | import subprocess 15 | import re 16 | import tempfile 17 | import itertools 18 | 19 | # path to the stanford corenlp jar 20 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 21 | 22 | # punctuations to be removed from the sentences 23 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 24 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 25 | 26 | class PTBTokenizer: 27 | """Python wrapper of Stanford PTBTokenizer""" 28 | def __init__(self, _source='gts'): 29 | self.source = _source 30 | 31 | def tokenize(self, captions_for_image): 32 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 33 | 'edu.stanford.nlp.process.PTBTokenizer', \ 34 | '-preserveLines', '-lowerCase'] 35 | 36 | # ====================================================== 37 | # prepare data for PTB Tokenizer 38 | # ====================================================== 39 | 40 | if self.source == 'gts': 41 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 42 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 43 | final_tokenized_captions_for_image = {} 44 | 45 | elif self.source == 'res': 46 | index = [i for i, v in enumerate(captions_for_image)] 47 | image_id = [v["image_id"] for v in captions_for_image] 48 | sentences = '\n'.join(v["caption"].replace('\n', ' ') for v in captions_for_image ) 49 | final_tokenized_captions_for_index = [] 50 | 51 | # ====================================================== 52 | # save sentences to temporary file 53 | # ====================================================== 54 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 55 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 56 | tmp_file.write(sentences) 57 | tmp_file.close() 58 | 59 | # ====================================================== 60 | # tokenize sentence 61 | # ====================================================== 62 | cmd.append(os.path.basename(tmp_file.name)) 63 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 64 | stdout=subprocess.PIPE) 65 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 66 | lines = token_lines.split('\n') 67 | # remove temp file 68 | os.remove(tmp_file.name) 69 | 70 | # ====================================================== 71 | # create dictionary for tokenized captions 72 | # ====================================================== 73 | if self.source == 'gts': 74 | for k, line in zip(image_id, lines): 75 | if not k in final_tokenized_captions_for_image: 76 | final_tokenized_captions_for_image[k] = [] 77 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 78 | if w not in PUNCTUATIONS]) 79 | final_tokenized_captions_for_image[k].append(tokenized_caption) 80 | 81 | return final_tokenized_captions_for_image 82 | 83 | elif self.source == 'res': 84 | for k, img, line in zip(index, image_id, lines): 85 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 86 | if w not in PUNCTUATIONS]) 87 | final_tokenized_captions_for_index.append({'image_id': img, 'caption': [tokenized_caption]}) 88 | 89 | return final_tokenized_captions_for_index 90 | -------------------------------------------------------------------------------- 
/coco-caption/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | 
--------------------------------------------------------------------------------
/coco-caption/pycocoevalcap/bleu/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 
--------------------------------------------------------------------------------
/coco-caption/pycocoevalcap/bleu/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/bleu/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/coco-caption/pycocoevalcap/bleu/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/bleu/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/coco-caption/pycocoevalcap/bleu/__pycache__/bleu.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/bleu/__pycache__/bleu.cpython-35.pyc
--------------------------------------------------------------------------------
/coco-caption/pycocoevalcap/bleu/__pycache__/bleu.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/bleu/__pycache__/bleu.cpython-36.pyc
--------------------------------------------------------------------------------
/coco-caption/pycocoevalcap/bleu/__pycache__/bleu_scorer.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/bleu/__pycache__/bleu_scorer.cpython-35.pyc
--------------------------------------------------------------------------------
/coco-caption/pycocoevalcap/bleu/__pycache__/bleu_scorer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/bleu/__pycache__/bleu_scorer.cpython-36.pyc
--------------------------------------------------------------------------------
/coco-caption/pycocoevalcap/bleu/bleu.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File Name : bleu.py
4 | #
5 | # Description : Wrapper for BLEU scorer.
6 | #
7 | # Creation Date : 06-01-2015
8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
9 | # Authors : Hao Fang and Tsung-Yi Lin
10 | 
11 | from .bleu_scorer import BleuScorer
12 | 
13 | 
14 | class Bleu:
15 |     def __init__(self, n=4):
16 |         # by default, compute BLEU score up to 4-grams
17 |         self._n = n
18 |         self._hypo_for_image = {}
19 |         self.ref_for_image = {}
20 | 
21 |     def compute_score(self, gts, res):
22 | 
23 |         assert(sorted(gts.keys()) == sorted(res.keys()))
24 |         imgIds = sorted(gts.keys())
25 | 
26 |         bleu_scorer = BleuScorer(n=self._n)
27 |         for id in imgIds:
28 |             hypo = res[id]
29 |             ref = gts[id]
30 | 
31 |             # Sanity check.
32 |             assert(type(hypo) is list)
33 |             assert(len(hypo) == 1)
34 |             assert(type(ref) is list)
35 |             assert(len(ref) >= 1)
36 | 
37 |             bleu_scorer += (hypo[0], ref)
38 | 
39 |         #score, scores = bleu_scorer.compute_score(option='shortest')
40 |         score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
41 |         #score, scores = bleu_scorer.compute_score(option='average', verbose=1)
42 | 
43 |         # return (bleu, bleu_info)
44 |         return score, scores
45 | 
46 |     def method(self):
47 |         return "Bleu"
48 | 
--------------------------------------------------------------------------------
/coco-caption/pycocoevalcap/bleu/bleu_scorer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # bleu_scorer.py
4 | # David Chiang 
5 | 
6 | # Copyright (c) 2004-2006 University of Maryland. All rights
7 | # reserved. Do not redistribute without permission from the
8 | # author. Not for commercial use.
9 | 
10 | # Modified by:
11 | # Hao Fang 
12 | # Tsung-Yi Lin 
13 | 
14 | '''Provides:
15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
17 | '''
18 | 
19 | import copy
20 | import sys, math, re
21 | from collections import defaultdict
22 | 
23 | def precook(s, n=4, out=False):
24 |     """Takes a string as input and returns an object that can be given to
25 |     either cook_refs or cook_test. This is optional: cook_refs and cook_test
26 |     can take string arguments as well."""
27 |     words = s.split()
28 |     counts = defaultdict(int)
29 |     for k in range(1,n+1):
30 |         for i in range(len(words)-k+1):
31 |             ngram = tuple(words[i:i+k])
32 |             counts[ngram] += 1
33 |     return (len(words), counts)
34 | 
35 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
36 |     '''Takes a list of reference sentences for a single segment
37 |     and returns an object that encapsulates everything that BLEU
38 |     needs to know about them.'''
39 | 
40 |     reflen = []
41 |     maxcounts = {}
42 |     for ref in refs:
43 |         rl, counts = precook(ref, n)
44 |         reflen.append(rl)
45 |         for (ngram,count) in counts.items():
46 |             maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
47 | 
48 |     # Calculate effective reference sentence length.
49 |     if eff == "shortest":
50 |         reflen = min(reflen)
51 |     elif eff == "average":
52 |         reflen = float(sum(reflen))/len(reflen)
53 | 
54 |     ## lhuang: N.B.: leave reflen computation to the very end!!
55 | 
56 |     ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)
57 | 
58 |     return (reflen, maxcounts)
59 | 
60 | def cook_test(test, reflen, refmaxcounts, eff=None, n=4):
61 |     '''Takes a test sentence and returns an object that
62 |     encapsulates everything that BLEU needs to know about it.'''
63 | 
64 |     testlen, counts = precook(test, n, True)
65 | 
66 |     result = {}
67 | 
68 |     # Calculate effective reference sentence length.
69 | 
70 |     if eff == "closest":
71 |         result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
72 |     else: ## i.e., "average" or "shortest" or None
73 |         result["reflen"] = reflen
74 | 
75 |     result["testlen"] = testlen
76 | 
77 |     result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]
78 | 
79 |     result['correct'] = [0]*n
80 |     for (ngram, count) in counts.items():
81 |         result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
82 | 
83 |     return result
84 | 
85 | class BleuScorer(object):
86 |     """Bleu scorer.
87 | """ 88 | 89 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 90 | # special_reflen is used in oracle (proportional effective ref len for a node). 91 | 92 | def copy(self): 93 | ''' copy the refs.''' 94 | new = BleuScorer(n=self.n) 95 | new.ctest = copy.copy(self.ctest) 96 | new.crefs = copy.copy(self.crefs) 97 | new._score = None 98 | return new 99 | 100 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 101 | ''' singular instance ''' 102 | 103 | self.n = n 104 | self.crefs = [] 105 | self.ctest = [] 106 | self.cook_append(test, refs) 107 | self.special_reflen = special_reflen 108 | 109 | def cook_append(self, test, refs): 110 | '''called by constructor and __iadd__ to avoid creating new instances.''' 111 | 112 | if refs is not None: 113 | self.crefs.append(cook_refs(refs)) 114 | if test is not None: 115 | cooked_test = cook_test(test, *self.crefs[-1]) 116 | self.ctest.append(cooked_test) ## N.B.: -1 117 | else: 118 | self.ctest.append(None) # lens of crefs and ctest have to match 119 | 120 | self._score = None ## need to recompute 121 | 122 | def ratio(self, option=None): 123 | self.compute_score(option=option) 124 | return self._ratio 125 | 126 | def score_ratio(self, option=None): 127 | '''return (bleu, len_ratio) pair''' 128 | return (self.fscore(option=option), self.ratio(option=option)) 129 | 130 | def score_ratio_str(self, option=None): 131 | return "%.4f (%.2f)" % self.score_ratio(option) 132 | 133 | def reflen(self, option=None): 134 | self.compute_score(option=option) 135 | return self._reflen 136 | 137 | def testlen(self, option=None): 138 | self.compute_score(option=option) 139 | return self._testlen 140 | 141 | def retest(self, new_test): 142 | if type(new_test) is str: 143 | new_test = [new_test] 144 | assert len(new_test) == len(self.crefs), new_test 145 | self.ctest = [] 146 | for t, rs in zip(new_test, self.crefs): 147 | self.ctest.append(cook_test(t, *rs)) 148 | self._score = None 149 | 150 | return self 151 | 152 | def rescore(self, new_test): 153 | ''' replace test(s) with new test(s), and returns the new score.''' 154 | 155 | return self.retest(new_test).compute_score() 156 | 157 | def size(self): 158 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 159 | return len(self.crefs) 160 | 161 | def __iadd__(self, other): 162 | '''add an instance (e.g., from another sentence).''' 163 | 164 | if type(other) is tuple: 165 | ## avoid creating new BleuScorer instances 166 | self.cook_append(other[0], other[1]) 167 | else: 168 | assert self.compatible(other), "incompatible BLEUs." 
169 | self.ctest.extend(other.ctest) 170 | self.crefs.extend(other.crefs) 171 | self._score = None ## need to recompute 172 | 173 | return self 174 | 175 | def compatible(self, other): 176 | return isinstance(other, BleuScorer) and self.n == other.n 177 | 178 | def single_reflen(self, option="average"): 179 | return self._single_reflen(self.crefs[0][0], option) 180 | 181 | def _single_reflen(self, reflens, option=None, testlen=None): 182 | 183 | if option == "shortest": 184 | reflen = min(reflens) 185 | elif option == "average": 186 | reflen = float(sum(reflens))/len(reflens) 187 | elif option == "closest": 188 | reflen = min((abs(l-testlen), l) for l in reflens)[1] 189 | else: 190 | assert False, "unsupported reflen option %s" % option 191 | 192 | return reflen 193 | 194 | def recompute_score(self, option=None, verbose=0): 195 | self._score = None 196 | return self.compute_score(option, verbose) 197 | 198 | def compute_score(self, option=None, verbose=0): 199 | n = self.n 200 | small = 1e-9 201 | tiny = 1e-15 ## so that if guess is 0 still return 0 202 | bleu_list = [[] for _ in range(n)] 203 | 204 | if self._score is not None: 205 | return self._score 206 | 207 | if option is None: 208 | option = "average" if len(self.crefs) == 1 else "closest" 209 | 210 | self._testlen = 0 211 | self._reflen = 0 212 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 213 | 214 | # for each sentence 215 | for comps in self.ctest: 216 | testlen = comps['testlen'] 217 | self._testlen += testlen 218 | 219 | if self.special_reflen is None: ## need computation 220 | reflen = self._single_reflen(comps['reflen'], option, testlen) 221 | else: 222 | reflen = self.special_reflen 223 | 224 | self._reflen += reflen 225 | 226 | for key in ['guess','correct']: 227 | for k in range(n): 228 | totalcomps[key][k] += comps[key][k] 229 | 230 | # append per image bleu score 231 | bleu = 1. 232 | for k in range(n): 233 | bleu *= (float(comps['correct'][k]) + tiny) \ 234 | /(float(comps['guess'][k]) + small) 235 | bleu_list[k].append(bleu ** (1./(k+1))) 236 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 237 | if ratio < 1: 238 | for k in range(n): 239 | bleu_list[k][-1] *= math.exp(1 - 1/ratio) 240 | 241 | if verbose > 1: 242 | print(comps, reflen) 243 | 244 | totalcomps['reflen'] = self._reflen 245 | totalcomps['testlen'] = self._testlen 246 | 247 | bleus = [] 248 | bleu = 1. 
249 | for k in range(n): 250 | bleu *= float(totalcomps['correct'][k] + tiny) \ 251 | / (totalcomps['guess'][k] + small) 252 | bleus.append(bleu ** (1./(k+1))) 253 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 254 | if ratio < 1: 255 | for k in range(n): 256 | bleus[k] *= math.exp(1 - 1/ratio) 257 | 258 | if verbose > 0: 259 | print(totalcomps) 260 | print("ratio:%f"%ratio) 261 | 262 | self._score = bleus 263 | return self._score, bleu_list 264 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/cider/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/cider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/__pycache__/cider.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/cider/__pycache__/cider.cpython-35.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/__pycache__/cider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/cider/__pycache__/cider.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/__pycache__/cider_scorer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/cider/__pycache__/cider_scorer.cpython-35.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/__pycache__/cider_scorer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/cider/__pycache__/cider_scorer.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from .cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | 
""" 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(sorted(gts.keys()) == sorted(res.keys())) 33 | imgIds = sorted(gts.keys()) 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) >= 1) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" 55 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from .tokenizer.ptbtokenizer import PTBTokenizer 3 | from .bleu.bleu import Bleu 4 | from .meteor.meteor import Meteor 5 | from .rouge.rouge import Rouge 6 | from .cider.cider import Cider 7 | from .spice.spice import Spice 8 | 9 | class COCOEvalCap: 10 | def __init__(self, coco, cocoRes): 11 | self.evalImgs = [] 12 | self.eval = {} 13 | self.imgToEval = {} 14 | self.coco = coco 15 | self.cocoRes = cocoRes 16 | self.params = {'image_id': coco.getImgIds()} 17 | 18 | def evaluate(self): 19 | imgIds = self.params['image_id'] 20 | # imgIds = self.coco.getImgIds() 21 | gts = {} 22 | res = {} 23 | for imgId in imgIds: 24 | gts[imgId] = self.coco.imgToAnns[imgId] 25 | res[imgId] = self.cocoRes.imgToAnns[imgId] 26 | 27 | # ================================================= 28 | # Set up scorers 29 | # ================================================= 30 | print('tokenization...') 31 | tokenizer = PTBTokenizer() 32 | gts = tokenizer.tokenize(gts) 33 | res = tokenizer.tokenize(res) 34 | 35 | # ================================================= 36 | # Set up scorers 37 | # ================================================= 38 | print('setting up scorers...') 39 | scorers = [ 40 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 41 | (Meteor(),"METEOR"), 42 | (Rouge(), "ROUGE_L"), 43 | (Cider(), "CIDEr"), 44 | (Spice(), "SPICE") 45 | ] 46 | 47 | # ================================================= 48 | # Compute scores 49 | # ================================================= 50 | for scorer, method in scorers: 51 | print('computing %s score...'%(scorer.method())) 52 | score, scores = scorer.compute_score(gts, res) 53 | if type(method) == list: 54 | for sc, scs, m in zip(score, scores, method): 55 | self.setEval(sc, m) 56 | self.setImgToEvalImgs(scs, gts.keys(), m) 57 | print("%s: %0.3f"%(m, sc)) 58 | else: 59 | self.setEval(score, method) 60 | self.setImgToEvalImgs(scores, gts.keys(), method) 61 | print("%s: %0.3f"%(method, score)) 62 | self.setEvalImgs() 63 | 64 | def setEval(self, score, method): 65 | self.eval[method] = score 66 | 67 | def setImgToEvalImgs(self, scores, imgIds, method): 68 | for imgId, score in zip(sorted(imgIds), scores): 69 | if not imgId in self.imgToEval: 70 | self.imgToEval[imgId] = {} 71 | 
self.imgToEval[imgId]["image_id"] = imgId 72 | self.imgToEval[imgId][method] = score 73 | 74 | def setEvalImgs(self): 75 | self.evalImgs = [self.imgToEval[imgId] for imgId in sorted(self.imgToEval.keys())] 76 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/meteor/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/meteor/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/__pycache__/meteor.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/meteor/__pycache__/meteor.cpython-35.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/__pycache__/meteor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/meteor/__pycache__/meteor.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | 6 | import os 7 | import sys 8 | import subprocess 9 | import threading 10 | 11 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 
12 | METEOR_JAR = 'meteor-1.5.jar' 13 | # print METEOR_JAR 14 | 15 | class Meteor: 16 | 17 | def __init__(self): 18 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 19 | '-', '-', '-stdio', '-l', 'en', '-norm'] 20 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 21 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 22 | stdin=subprocess.PIPE, \ 23 | stdout=subprocess.PIPE, \ 24 | stderr=subprocess.PIPE) 25 | # Used to guarantee thread safety 26 | self.lock = threading.Lock() 27 | 28 | def compute_score(self, gts, res): 29 | assert(sorted(gts.keys()) == sorted(res.keys())) 30 | imgIds = sorted(gts.keys()) 31 | scores = [] 32 | 33 | eval_line = 'EVAL' 34 | self.lock.acquire() 35 | for i in imgIds: 36 | assert(len(res[i]) == 1) 37 | stat = self._stat(res[i][0], gts[i]) 38 | eval_line += ' ||| {}'.format(stat) 39 | 40 | self.meteor_p.stdin.write('{}\n'.format(eval_line).encode()) 41 | self.meteor_p.stdin.flush() 42 | for i in range(0, len(imgIds)): 43 | scores.append(float(self.meteor_p.stdout.readline().decode().strip())) 44 | score = float(self.meteor_p.stdout.readline().decode().strip()) 45 | self.lock.release() 46 | 47 | return score, scores 48 | 49 | def method(self): 50 | return "METEOR" 51 | 52 | def _stat(self, hypothesis_str, reference_list): 53 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 54 | hypothesis_str = hypothesis_str.replace('|||', '').replace('  ', ' ')  # strip the protocol delimiter, collapse double spaces 55 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 56 | self.meteor_p.stdin.write('{}\n'.format(score_line).encode()) 57 | self.meteor_p.stdin.flush() 58 | return self.meteor_p.stdout.readline().decode().strip() 59 | 60 | def _score(self, hypothesis_str, reference_list): 61 | self.lock.acquire() 62 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 63 | hypothesis_str = hypothesis_str.replace('|||', '').replace('  ', ' ') 64 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 65 | self.meteor_p.stdin.write('{}\n'.format(score_line).encode()) 66 | self.meteor_p.stdin.flush() 67 | stats = self.meteor_p.stdout.readline().decode().strip() 68 | eval_line = 'EVAL ||| {}'.format(stats) 69 | # EVAL ||| stats 70 | self.meteor_p.stdin.write('{}\n'.format(eval_line).encode()) 71 | self.meteor_p.stdin.flush() 72 | score = float(self.meteor_p.stdout.readline().decode().strip()) 73 | # bug fix: the jar returns two values (an average and an overall score), so read twice 74 | # thanks to Andrej for pointing this out 75 | score = float(self.meteor_p.stdout.readline().decode().strip()) 76 | self.lock.release() 77 | return score 78 | 79 | def __exit__(self, exc_type=None, exc_value=None, traceback=None): 80 | self.lock.acquire() 81 | self.meteor_p.stdin.close() 82 | self.meteor_p.kill() 83 | self.meteor_p.wait() 84 | self.lock.release() 85 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/rouge/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/rouge/__pycache__/__init__.cpython-35.pyc --------------------------------------------------------------------------------
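Usage sketch for the Meteor wrapper above (hypothetical ids and captions; assumes java is on the PATH, meteor-1.5.jar sits next to meteor.py, and the coco-caption directory is on sys.path):

    from pycocoevalcap.meteor.meteor import Meteor
    gts = {0: ['a cat sits on a mat', 'there is a cat on the mat']}  # reference captions
    res = {0: ['a cat is on the mat']}                               # one candidate per image
    score, scores = Meteor().compute_score(gts, res)                 # corpus score, per-image scores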
/coco-caption/pycocoevalcap/rouge/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/rouge/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/rouge/__pycache__/rouge.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/rouge/__pycache__/rouge.cpython-35.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/rouge/__pycache__/rouge.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/rouge/__pycache__/rouge.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovy 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: list of str : single-element list containing the candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: float (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | 
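        # ROUGE-L keeps the best precision and recall over all references and
        # combines them with a recall-weighted F-measure (beta = 1.2 above):
        #   F_lcs = (1 + beta^2) * P * R / (R + beta^2 * P)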
prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(sorted(gts.keys()) == sorted(res.keys())) 86 | imgIds = sorted(gts.keys()) 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | # Sanity check (validate inputs before they are used). 94 | assert(type(hypo) is list) 95 | assert(len(hypo) == 1) 96 | assert(type(ref) is list) 97 | assert(len(ref) >= 1) 98 | 99 | score.append(self.calc_score(hypo, ref)) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/tokenizer/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/tokenizer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/__pycache__/ptbtokenizer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/tokenizer/__pycache__/ptbtokenizer.cpython-35.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/__pycache__/ptbtokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/tokenizer/__pycache__/ptbtokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Performs PTB tokenization and removes punctuation.
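# Expected input/output shapes (inferred from tokenize() below):
#   in:  {image_id: [{'caption': 'A man riding a horse.'}, ...], ...}
#   out: {image_id: ['a man riding a horse', ...], ...}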
6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | # path to the stanford corenlp jar 18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 19 | 20 | # punctuations to be removed from the sentences 21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 22 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 23 | 24 | class PTBTokenizer: 25 | """Python wrapper of Stanford PTBTokenizer""" 26 | 27 | def tokenize(self, captions_for_image): 28 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 29 | 'edu.stanford.nlp.process.PTBTokenizer', \ 30 | '-preserveLines', '-lowerCase'] 31 | 32 | # ====================================================== 33 | # prepare data for PTB Tokenizer 34 | # ====================================================== 35 | final_tokenized_captions_for_image = {} 36 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 37 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 38 | 39 | # ====================================================== 40 | # save sentences to temporary file 41 | # ====================================================== 42 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 43 | tmp_file = tempfile.NamedTemporaryFile(mode='w+', delete=False, dir=path_to_jar_dirname) 44 | tmp_file.write(sentences) 45 | tmp_file.close() 46 | 47 | # ====================================================== 48 | # tokenize sentence 49 | # ====================================================== 50 | cmd.append(os.path.basename(tmp_file.name)) 51 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 52 | stdout=subprocess.PIPE) 53 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 54 | lines = token_lines.decode().split('\n') 55 | # remove temp file 56 | os.remove(tmp_file.name) 57 | 58 | # ====================================================== 59 | # create dictionary for tokenized captions 60 | # ====================================================== 61 | for k, line in zip(image_id, lines): 62 | if not k in final_tokenized_captions_for_image: 63 | final_tokenized_captions_for_image[k] = [] 64 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 65 | if w not in PUNCTUATIONS]) 66 | final_tokenized_captions_for_image[k].append(tokenized_caption) 67 | 68 | return final_tokenized_captions_for_image 69 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/coco-caption/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /coco-caption/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE 
format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). 
Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | iou = _mask.iou 77 | merge = _mask.merge 78 | frPyObjects = _mask.frPyObjects 79 | 80 | def encode(bimask): 81 | if len(bimask.shape) == 3: 82 | return _mask.encode(bimask) 83 | elif len(bimask.shape) == 2: 84 | h, w = bimask.shape 85 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 86 | 87 | def decode(rleObjs): 88 | if type(rleObjs) == list: 89 | return _mask.decode(rleObjs) 90 | else: 91 | return _mask.decode([rleObjs])[:,:,0] 92 | 93 | def area(rleObjs): 94 | if type(rleObjs) == list: 95 | return _mask.area(rleObjs) 96 | else: 97 | return _mask.area([rleObjs])[0] 98 | 99 | def toBbox(rleObjs): 100 | if type(rleObjs) == list: 101 | return _mask.toBbox(rleObjs) 102 | else: 103 | return _mask.toBbox([rleObjs])[0] -------------------------------------------------------------------------------- /cocoapi-master/LuaAPI/cocoDemo.lua: -------------------------------------------------------------------------------- 1 | -- Demo for the CocoApi (see CocoApi.lua) 2 | coco = require 'coco' 3 | image = require 'image' 4 | 5 | -- initialize COCO api (please specify dataType/annType below) 6 | annTypes = { 'instances', 'captions', 'person_keypoints' } 7 | dataType, annType = 'val2014', annTypes[1]; -- specify dataType/annType 8 | annFile = '../annotations/'..annType..'_'..dataType..'.json' 9 | cocoApi=coco.CocoApi(annFile) 10 | 11 | -- get all image ids, select one at random 12 | imgIds = cocoApi:getImgIds() 13 | imgId = imgIds[torch.random(imgIds:numel())] 14 | 15 | -- load image 16 | img = cocoApi:loadImgs(imgId)[1] 17 | I = image.load('../images/'..dataType..'/'..img.file_name,3) 18 | 19 | -- load and display instance annotations 20 | annIds = cocoApi:getAnnIds({imgId=imgId}) 21 | anns = cocoApi:loadAnns(annIds) 22 | J = cocoApi:showAnns(I,anns) 23 | image.save('RES_'..img.file_name,J:double()) 24 | -------------------------------------------------------------------------------- /cocoapi-master/LuaAPI/env.lua: -------------------------------------------------------------------------------- 1 | --[[---------------------------------------------------------------------------- 2 | 3 | Common Objects in COntext (COCO) Toolbox. version 3.0 4 | Data, paper, and tutorials available at: http://mscoco.org/ 5 | Code written by Pedro O. Pinheiro and Piotr Dollar, 2016. 6 | Licensed under the Simplified BSD License [see coco/license.txt] 7 | 8 | ------------------------------------------------------------------------------]] 9 | 10 | local coco = {} 11 | return coco 12 | -------------------------------------------------------------------------------- /cocoapi-master/LuaAPI/init.lua: -------------------------------------------------------------------------------- 1 | --[[---------------------------------------------------------------------------- 2 | 3 | Common Objects in COntext (COCO) Toolbox. version 3.0 4 | Data, paper, and tutorials available at: http://mscoco.org/ 5 | Code written by Pedro O. 
Pinheiro and Piotr Dollar, 2016. 6 | Licensed under the Simplified BSD License [see coco/license.txt] 7 | 8 | ------------------------------------------------------------------------------]] 9 | 10 | local coco = require 'coco.env' 11 | require 'coco.CocoApi' 12 | require 'coco.MaskApi' 13 | return coco 14 | -------------------------------------------------------------------------------- /cocoapi-master/LuaAPI/rocks/coco-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "coco" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/pdollar/coco.git" 6 | } 7 | 8 | description = { 9 | summary = "Interface for accessing the Microsoft COCO dataset", 10 | detailed = "See http://mscoco.org/ for more details", 11 | homepage = "https://github.com/pdollar/coco", 12 | license = "Simplified BSD" 13 | } 14 | 15 | dependencies = { 16 | "lua >= 5.1", 17 | "torch >= 7.0", 18 | "lua-cjson" 19 | } 20 | 21 | build = { 22 | type = "builtin", 23 | modules = { 24 | ["coco.env"] = "LuaAPI/env.lua", 25 | ["coco.init"] = "LuaAPI/init.lua", 26 | ["coco.MaskApi"] = "LuaAPI/MaskApi.lua", 27 | ["coco.CocoApi"] = "LuaAPI/CocoApi.lua", 28 | libmaskapi = { 29 | sources = { "common/maskApi.c" }, 30 | incdirs = { "common/" } 31 | } 32 | } 33 | } 34 | 35 | -- luarocks make LuaAPI/rocks/coco-scm-1.rockspec 36 | -- https://github.com/pdollar/coco/raw/master/LuaAPI/rocks/coco-scm-1.rockspec 37 | -------------------------------------------------------------------------------- /cocoapi-master/MatlabAPI/MaskApi.m: -------------------------------------------------------------------------------- 1 | classdef MaskApi 2 | % Interface for manipulating masks stored in RLE format. 3 | % 4 | % RLE is a simple yet efficient format for storing binary masks. RLE 5 | % first divides a vector (or vectorized image) into a series of piecewise 6 | % constant regions and then for each piece simply stores the length of 7 | % that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 8 | % be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 9 | % (note that the odd counts are always the numbers of zeros). Instead of 10 | % storing the counts directly, additional compression is achieved with a 11 | % variable bitrate representation based on a common scheme called LEB128. 12 | % 13 | % Compression is greatest given large piecewise constant regions. 14 | % Specifically, the size of the RLE is proportional to the number of 15 | % *boundaries* in M (or for an image the number of boundaries in the y 16 | % direction). Assuming fairly simple shapes, the RLE representation is 17 | % O(sqrt(n)) where n is number of pixels in the object. Hence space usage 18 | % is substantially lower, especially for large simple objects (large n). 19 | % 20 | % Many common operations on masks can be computed directly using the RLE 21 | % (without need for decoding). This includes computations such as area, 22 | % union, intersection, etc. All of these operations are linear in the 23 | % size of the RLE, in other words they are O(sqrt(n)) where n is the area 24 | % of the object. Computing these operations on the original mask is O(n). 25 | % Thus, using the RLE can result in substantial computational savings. 26 | % 27 | % The following API functions are defined: 28 | % encode - Encode binary masks using RLE. 29 | % decode - Decode binary masks encoded via RLE. 30 | % merge - Compute union or intersection of encoded masks. 
31 | % iou - Compute intersection over union between masks. 32 | % nms - Compute non-maximum suppression between ordered masks. 33 | % area - Compute area of encoded masks. 34 | % toBbox - Get bounding boxes surrounding encoded masks. 35 | % frBbox - Convert bounding boxes to encoded masks. 36 | % frPoly - Convert polygon to encoded mask. 37 | % 38 | % Usage: 39 | % Rs = MaskApi.encode( masks ) 40 | % masks = MaskApi.decode( Rs ) 41 | % R = MaskApi.merge( Rs, [intersect=false] ) 42 | % o = MaskApi.iou( dt, gt, [iscrowd=false] ) 43 | % keep = MaskApi.nms( dt, thr ) 44 | % a = MaskApi.area( Rs ) 45 | % bbs = MaskApi.toBbox( Rs ) 46 | % Rs = MaskApi.frBbox( bbs, h, w ) 47 | % R = MaskApi.frPoly( poly, h, w ) 48 | % 49 | % In the API the following formats are used: 50 | % R,Rs - [struct] Run-length encoding of binary mask(s) 51 | % masks - [hxwxn] Binary mask(s) (must have type uint8) 52 | % bbs - [nx4] Bounding box(es) stored as [x y w h] 53 | % poly - Polygon stored as {[x1 y1 x2 y2...],[x1 y1 ...],...} 54 | % dt,gt - May be either bounding boxes or encoded masks 55 | % Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 56 | % 57 | % Finally, a note about the intersection over union (iou) computation. 58 | % The standard iou of a ground truth (gt) and detected (dt) object is 59 | % iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 60 | % For "crowd" regions, we use a modified criteria. If a gt object is 61 | % marked as "iscrowd", we allow a dt to match any subregion of the gt. 62 | % Choosing gt' in the crowd gt that best matches the dt can be done using 63 | % gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 64 | % iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 65 | % For crowd gt regions we use this modified criteria above for the iou. 66 | % 67 | % To compile use the following (some precompiled binaries are included): 68 | % mex('CFLAGS=\$CFLAGS -Wall -std=c99','-largeArrayDims',... 69 | % 'private/maskApiMex.c','../common/maskApi.c',... 70 | % '-I../common/','-outdir','private'); 71 | % Please do not contact us for help with compiling. 72 | % 73 | % Microsoft COCO Toolbox. version 2.0 74 | % Data, paper, and tutorials available at: http://mscoco.org/ 75 | % Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
76 | % Licensed under the Simplified BSD License [see coco/license.txt] 77 | 78 | methods( Static ) 79 | function Rs = encode( masks ) 80 | Rs = maskApiMex( 'encode', masks ); 81 | end 82 | 83 | function masks = decode( Rs ) 84 | masks = maskApiMex( 'decode', Rs ); 85 | end 86 | 87 | function R = merge( Rs, varargin ) 88 | R = maskApiMex( 'merge', Rs, varargin{:} ); 89 | end 90 | 91 | function o = iou( dt, gt, varargin ) 92 | o = maskApiMex( 'iou', dt', gt', varargin{:} ); 93 | end 94 | 95 | function keep = nms( dt, thr ) 96 | keep = maskApiMex('nms',dt',thr); 97 | end 98 | 99 | function a = area( Rs ) 100 | a = maskApiMex( 'area', Rs ); 101 | end 102 | 103 | function bbs = toBbox( Rs ) 104 | bbs = maskApiMex( 'toBbox', Rs )'; 105 | end 106 | 107 | function Rs = frBbox( bbs, h, w ) 108 | Rs = maskApiMex( 'frBbox', bbs', h, w ); 109 | end 110 | 111 | function R = frPoly( poly, h, w ) 112 | R = maskApiMex( 'frPoly', poly, h , w ); 113 | end 114 | end 115 | 116 | end 117 | -------------------------------------------------------------------------------- /cocoapi-master/MatlabAPI/cocoDemo.m: -------------------------------------------------------------------------------- 1 | %% Demo for the CocoApi (see CocoApi.m) 2 | 3 | %% initialize COCO api (please specify dataType/annType below) 4 | annTypes = { 'instances', 'captions', 'person_keypoints' }; 5 | dataType='val2014'; annType=annTypes{1}; % specify dataType/annType 6 | annFile=sprintf('../annotations/%s_%s.json',annType,dataType); 7 | cocoApi=coco.CocoApi(annFile); 8 | 9 | %% display COCO categories and supercategories 10 | if( ~strcmp(annType,'captions') ) 11 | cats = coco.loadCats(coco.getCatIds()); 12 | nms={cats.name}; fprintf('COCO categories: '); 13 | fprintf('%s, ',nms{:}); fprintf('\n'); 14 | nms=unique({cats.supercategory}); fprintf('COCO supercategories: '); 15 | fprintf('%s, ',nms{:}); fprintf('\n'); 16 | end 17 | 18 | %% get all images containing given categories, select one at random 19 | catIds = coco.getCatIds('catNms',{'person','dog','skateboard'}); 20 | imgIds = coco.getImgIds('catIds',catIds); 21 | imgId = imgIds(randi(length(imgIds))); 22 | 23 | %% load and display image 24 | img = coco.loadImgs(imgId); 25 | I = imread(sprintf('../images/%s/%s',dataType,img.file_name)); 26 | figure(1); imagesc(I); axis('image'); set(gca,'XTick',[],'YTick',[]) 27 | 28 | %% load and display annotations 29 | annIds = coco.getAnnIds('imgIds',imgId,'catIds',catIds,'iscrowd',[]); 30 | anns = coco.loadAnns(annIds); coco.showAnns(anns); 31 | -------------------------------------------------------------------------------- /cocoapi-master/MatlabAPI/evalDemo.m: -------------------------------------------------------------------------------- 1 | %% Demo demonstrating the algorithm result formats for COCO 2 | 3 | %% select results type for demo (segm, bbox, or keypoints) 4 | type = {'segm','bbox','keypoints'}; type = type{1}; % specify type here 5 | fprintf('Running demo for *%s* results.\n\n',type); 6 | 7 | %% initialize COCO ground truth api 8 | dataDir='../'; prefix='instances'; dataType='val2014'; 9 | if(strcmp(type,'keypoints')), prefix='person_keypoints'; end 10 | annFile=sprintf('%s/annotations/%s_%s.json',dataDir,prefix,dataType); 11 | cocoGt=CocoApi(annFile); 12 | 13 | %% initialize COCO detections api 14 | resFile='%s/results/%s_%s_fake%s100_results.json'; 15 | resFile=sprintf(resFile,dataDir,prefix,dataType,type); 16 | cocoDt=cocoGt.loadRes(resFile); 17 | 18 | %% visualize gt and dt side by side 19 | imgIds=sort(cocoGt.getImgIds()); imgIds=imgIds(1:100); 
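% (the fake result files in results/ appear to cover only the first 100 images)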
20 | imgId = imgIds(randi(100)); img = cocoGt.loadImgs(imgId); 21 | I = imread(sprintf('%s/images/val2014/%s',dataDir,img.file_name)); 22 | figure(1); subplot(1,2,1); imagesc(I); axis('image'); axis off; 23 | annIds = cocoGt.getAnnIds('imgIds',imgId); title('ground truth') 24 | anns = cocoGt.loadAnns(annIds); cocoGt.showAnns(anns); 25 | figure(1); subplot(1,2,2); imagesc(I); axis('image'); axis off; 26 | annIds = cocoDt.getAnnIds('imgIds',imgId); title('results') 27 | anns = cocoDt.loadAnns(annIds); cocoDt.showAnns(anns); 28 | 29 | %% load raw JSON and show exact format for results 30 | fprintf('results structure has the following format:\n'); 31 | res = gason(fileread(resFile)); disp(res) 32 | 33 | %% the following command can be used to save the results back to disk 34 | if(0), f=fopen(resFile,'w'); fwrite(f,gason(res)); fclose(f); end 35 | 36 | %% run COCO evaluation code (see CocoEval.m) 37 | cocoEval=CocoEval(cocoGt,cocoDt,type); 38 | cocoEval.params.imgIds=imgIds; 39 | cocoEval.evaluate(); 40 | cocoEval.accumulate(); 41 | cocoEval.summarize(); 42 | 43 | %% generate Derek Hoiem style analysis of false positives (slow) 44 | if(0), cocoEval.analyze(); end 45 | -------------------------------------------------------------------------------- /cocoapi-master/MatlabAPI/gason.m: -------------------------------------------------------------------------------- 1 | function out = gason( in ) 2 | % Convert between JSON strings and corresponding JSON objects. 3 | % 4 | % This parser is based on Gason written and maintained by Ivan Vashchaev: 5 | % https://github.com/vivkin/gason 6 | % Gason is a "lightweight and fast JSON parser for C++". Please see the 7 | % above link for license information and additional details about Gason. 8 | % 9 | % Given a JSON string, gason calls the C++ parser and converts the output 10 | % into an appropriate Matlab structure. As the parsing is performed in mex 11 | % the resulting parser is blazingly fast. Large JSON structs (100MB+) take 12 | % only a few seconds to parse (compared to hours for pure Matlab parsers). 13 | % 14 | % Given a JSON object, gason calls the C++ encoder to convert the object 15 | % back into a JSON string representation. Nearly any Matlab struct, cell 16 | % array, or numeric array represent a valid JSON object. Note that gason() 17 | % can be used to go both from JSON string to JSON object and back. 18 | % 19 | % Gason requires C++11 to compile (for GCC this requires version 4.7 or 20 | % later). The following command compiles the parser (may require tweaking): 21 | % mex('CXXFLAGS=\$CXXFLAGS -std=c++11 -Wall','-largeArrayDims',... 22 | % 'private/gasonMex.cpp','../common/gason.cpp',... 23 | % '-I../common/','-outdir','private'); 24 | % Note the use of the "-std=c++11" flag. A number of precompiled binaries 25 | % are included, please do not contact us for help with compiling. If needed 26 | % you can specify a compiler by adding the option 'CXX="/usr/bin/g++"'. 27 | % 28 | % Note that by default JSON arrays that contain only numbers are stored as 29 | % regular Matlab arrays. Likewise, JSON arrays that contain only objects of 30 | % the same type are stored as Matlab struct arrays. This is much faster and 31 | % can use considerably less memory than always using Matlab cell arrays. 
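% Round-trip caveat: the mex encoder prints numbers with about 12 significant
% digits (std::setprecision(12) in gasonMex.cpp), so converting an object to a
% string and back may round doubles slightly.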
32 | % 33 | % USAGE 34 | % object = gason( string ) 35 | % string = gason( object ) 36 | % 37 | % INPUTS/OUTPUTS 38 | % string - JSON string 39 | % object - JSON object 40 | % 41 | % EXAMPLE 42 | % o = struct('first',{'piotr','ty'},'last',{'dollar','lin'}) 43 | % s = gason( o ) % convert JSON object -> JSON string 44 | % p = gason( s ) % convert JSON string -> JSON object 45 | % 46 | % See also 47 | % 48 | % Microsoft COCO Toolbox. version 2.0 49 | % Data, paper, and tutorials available at: http://mscoco.org/ 50 | % Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 51 | % Licensed under the Simplified BSD License [see coco/license.txt] 52 | 53 | out = gasonMex( 'convert', in ); 54 | -------------------------------------------------------------------------------- /cocoapi-master/MatlabAPI/private/gasonMex.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "gason.h" 8 | #include "mex.h" 9 | #include "string.h" 10 | #include "math.h" 11 | #include <sstream>   // std::ostringstream 12 | #include <iomanip>   // std::setprecision 13 | #include <stdint.h>  // fixed-width integer types (int64_t etc.) 14 | typedef std::ostringstream ostrm; 15 | typedef unsigned long siz; 16 | typedef unsigned short ushort; 17 | 18 | siz length( const JsonValue &a ) { 19 | // get number of elements in JSON_ARRAY or JSON_OBJECT 20 | siz k=0; auto n=a.toNode(); while(n) { k++; n=n->next; } return k; 21 | } 22 | 23 | bool isRegularObjArray( const JsonValue &a ) { 24 | // check if all JSON_OBJECTs in JSON_ARRAY have the same fields 25 | JsonValue o=a.toNode()->value; siz k, n; const char **keys; 26 | n=length(o); keys=new const char*[n]; 27 | k=0; for(auto j:o) keys[k++]=j->key; 28 | for( auto i:a ) { 29 | if(length(i->value)!=n) return false; k=0; 30 | for(auto j:i->value) if(strcmp(j->key,keys[k++])) return false; 31 | } 32 | delete [] keys; return true; 33 | } 34 | 35 | mxArray* json( const JsonValue &o ) { 36 | // convert JsonValue to Matlab mxArray 37 | siz k, m, n; mxArray *M; const char **keys; 38 | switch( o.getTag() ) { 39 | case JSON_NUMBER: 40 | return mxCreateDoubleScalar(o.toNumber()); 41 | case JSON_STRING: 42 | return mxCreateString(o.toString()); 43 | case JSON_ARRAY: { 44 | if(!o.toNode()) return mxCreateDoubleMatrix(1,0,mxREAL); 45 | JsonValue o0=o.toNode()->value; JsonTag tag=o0.getTag(); 46 | n=length(o); bool isRegular=true; 47 | for(auto i:o) isRegular=isRegular && i->value.getTag()==tag; 48 | if( isRegular && tag==JSON_OBJECT && isRegularObjArray(o) ) { 49 | m=length(o0); keys=new const char*[m]; 50 | k=0; for(auto j:o0) keys[k++]=j->key; 51 | M = mxCreateStructMatrix(1,n,m,keys); 52 | k=0; for(auto i:o) { m=0; for(auto j:i->value) 53 | mxSetFieldByNumber(M,k,m++,json(j->value)); k++; } 54 | delete [] keys; return M; 55 | } else if( isRegular && tag==JSON_NUMBER ) { 56 | M = mxCreateDoubleMatrix(1,n,mxREAL); double *p=mxGetPr(M); 57 | k=0; for(auto i:o) p[k++]=i->value.toNumber(); return M; 58 | } else { 59 | M = mxCreateCellMatrix(1,n); 60 | k=0; for(auto i:o) mxSetCell(M,k++,json(i->value)); 61 | return M; 62 | } 63 | } 64 | case JSON_OBJECT: 65 | if(!o.toNode()) return mxCreateStructMatrix(1,0,0,NULL); 66 | n=length(o); keys=new const char*[n]; 67 | k=0; for(auto i:o) 
keys[k++]=i->key; 68 | M = mxCreateStructMatrix(1,1,n,keys); k=0; 69 | for(auto i:o) mxSetFieldByNumber(M,0,k++,json(i->value)); 70 | delete [] keys; return M; 71 | case JSON_TRUE: 72 | return mxCreateDoubleScalar(1); 73 | case JSON_FALSE: 74 | return mxCreateDoubleScalar(0); 75 | case JSON_NULL: 76 | return mxCreateDoubleMatrix(0,0,mxREAL); 77 | default: return NULL; 78 | } 79 | } 80 | 81 | template ostrm& json( ostrm &S, T *A, siz n ) { 82 | // convert numeric array to JSON string with casting 83 | if(n==0) { S<<"[]"; return S; } if(n==1) { S< ostrm& json( ostrm &S, T *A, siz n ) { 89 | // convert numeric array to JSON string without casting 90 | return json(S,A,n); 91 | } 92 | 93 | ostrm& json( ostrm &S, const char *A ) { 94 | // convert char array to JSON string (handle escape characters) 95 | #define RPL(a,b) case a: { S << b; A++; break; } 96 | S << "\""; while( *A>0 ) switch( *A ) { 97 | RPL('"',"\\\""); RPL('\\',"\\\\"); RPL('/',"\\/"); RPL('\b',"\\b"); 98 | RPL('\f',"\\f"); RPL('\n',"\\n"); RPL('\r',"\\r"); RPL('\t',"\\t"); 99 | default: S << *A; A++; 100 | } 101 | S << "\""; return S; 102 | } 103 | 104 | ostrm& json( ostrm& S, const JsonValue *o ) { 105 | // convert JsonValue to JSON string 106 | switch( o->getTag() ) { 107 | case JSON_NUMBER: S << o->toNumber(); return S; 108 | case JSON_TRUE: S << "true"; return S; 109 | case JSON_FALSE: S << "false"; return S; 110 | case JSON_NULL: S << "null"; return S; 111 | case JSON_STRING: return json(S,o->toString()); 112 | case JSON_ARRAY: 113 | S << "["; for(auto i:*o) { 114 | json(S,&i->value) << (i->next ? "," : ""); } 115 | S << "]"; return S; 116 | case JSON_OBJECT: 117 | S << "{"; for(auto i:*o) { 118 | json(S,i->key) << ":"; 119 | json(S,&i->value) << (i->next ? "," : ""); } 120 | S << "}"; return S; 121 | default: return S; 122 | } 123 | } 124 | 125 | ostrm& json( ostrm& S, const mxArray *M ) { 126 | // convert Matlab mxArray to JSON string 127 | siz i, j, m, n=mxGetNumberOfElements(M); 128 | void *A=mxGetData(M); ostrm *nms; 129 | switch( mxGetClassID(M) ) { 130 | case mxDOUBLE_CLASS: return json(S,(double*) A,n); 131 | case mxSINGLE_CLASS: return json(S,(float*) A,n); 132 | case mxINT64_CLASS: return json(S,(int64_t*) A,n); 133 | case mxUINT64_CLASS: return json(S,(uint64_t*) A,n); 134 | case mxINT32_CLASS: return json(S,(int32_t*) A,n); 135 | case mxUINT32_CLASS: return json(S,(uint32_t*) A,n); 136 | case mxINT16_CLASS: return json(S,(int16_t*) A,n); 137 | case mxUINT16_CLASS: return json(S,(uint16_t*) A,n); 138 | case mxINT8_CLASS: return json(S,(int8_t*) A,n); 139 | case mxUINT8_CLASS: return json(S,(uint8_t*) A,n); 140 | case mxLOGICAL_CLASS: return json(S,(uint8_t*) A,n); 141 | case mxCHAR_CLASS: return json(S,mxArrayToString(M)); 142 | case mxCELL_CLASS: 143 | S << "["; for(i=0; i0) json(S,mxGetCell(M,n-1)); S << "]"; return S; 145 | case mxSTRUCT_CLASS: 146 | if(n==0) { S<<"{}"; return S; } m=mxGetNumberOfFields(M); 147 | if(m==0) { S<<"["; for(i=0; i1) S<<"["; nms=new ostrm[m]; 149 | for(j=0; j1) S<<"]"; delete [] nms; return S; 156 | default: 157 | mexErrMsgTxt( "Unknown type." 
); return S; 158 | } 159 | } 160 | 161 | mxArray* mxCreateStringRobust( const char* str ) { 162 | // convert char* to Matlab string (robust version of mxCreateString) 163 | mxArray *M; ushort *c; mwSize n[2]={1,strlen(str)}; 164 | M=mxCreateCharArray(2,n); c=(ushort*) mxGetData(M); 165 | for( siz i=0; i1 ) mexErrMsgTxt("One output expected."); 182 | 183 | if(!strcmp(action,"convert")) { 184 | if( nr!=1 ) mexErrMsgTxt("One input expected."); 185 | if( mxGetClassID(pr[0])==mxCHAR_CLASS ) { 186 | // object = mexFunction( string ) 187 | char *str = mxArrayToStringRobust(pr[0]); 188 | int status = jsonParse(str, &endptr, &val, allocator); 189 | if( status != JSON_OK) mexErrMsgTxt(jsonStrError(status)); 190 | pl[0] = json(val); mxFree(str); 191 | } else { 192 | // string = mexFunction( object ) 193 | ostrm S; S << std::setprecision(12); json(S,pr[0]); 194 | pl[0]=mxCreateStringRobust(S.str().c_str()); 195 | } 196 | 197 | } else if(!strcmp(action,"split")) { 198 | // strings = mexFunction( string, k ) 199 | if( nr!=2 ) mexErrMsgTxt("Two input expected."); 200 | char *str = mxArrayToStringRobust(pr[0]); 201 | int status = jsonParse(str, &endptr, &val, allocator); 202 | if( status != JSON_OK) mexErrMsgTxt(jsonStrError(status)); 203 | if( val.getTag()!=JSON_ARRAY ) mexErrMsgTxt("Array expected"); 204 | siz i=0, t=0, n=length(val), k=(siz) mxGetScalar(pr[1]); 205 | k=(k>n)?n:(k<1)?1:k; k=ceil(n/ceil(double(n)/k)); 206 | pl[0]=mxCreateCellMatrix(1,k); ostrm S; S<value); t--; if(!o->next) t=0; S << (t ? "," : "]"); 210 | if(!t) mxSetCell(pl[0],i++,mxCreateStringRobust(S.str().c_str())); 211 | } 212 | 213 | } else if(!strcmp(action,"merge")) { 214 | // string = mexFunction( strings ) 215 | if( nr!=1 ) mexErrMsgTxt("One input expected."); 216 | if(!mxIsCell(pr[0])) mexErrMsgTxt("Cell array expected."); 217 | siz n = mxGetNumberOfElements(pr[0]); 218 | ostrm S; S << std::setprecision(12); S << "["; 219 | for( siz i=0; ivalue) << (j->next ? "," : ""); 225 | mxFree(str); if(i1) 14 | % [ param1 ... paramN ] = getPrmDflt( prm, dfs, [checkExtra] ) 15 | % 16 | % INPUTS 17 | % prm - param struct or cell of form {'name1' v1 'name2' v2 ...} 18 | % dfs - cell of form {'name1' def1 'name2' def2 ...} 19 | % checkExtra - [0] if 1 throw error if prm contains params not in dfs 20 | % if -1 if prm contains params not in dfs adds them 21 | % 22 | % OUTPUTS (nargout==1) 23 | % prm - parameter struct with fields 'name1' through 'nameN' assigned 24 | % 25 | % OUTPUTS (nargout>1) 26 | % param1 - value assigned to parameter with 'name1' 27 | % ... 28 | % paramN - value assigned to parameter with 'nameN' 29 | % 30 | % EXAMPLE 31 | % dfs = { 'x','REQ', 'y',0, 'z',[], 'eps',1e-3 }; 32 | % prm = getPrmDflt( struct('x',1,'y',1), dfs ) 33 | % [ x y z eps ] = getPrmDflt( {'x',2,'y',1}, dfs ) 34 | % 35 | % See also INPUTPARSER 36 | % 37 | % Piotr's Computer Vision Matlab Toolbox Version 2.60 38 | % Copyright 2014 Piotr Dollar. 
[pdollar-at-gmail.com] 39 | % Licensed under the Simplified BSD License [see external/bsd.txt] 40 | 41 | if( mod(length(dfs),2) ), error('odd number of default parameters'); end 42 | if nargin<=2, checkExtra = 0; end 43 | 44 | % get the input parameters as two cell arrays: prmVal and prmField 45 | if iscell(prm) && length(prm)==1, prm=prm{1}; end 46 | if iscell(prm) 47 | if(mod(length(prm),2)), error('odd number of parameters in prm'); end 48 | prmField = prm(1:2:end); prmVal = prm(2:2:end); 49 | else 50 | if(~isstruct(prm)), error('prm must be a struct or a cell'); end 51 | prmVal = struct2cell(prm); prmField = fieldnames(prm); 52 | end 53 | 54 | % get and update default values using quick for loop 55 | dfsField = dfs(1:2:end); dfsVal = dfs(2:2:end); 56 | if checkExtra>0 57 | for i=1:length(prmField) 58 | j = find(strcmp(prmField{i},dfsField)); 59 | if isempty(j), error('parameter %s is not valid', prmField{i}); end 60 | dfsVal(j) = prmVal(i); 61 | end 62 | elseif checkExtra<0 63 | for i=1:length(prmField) 64 | j = find(strcmp(prmField{i},dfsField)); 65 | if isempty(j), j=length(dfsVal)+1; dfsField{j}=prmField{i}; end 66 | dfsVal(j) = prmVal(i); 67 | end 68 | else 69 | for i=1:length(prmField) 70 | dfsVal(strcmp(prmField{i},dfsField)) = prmVal(i); 71 | end 72 | end 73 | 74 | % check for missing values 75 | if any(strcmp('REQ',dfsVal)) 76 | cmpArray = find(strcmp('REQ',dfsVal)); 77 | error(['Required field ''' dfsField{cmpArray(1)} ''' not specified.'] ); 78 | end 79 | 80 | % set output 81 | if nargout==1 82 | varargout{1} = cell2struct( dfsVal, dfsField, 2 ); 83 | else 84 | varargout = dfsVal; 85 | end 86 | -------------------------------------------------------------------------------- /cocoapi-master/MatlabAPI/private/maskApiMex.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "mex.h" 8 | #include "maskApi.h" 9 | #include 10 | 11 | void checkType( const mxArray *M, mxClassID id ) { 12 | if(mxGetClassID(M)!=id) mexErrMsgTxt("Invalid type."); 13 | } 14 | 15 | mxArray* toMxArray( const RLE *R, siz n ) { 16 | const char *fs[] = {"size", "counts"}; 17 | mxArray *M=mxCreateStructMatrix(1,n,2,fs); 18 | for( siz i=0; i1) mexErrMsgTxt(err); 35 | for( i=0; i<*n; i++ ) { 36 | mxArray *S, *C; double *s; void *c; 37 | S=mxGetFieldByNumber(M,i,O[0]); checkType(S,mxDOUBLE_CLASS); 38 | C=mxGetFieldByNumber(M,i,O[1]); s=mxGetPr(S); c=mxGetData(C); 39 | h=(siz)s[0]; w=(siz)s[1]; m=mxGetNumberOfElements(C); 40 | if(same && i>0 && (h!=R[0].h || w!=R[0].w)) mexErrMsgTxt(err); 41 | if( mxGetClassID(C)==mxDOUBLE_CLASS ) { 42 | rleInit(R+i,h,w,m,0); 43 | for(j=0; j=2) ? (mxGetScalar(pr[1])>0) : false; 74 | rleMerge(R,&M,n,intersect); pl[0]=toMxArray(&M,1); rleFree(&M); 75 | 76 | } else if(!strcmp(action,"area")) { 77 | R=frMxArray(pr[0],&n,0); 78 | pl[0]=mxCreateNumericMatrix(1,n,mxUINT32_CLASS,mxREAL); 79 | uint *a=(uint*) mxGetPr(pl[0]); rleArea(R,n,a); 80 | 81 | } else if(!strcmp(action,"iou")) { 82 | if(nr>2) checkType(pr[2],mxUINT8_CLASS); siz nDt, nGt; 83 | byte *iscrowd = nr>2 ? 
(byte*) mxGetPr(pr[2]) : NULL; 84 | if(mxIsStruct(pr[0]) || mxIsStruct(pr[1])) { 85 | RLE *dt=frMxArray(pr[0],&nDt,1), *gt=frMxArray(pr[1],&nGt,1); 86 | pl[0]=mxCreateNumericMatrix(nDt,nGt,mxDOUBLE_CLASS,mxREAL); 87 | double *o=mxGetPr(pl[0]); rleIou(dt,gt,nDt,nGt,iscrowd,o); 88 | rlesFree(&dt,nDt); rlesFree(>,nGt); 89 | } else { 90 | checkType(pr[0],mxDOUBLE_CLASS); checkType(pr[1],mxDOUBLE_CLASS); 91 | double *dt=mxGetPr(pr[0]); nDt=mxGetN(pr[0]); 92 | double *gt=mxGetPr(pr[1]); nGt=mxGetN(pr[1]); 93 | pl[0]=mxCreateNumericMatrix(nDt,nGt,mxDOUBLE_CLASS,mxREAL); 94 | double *o=mxGetPr(pl[0]); bbIou(dt,gt,nDt,nGt,iscrowd,o); 95 | } 96 | 97 | } else if(!strcmp(action,"nms")) { 98 | siz n; uint *keep; double thr=(double) mxGetScalar(pr[1]); 99 | if(mxIsStruct(pr[0])) { 100 | RLE *dt=frMxArray(pr[0],&n,1); 101 | pl[0]=mxCreateNumericMatrix(1,n,mxUINT32_CLASS,mxREAL); 102 | keep=(uint*) mxGetPr(pl[0]); rleNms(dt,n,keep,thr); 103 | rlesFree(&dt,n); 104 | } else { 105 | checkType(pr[0],mxDOUBLE_CLASS); 106 | double *dt=mxGetPr(pr[0]); n=mxGetN(pr[0]); 107 | pl[0]=mxCreateNumericMatrix(1,n,mxUINT32_CLASS,mxREAL); 108 | keep=(uint*) mxGetPr(pl[0]); bbNms(dt,n,keep,thr); 109 | } 110 | 111 | } else if(!strcmp(action,"toBbox")) { 112 | R=frMxArray(pr[0],&n,0); 113 | pl[0]=mxCreateNumericMatrix(4,n,mxDOUBLE_CLASS,mxREAL); 114 | BB bb=mxGetPr(pl[0]); rleToBbox(R,bb,n); 115 | 116 | } else if(!strcmp(action,"frBbox")) { 117 | checkType(pr[0],mxDOUBLE_CLASS); 118 | double *bb=mxGetPr(pr[0]); n=mxGetN(pr[0]); 119 | h=(siz)mxGetScalar(pr[1]); w=(siz)mxGetScalar(pr[2]); 120 | rlesInit(&R,n); rleFrBbox(R,bb,h,w,n); pl[0]=toMxArray(R,n); 121 | 122 | } else if(!strcmp(action,"frPoly")) { 123 | checkType(pr[0],mxCELL_CLASS); n=mxGetNumberOfElements(pr[0]); 124 | h=(siz)mxGetScalar(pr[1]); w=(siz)mxGetScalar(pr[2]); rlesInit(&R,n); 125 | for(siz i=0; i=18.0 2 | cython>=0.27.3 3 | matplotlib>=2.1.0 4 | -------------------------------------------------------------------------------- /cocoapi-master/PythonAPI/pycocotools.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | -------------------------------------------------------------------------------- /cocoapi-master/PythonAPI/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /cocoapi-master/PythonAPI/pycocotools/_mask.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/cocoapi-master/PythonAPI/pycocotools/_mask.so -------------------------------------------------------------------------------- /cocoapi-master/PythonAPI/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). 
Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
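# Worked sketch (illustrative only; assumes numpy imported as np):
#   M = np.asfortranarray(np.array([[0, 0], [1, 1], [1, 0]], dtype=np.uint8))  # 3x2 binary mask
#   R = encode(M)                  # 2-D input, so encode() returns a single RLE dict
#   assert (decode(R) == M).all()  # decoding inverts encoding
#   a, bb = area(R), toBbox(R)     # scalar area (3 here) and an [x y w h] box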
74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | iou = _mask.iou 77 | merge = _mask.merge 78 | frPyObjects = _mask.frPyObjects 79 | 80 | def encode(bimask): 81 | if len(bimask.shape) == 3: 82 | return _mask.encode(bimask) 83 | elif len(bimask.shape) == 2: 84 | h, w = bimask.shape 85 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 86 | 87 | def decode(rleObjs): 88 | if type(rleObjs) == list: 89 | return _mask.decode(rleObjs) 90 | else: 91 | return _mask.decode([rleObjs])[:,:,0] 92 | 93 | def area(rleObjs): 94 | if type(rleObjs) == list: 95 | return _mask.area(rleObjs) 96 | else: 97 | return _mask.area([rleObjs])[0] 98 | 99 | def toBbox(rleObjs): 100 | if type(rleObjs) == list: 101 | return _mask.toBbox(rleObjs) 102 | else: 103 | return _mask.toBbox([rleObjs])[0] -------------------------------------------------------------------------------- /cocoapi-master/PythonAPI/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | import numpy as np 3 | 4 | # To compile and install locally run "python setup.py build_ext --inplace" 5 | # To install library to Python site-packages run "python setup.py build_ext install" 6 | 7 | ext_modules = [ 8 | Extension( 9 | 'pycocotools._mask', 10 | sources=['../common/maskApi.c', 'pycocotools/_mask.pyx'], 11 | include_dirs = [np.get_include(), '../common'], 12 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], 13 | ) 14 | ] 15 | 16 | setup( 17 | name='pycocotools', 18 | packages=['pycocotools'], 19 | package_dir = {'pycocotools': 'pycocotools'}, 20 | install_requires=[ 21 | 'setuptools>=18.0', 22 | 'cython>=0.27.3', 23 | 'matplotlib>=2.1.0' 24 | ], 25 | version='2.0', 26 | ext_modules= ext_modules 27 | ) 28 | -------------------------------------------------------------------------------- /cocoapi-master/README.txt: -------------------------------------------------------------------------------- 1 | COCO API - http://cocodataset.org/ 2 | 3 | COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation. This package provides Matlab, Python, and Lua APIs that assist in loading, parsing, and visualizing the annotations in COCO. Please visit http://cocodataset.org/ for more information on COCO, including the data, paper, and tutorials. The exact format of the annotations is also described on the COCO website. The Matlab and Python APIs are complete; the Lua API provides only basic functionality. 4 | 5 | In addition to this API, please download both the COCO images and annotations in order to run the demos and use the API. Both are available on the project website. 6 | -Please download, unzip, and place the images in: coco/images/ 7 | -Please download and place the annotations in: coco/annotations/ 8 | For substantially more details on the API please see http://cocodataset.org/#download. 9 | 10 | After downloading the images and annotations, run the Matlab, Python, or Lua demos for example usage.
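As a minimal Python sketch of the mask API defined above (it assumes the extension has been built and numpy is available; note that the encoder requires uint8 masks in column-major, i.e. Fortran, order):

import numpy as np
from pycocotools import mask as maskUtils

# h x w binary mask with a 60 x 120 rectangle of ones
m = np.asfortranarray(np.zeros((240, 320), dtype=np.uint8))
m[60:120, 80:200] = 1

rle = maskUtils.encode(m)        # dict with 'size' and compressed 'counts'
print(maskUtils.area(rle))       # 7200
print(maskUtils.toBbox(rle))     # [x y w h] = [80. 60. 120. 60.]
assert (maskUtils.decode(rle) == m).all()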
11 | 12 | To install: 13 | -For Matlab, add coco/MatlabApi to the Matlab path (OSX/Linux binaries provided) 14 | -For Python, run "make" under coco/PythonAPI 15 | -For Lua, run “luarocks make LuaAPI/rocks/coco-scm-1.rockspec” under coco/ 16 | -------------------------------------------------------------------------------- /cocoapi-master/common/gason.cpp: -------------------------------------------------------------------------------- 1 | // https://github.com/vivkin/gason - pulled January 10, 2016 2 | #include "gason.h" 3 | #include <stdlib.h> 4 | 5 | #define JSON_ZONE_SIZE 4096 6 | #define JSON_STACK_SIZE 32 7 | 8 | const char *jsonStrError(int err) { 9 | switch (err) { 10 | #define XX(no, str) \ 11 | case JSON_##no: \ 12 | return str; 13 | JSON_ERRNO_MAP(XX) 14 | #undef XX 15 | default: 16 | return "unknown"; 17 | } 18 | } 19 | 20 | void *JsonAllocator::allocate(size_t size) { 21 | size = (size + 7) & ~7; 22 | 23 | if (head && head->used + size <= JSON_ZONE_SIZE) { 24 | char *p = (char *)head + head->used; 25 | head->used += size; 26 | return p; 27 | } 28 | 29 | size_t allocSize = sizeof(Zone) + size; 30 | Zone *zone = (Zone *)malloc(allocSize <= JSON_ZONE_SIZE ? JSON_ZONE_SIZE : allocSize); 31 | if (zone == nullptr) 32 | return nullptr; 33 | zone->used = allocSize; 34 | if (allocSize <= JSON_ZONE_SIZE || head == nullptr) { 35 | zone->next = head; 36 | head = zone; 37 | } else { 38 | zone->next = head->next; 39 | head->next = zone; 40 | } 41 | return (char *)zone + sizeof(Zone); 42 | } 43 | 44 | void JsonAllocator::deallocate() { 45 | while (head) { 46 | Zone *next = head->next; 47 | free(head); 48 | head = next; 49 | } 50 | } 51 | 52 | static inline bool isspace(char c) { 53 | return c == ' ' || (c >= '\t' && c <= '\r'); 54 | } 55 | 56 | static inline bool isdelim(char c) { 57 | return c == ',' || c == ':' || c == ']' || c == '}' || isspace(c) || !c; 58 | } 59 | 60 | static inline bool isdigit(char c) { 61 | return c >= '0' && c <= '9'; 62 | } 63 | 64 | static inline bool isxdigit(char c) { 65 | return (c >= '0' && c <= '9') || ((c & ~' ') >= 'A' && (c & ~' ') <= 'F'); 66 | } 67 | 68 | static inline int char2int(char c) { 69 | if (c <= '9') 70 | return c - '0'; 71 | return (c & ~' ') - 'A' + 10; 72 | } 73 | 74 | static double string2double(char *s, char **endptr) { 75 | char ch = *s; 76 | if (ch == '-') 77 | ++s; 78 | 79 | double result = 0; 80 | while (isdigit(*s)) 81 | result = (result * 10) + (*s++ - '0'); 82 | 83 | if (*s == '.') { 84 | ++s; 85 | 86 | double fraction = 1; 87 | while (isdigit(*s)) { 88 | fraction *= 0.1; 89 | result += (*s++ - '0') * fraction; 90 | } 91 | } 92 | 93 | if (*s == 'e' || *s == 'E') { 94 | ++s; 95 | 96 | double base = 10; 97 | if (*s == '+') 98 | ++s; 99 | else if (*s == '-') { 100 | ++s; 101 | base = 0.1; 102 | } 103 | 104 | unsigned int exponent = 0; 105 | while (isdigit(*s)) 106 | exponent = (exponent * 10) + (*s++ - '0'); 107 | 108 | double power = 1; 109 | for (; exponent; exponent >>= 1, base *= base) 110 | if (exponent & 1) 111 | power *= base; 112 | 113 | result *= power; 114 | } 115 | 116 | *endptr = s; 117 | return ch == '-' ?
-result : result; 118 | } 119 | 120 | static inline JsonNode *insertAfter(JsonNode *tail, JsonNode *node) { 121 | if (!tail) 122 | return node->next = node; 123 | node->next = tail->next; 124 | tail->next = node; 125 | return node; 126 | } 127 | 128 | static inline JsonValue listToValue(JsonTag tag, JsonNode *tail) { 129 | if (tail) { 130 | auto head = tail->next; 131 | tail->next = nullptr; 132 | return JsonValue(tag, head); 133 | } 134 | return JsonValue(tag, nullptr); 135 | } 136 | 137 | int jsonParse(char *s, char **endptr, JsonValue *value, JsonAllocator &allocator) { 138 | JsonNode *tails[JSON_STACK_SIZE]; 139 | JsonTag tags[JSON_STACK_SIZE]; 140 | char *keys[JSON_STACK_SIZE]; 141 | JsonValue o; 142 | int pos = -1; 143 | bool separator = true; 144 | JsonNode *node; 145 | *endptr = s; 146 | 147 | while (*s) { 148 | while (isspace(*s)) { 149 | ++s; 150 | if (!*s) break; 151 | } 152 | *endptr = s++; 153 | switch (**endptr) { 154 | case '-': 155 | if (!isdigit(*s) && *s != '.') { 156 | *endptr = s; 157 | return JSON_BAD_NUMBER; 158 | } 159 | case '0': 160 | case '1': 161 | case '2': 162 | case '3': 163 | case '4': 164 | case '5': 165 | case '6': 166 | case '7': 167 | case '8': 168 | case '9': 169 | o = JsonValue(string2double(*endptr, &s)); 170 | if (!isdelim(*s)) { 171 | *endptr = s; 172 | return JSON_BAD_NUMBER; 173 | } 174 | break; 175 | case '"': 176 | o = JsonValue(JSON_STRING, s); 177 | for (char *it = s; *s; ++it, ++s) { 178 | int c = *it = *s; 179 | if (c == '\\') { 180 | c = *++s; 181 | switch (c) { 182 | case '\\': 183 | case '"': 184 | case '/': 185 | *it = c; 186 | break; 187 | case 'b': 188 | *it = '\b'; 189 | break; 190 | case 'f': 191 | *it = '\f'; 192 | break; 193 | case 'n': 194 | *it = '\n'; 195 | break; 196 | case 'r': 197 | *it = '\r'; 198 | break; 199 | case 't': 200 | *it = '\t'; 201 | break; 202 | case 'u': 203 | c = 0; 204 | for (int i = 0; i < 4; ++i) { 205 | if (isxdigit(*++s)) { 206 | c = c * 16 + char2int(*s); 207 | } else { 208 | *endptr = s; 209 | return JSON_BAD_STRING; 210 | } 211 | } 212 | if (c < 0x80) { 213 | *it = c; 214 | } else if (c < 0x800) { 215 | *it++ = 0xC0 | (c >> 6); 216 | *it = 0x80 | (c & 0x3F); 217 | } else { 218 | *it++ = 0xE0 | (c >> 12); 219 | *it++ = 0x80 | ((c >> 6) & 0x3F); 220 | *it = 0x80 | (c & 0x3F); 221 | } 222 | break; 223 | default: 224 | *endptr = s; 225 | return JSON_BAD_STRING; 226 | } 227 | } else if ((unsigned int)c < ' ' || c == '\x7F') { 228 | *endptr = s; 229 | return JSON_BAD_STRING; 230 | } else if (c == '"') { 231 | *it = 0; 232 | ++s; 233 | break; 234 | } 235 | } 236 | if (!isdelim(*s)) { 237 | *endptr = s; 238 | return JSON_BAD_STRING; 239 | } 240 | break; 241 | case 't': 242 | if (!(s[0] == 'r' && s[1] == 'u' && s[2] == 'e' && isdelim(s[3]))) 243 | return JSON_BAD_IDENTIFIER; 244 | o = JsonValue(JSON_TRUE); 245 | s += 3; 246 | break; 247 | case 'f': 248 | if (!(s[0] == 'a' && s[1] == 'l' && s[2] == 's' && s[3] == 'e' && isdelim(s[4]))) 249 | return JSON_BAD_IDENTIFIER; 250 | o = JsonValue(JSON_FALSE); 251 | s += 4; 252 | break; 253 | case 'n': 254 | if (!(s[0] == 'u' && s[1] == 'l' && s[2] == 'l' && isdelim(s[3]))) 255 | return JSON_BAD_IDENTIFIER; 256 | o = JsonValue(JSON_NULL); 257 | s += 3; 258 | break; 259 | case ']': 260 | if (pos == -1) 261 | return JSON_STACK_UNDERFLOW; 262 | if (tags[pos] != JSON_ARRAY) 263 | return JSON_MISMATCH_BRACKET; 264 | o = listToValue(JSON_ARRAY, tails[pos--]); 265 | break; 266 | case '}': 267 | if (pos == -1) 268 | return JSON_STACK_UNDERFLOW; 269 | if (tags[pos] != JSON_OBJECT) 
270 | return JSON_MISMATCH_BRACKET; 271 | if (keys[pos] != nullptr) 272 | return JSON_UNEXPECTED_CHARACTER; 273 | o = listToValue(JSON_OBJECT, tails[pos--]); 274 | break; 275 | case '[': 276 | if (++pos == JSON_STACK_SIZE) 277 | return JSON_STACK_OVERFLOW; 278 | tails[pos] = nullptr; 279 | tags[pos] = JSON_ARRAY; 280 | keys[pos] = nullptr; 281 | separator = true; 282 | continue; 283 | case '{': 284 | if (++pos == JSON_STACK_SIZE) 285 | return JSON_STACK_OVERFLOW; 286 | tails[pos] = nullptr; 287 | tags[pos] = JSON_OBJECT; 288 | keys[pos] = nullptr; 289 | separator = true; 290 | continue; 291 | case ':': 292 | if (separator || keys[pos] == nullptr) 293 | return JSON_UNEXPECTED_CHARACTER; 294 | separator = true; 295 | continue; 296 | case ',': 297 | if (separator || keys[pos] != nullptr) 298 | return JSON_UNEXPECTED_CHARACTER; 299 | separator = true; 300 | continue; 301 | case '\0': 302 | continue; 303 | default: 304 | return JSON_UNEXPECTED_CHARACTER; 305 | } 306 | 307 | separator = false; 308 | 309 | if (pos == -1) { 310 | *endptr = s; 311 | *value = o; 312 | return JSON_OK; 313 | } 314 | 315 | if (tags[pos] == JSON_OBJECT) { 316 | if (!keys[pos]) { 317 | if (o.getTag() != JSON_STRING) 318 | return JSON_UNQUOTED_KEY; 319 | keys[pos] = o.toString(); 320 | continue; 321 | } 322 | if ((node = (JsonNode *) allocator.allocate(sizeof(JsonNode))) == nullptr) 323 | return JSON_ALLOCATION_FAILURE; 324 | tails[pos] = insertAfter(tails[pos], node); 325 | tails[pos]->key = keys[pos]; 326 | keys[pos] = nullptr; 327 | } else { 328 | if ((node = (JsonNode *) allocator.allocate(sizeof(JsonNode) - sizeof(char *))) == nullptr) 329 | return JSON_ALLOCATION_FAILURE; 330 | tails[pos] = insertAfter(tails[pos], node); 331 | } 332 | tails[pos]->value = o; 333 | } 334 | return JSON_BREAKING_BAD; 335 | } 336 | -------------------------------------------------------------------------------- /cocoapi-master/common/gason.h: -------------------------------------------------------------------------------- 1 | // https://github.com/vivkin/gason - pulled January 10, 2016 2 | #pragma once 3 | 4 | #include <stdint.h> 5 | #include <stddef.h> 6 | #include <assert.h> 7 | 8 | enum JsonTag { 9 | JSON_NUMBER = 0, 10 | JSON_STRING, 11 | JSON_ARRAY, 12 | JSON_OBJECT, 13 | JSON_TRUE, 14 | JSON_FALSE, 15 | JSON_NULL = 0xF 16 | }; 17 | 18 | struct JsonNode; 19 | 20 | #define JSON_VALUE_PAYLOAD_MASK 0x00007FFFFFFFFFFFULL 21 | #define JSON_VALUE_NAN_MASK 0x7FF8000000000000ULL 22 | #define JSON_VALUE_TAG_MASK 0xF 23 | #define JSON_VALUE_TAG_SHIFT 47 24 | 25 | union JsonValue { 26 | uint64_t ival; 27 | double fval; 28 | 29 | JsonValue(double x) 30 | : fval(x) { 31 | } 32 | JsonValue(JsonTag tag = JSON_NULL, void *payload = nullptr) { 33 | assert((uintptr_t)payload <= JSON_VALUE_PAYLOAD_MASK); 34 | ival = JSON_VALUE_NAN_MASK | ((uint64_t)tag << JSON_VALUE_TAG_SHIFT) | (uintptr_t)payload; 35 | } 36 | bool isDouble() const { 37 | return (int64_t)ival <= (int64_t)JSON_VALUE_NAN_MASK; 38 | } 39 | JsonTag getTag() const { 40 | return isDouble() ?
JSON_NUMBER : JsonTag((ival >> JSON_VALUE_TAG_SHIFT) & JSON_VALUE_TAG_MASK); 41 | } 42 | uint64_t getPayload() const { 43 | assert(!isDouble()); 44 | return ival & JSON_VALUE_PAYLOAD_MASK; 45 | } 46 | double toNumber() const { 47 | assert(getTag() == JSON_NUMBER); 48 | return fval; 49 | } 50 | char *toString() const { 51 | assert(getTag() == JSON_STRING); 52 | return (char *)getPayload(); 53 | } 54 | JsonNode *toNode() const { 55 | assert(getTag() == JSON_ARRAY || getTag() == JSON_OBJECT); 56 | return (JsonNode *)getPayload(); 57 | } 58 | }; 59 | 60 | struct JsonNode { 61 | JsonValue value; 62 | JsonNode *next; 63 | char *key; 64 | }; 65 | 66 | struct JsonIterator { 67 | JsonNode *p; 68 | 69 | void operator++() { 70 | p = p->next; 71 | } 72 | bool operator!=(const JsonIterator &x) const { 73 | return p != x.p; 74 | } 75 | JsonNode *operator*() const { 76 | return p; 77 | } 78 | JsonNode *operator->() const { 79 | return p; 80 | } 81 | }; 82 | 83 | inline JsonIterator begin(JsonValue o) { 84 | return JsonIterator{o.toNode()}; 85 | } 86 | inline JsonIterator end(JsonValue) { 87 | return JsonIterator{nullptr}; 88 | } 89 | 90 | #define JSON_ERRNO_MAP(XX) \ 91 | XX(OK, "ok") \ 92 | XX(BAD_NUMBER, "bad number") \ 93 | XX(BAD_STRING, "bad string") \ 94 | XX(BAD_IDENTIFIER, "bad identifier") \ 95 | XX(STACK_OVERFLOW, "stack overflow") \ 96 | XX(STACK_UNDERFLOW, "stack underflow") \ 97 | XX(MISMATCH_BRACKET, "mismatch bracket") \ 98 | XX(UNEXPECTED_CHARACTER, "unexpected character") \ 99 | XX(UNQUOTED_KEY, "unquoted key") \ 100 | XX(BREAKING_BAD, "breaking bad") \ 101 | XX(ALLOCATION_FAILURE, "allocation failure") 102 | 103 | enum JsonErrno { 104 | #define XX(no, str) JSON_##no, 105 | JSON_ERRNO_MAP(XX) 106 | #undef XX 107 | }; 108 | 109 | const char *jsonStrError(int err); 110 | 111 | class JsonAllocator { 112 | struct Zone { 113 | Zone *next; 114 | size_t used; 115 | } *head = nullptr; 116 | 117 | public: 118 | JsonAllocator() = default; 119 | JsonAllocator(const JsonAllocator &) = delete; 120 | JsonAllocator &operator=(const JsonAllocator &) = delete; 121 | JsonAllocator(JsonAllocator &&x) : head(x.head) { 122 | x.head = nullptr; 123 | } 124 | JsonAllocator &operator=(JsonAllocator &&x) { 125 | head = x.head; 126 | x.head = nullptr; 127 | return *this; 128 | } 129 | ~JsonAllocator() { 130 | deallocate(); 131 | } 132 | void *allocate(size_t size); 133 | void deallocate(); 134 | }; 135 | 136 | int jsonParse(char *str, char **endptr, JsonValue *value, JsonAllocator &allocator); 137 | -------------------------------------------------------------------------------- /cocoapi-master/common/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include <math.h> 9 | #include <stdlib.h> 10 | 11 | uint umin( uint a, uint b ) { return (a<b) ? a : b; } 12 | uint umax( uint a, uint b ) { return (a>b) ?
a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | siz i; *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && athr) keep[j]=0; 105 | } 106 | } 107 | } 108 | 109 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { 110 | double h, w, i, u, ga, da; siz g, d; int crowd; 111 | for( g=0; gthr) keep[j]=0; 129 | } 130 | } 131 | } 132 | 133 | void rleToBbox( const RLE *R, BB bb, siz n ) { 134 | siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye); 174 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 175 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; 176 | if(dx>=dy) for( d=0; d<=dx; d++ ) { 177 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 178 | } else for( d=0; d<=dy; d++ ) { 179 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 180 | } 181 | } 182 | /* get points along y-boundary and downsample */ 183 | free(x); free(y); k=m; m=0; double xd, yd; 184 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 185 | for( j=1; jw-1 ) continue; 188 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 190 | x[m]=(int) xd; y[m]=(int) yd; m++; 191 | } 192 | /* compute rle encoding given y-boundary points */ 193 | k=m; a=malloc(sizeof(uint)*(k+1)); 194 | for( j=0; j0) b[m++]=a[j++]; else { 200 | j++; if(jm, p=0; long x; int more; 207 | char *s=malloc(sizeof(char)*m*6); 208 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 210 | while( more ) { 211 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; 212 | if(more) c |= 0x20; c+=48; s[p++]=c; 213 | } 214 | } 215 | s[p]=0; return s; 216 | } 217 | 218 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 219 | siz m=0, p=0, k; long x; int more; uint *cnts; 220 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 221 | while( s[p] ) { 222 | x=0; k=0; more=1; 223 | while( more ) { 224 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 225 | more = c & 0x20; p++; k++; 226 | if(!more && (c & 0x10)) x |= -1 << 5*k; 227 | } 228 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 229 | } 230 | rleInit(R,h,w,m,cnts); free(cnts); 231 | } 232 | -------------------------------------------------------------------------------- /cocoapi-master/common/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | 9 | typedef unsigned int uint; 10 | typedef unsigned long siz; 11 | typedef unsigned char byte; 12 | typedef double* BB; 13 | typedef struct { siz h, w, m; uint *cnts; } RLE; 14 | 15 | /* Initialize/destroy RLE. */ 16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 17 | void rleFree( RLE *R ); 18 | 19 | /* Initialize/destroy RLE array. */ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. */ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. */ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /cocoapi-master/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /cocoapi-master/results/val2014_fake_eval_res.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------ 2 | type=segm 3 | Running per image evaluation... DONE (t=0.45s). 4 | Accumulating evaluation results... DONE (t=0.08s). 5 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.320 6 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.562 7 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.299 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.387 9 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.310 10 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.327 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.268 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.415 13 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.417 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.469 15 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.377 16 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.381 17 | 18 | ------------------------------------------------------------------------------ 19 | type=bbox 20 | Running per image evaluation... DONE (t=0.34s). 21 | Accumulating evaluation results... DONE (t=0.08s). 22 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.505 23 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.697 24 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.573 25 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.586 26 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.519 27 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.501 28 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.387 29 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.594 30 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.595 31 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.640 32 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.566 33 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.564 34 | 35 | ------------------------------------------------------------------------------ 36 | type=keypoints 37 | Running per image evaluation... DONE (t=0.06s). 38 | Accumulating evaluation results... DONE (t=0.00s). 
39 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.372 40 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.636 41 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.348 42 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.384 43 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.386 44 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.514 45 | Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.734 46 | Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.504 47 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.508 48 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.522 49 | -------------------------------------------------------------------------------- /evaluate_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from vocab import Vocabulary 3 | import evaluation_models 4 | 5 | # for coco 6 | print('Evaluation on COCO:') 7 | evaluation_models.evalrank("pretrain_model/coco/model_coco_1.pth.tar", "pretrain_model/coco/model_coco_2.pth.tar", data_path='data/', split="testall", fold5=True) 8 | 9 | # for flickr 10 | print('Evaluation on Flickr30K:') 11 | evaluation_models.evalrank("pretrain_model/flickr/model_fliker_1.pth.tar", "pretrain_model/flickr/model_fliker_2.pth.tar", data_path='data/', split="test", fold5=False) 12 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import pickle 4 | 5 | import torch 6 | import numpy 7 | from data import get_test_loader 8 | import time 9 | import numpy as np 10 | from vocab import Vocabulary # NOQA 11 | from model import VSRN, order_sim 12 | from collections import OrderedDict 13 | 14 | 15 | class AverageMeter(object): 16 | """Computes and stores the average and current value""" 17 | 18 | def __init__(self): 19 | self.reset() 20 | 21 | def reset(self): 22 | self.val = 0 23 | self.avg = 0 24 | self.sum = 0 25 | self.count = 0 26 | 27 | def update(self, val, n=0): 28 | self.val = val 29 | self.sum += val * n 30 | self.count += n 31 | self.avg = self.sum / (.0001 + self.count) 32 | 33 | def __str__(self): 34 | """String representation for logging 35 | """ 36 | # for values that should be recorded exactly e.g. 
iteration number 37 | if self.count == 0: 38 | return str(self.val) 39 | # for stats 40 | return '%.4f (%.4f)' % (self.val, self.avg) 41 | 42 | 43 | class LogCollector(object): 44 | """A collection of logging objects that can change from train to val""" 45 | 46 | def __init__(self): 47 | # to keep the order of logged variables deterministic 48 | self.meters = OrderedDict() 49 | 50 | def update(self, k, v, n=0): 51 | # create a new meter if previously not recorded 52 | if k not in self.meters: 53 | self.meters[k] = AverageMeter() 54 | self.meters[k].update(v, n) 55 | 56 | def __str__(self): 57 | """Concatenate the meters in one log line 58 | """ 59 | s = '' 60 | for i, (k, v) in enumerate(self.meters.iteritems()): 61 | if i > 0: 62 | s += ' ' 63 | s += k + ' ' + str(v) 64 | return s 65 | 66 | def tb_log(self, tb_logger, prefix='', step=None): 67 | """Log using tensorboard 68 | """ 69 | for k, v in self.meters.iteritems(): 70 | tb_logger.log_value(prefix + k, v.val, step=step) 71 | 72 | 73 | def encode_data(model, data_loader, log_step=10, logging=print): 74 | """Encode all images and captions loadable by `data_loader` 75 | """ 76 | batch_time = AverageMeter() 77 | val_logger = LogCollector() 78 | 79 | # switch to evaluate mode 80 | model.val_start() 81 | 82 | end = time.time() 83 | 84 | # numpy array to keep all the embeddings 85 | img_embs = None 86 | cap_embs = None 87 | for i, (images, captions, lengths, ids, caption_labels, caption_masks) in enumerate(data_loader): 88 | # make sure val logger is used 89 | model.logger = val_logger 90 | 91 | # compute the embeddings 92 | img_emb, cap_emb, GCN_img_emd = model.forward_emb(images, captions, lengths, 93 | volatile=True) 94 | 95 | # initialize the numpy arrays given the size of the embeddings 96 | if img_embs is None: 97 | img_embs = np.zeros((len(data_loader.dataset), img_emb.size(1))) 98 | cap_embs = np.zeros((len(data_loader.dataset), cap_emb.size(1))) 99 | 100 | # preserve the embeddings by copying from gpu and converting to numpy 101 | img_embs[ids] = img_emb.data.cpu().numpy().copy() 102 | cap_embs[ids] = cap_emb.data.cpu().numpy().copy() 103 | 104 | 105 | del images, captions 106 | 107 | return img_embs, cap_embs 108 | 109 | 110 | 111 | 112 | def evalrank(model_path, data_path=None, split='dev', fold5=False): 113 | """ 114 | Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold 115 | cross-validation is done (only for MSCOCO). Otherwise, the full data is 116 | used for evaluation. 
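Example (the checkpoint path is a placeholder for wherever a trained
    model was saved, mirroring the calls in evaluate_models.py):

        evalrank('runs/coco_vsrn/model_best.pth.tar',
                 data_path='data/', split='testall', fold5=True)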
117 | """ 118 | # load model and options 119 | checkpoint = torch.load(model_path) 120 | opt = checkpoint['opt'] 121 | if data_path is not None: 122 | opt.data_path = data_path 123 | 124 | # load vocabulary used by the model 125 | with open(os.path.join(opt.vocab_path, 126 | '%s_vocab.pkl' % opt.data_name), 'rb') as f: 127 | vocab = pickle.load(f) 128 | opt.vocab_size = len(vocab) 129 | 130 | # construct model 131 | model = VSRN(opt) 132 | 133 | # load model state 134 | model.load_state_dict(checkpoint['model']) 135 | 136 | print('Loading dataset') 137 | data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size, 138 | opt.batch_size, opt.workers, opt) 139 | 140 | print('Computing results...') 141 | img_embs, cap_embs = encode_data(model, data_loader) 142 | print('Images: %d, Captions: %d' % 143 | (img_embs.shape[0] / 5, cap_embs.shape[0])) 144 | 145 | if not fold5: 146 | # no cross-validation, full evaluation 147 | r, rt = i2t(img_embs, cap_embs, measure=opt.measure, return_ranks=True) 148 | ri, rti = t2i(img_embs, cap_embs, 149 | measure=opt.measure, return_ranks=True) 150 | ar = (r[0] + r[1] + r[2]) / 3 151 | ari = (ri[0] + ri[1] + ri[2]) / 3 152 | rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] 153 | print("rsum: %.1f" % rsum) 154 | print("Average i2t Recall: %.1f" % ar) 155 | print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r) 156 | print("Average t2i Recall: %.1f" % ari) 157 | print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri) 158 | else: 159 | # 5fold cross-validation, only for MSCOCO 160 | results = [] 161 | for i in range(5): 162 | r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000], 163 | cap_embs[i * 5000:(i + 1) * 164 | 5000], measure=opt.measure, 165 | return_ranks=True) 166 | print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r) 167 | ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000], 168 | cap_embs[i * 5000:(i + 1) * 169 | 5000], measure=opt.measure, 170 | return_ranks=True) 171 | if i == 0: 172 | rt, rti = rt0, rti0 173 | print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri) 174 | ar = (r[0] + r[1] + r[2]) / 3 175 | ari = (ri[0] + ri[1] + ri[2]) / 3 176 | rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] 177 | print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari)) 178 | results += [list(r) + list(ri) + [ar, ari, rsum]] 179 | 180 | print("-----------------------------------") 181 | print("Mean metrics: ") 182 | mean_metrics = tuple(np.array(results).mean(axis=0).flatten()) 183 | print("rsum: %.1f" % (mean_metrics[10] * 6)) 184 | print("Average i2t Recall: %.1f" % mean_metrics[11]) 185 | print("Image to text: %.1f %.1f %.1f %.1f %.1f" % 186 | mean_metrics[:5]) 187 | print("Average t2i Recall: %.1f" % mean_metrics[12]) 188 | print("Text to image: %.1f %.1f %.1f %.1f %.1f" % 189 | mean_metrics[5:10]) 190 | 191 | torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar') 192 | 193 | 194 | def i2t(images, captions, npts=None, measure='cosine', return_ranks=False): 195 | """ 196 | Images->Text (Image Annotation) 197 | Images: (5N, K) matrix of images 198 | Captions: (5N, K) matrix of captions 199 | """ 200 | if npts is None: 201 | npts = images.shape[0] / 5 202 | index_list = [] 203 | 204 | ranks = numpy.zeros(npts) 205 | top1 = numpy.zeros(npts) 206 | for index in range(npts): 207 | 208 | # Get query image 209 | im = images[5 * index].reshape(1, images.shape[1]) 210 | 211 | # Compute scores 212 | if measure == 'order': 213 | bs = 100 214 | if index % bs == 0: 215 | mx = min(images.shape[0], 5 * (index + bs)) 216 | im2 = images[5 * index:mx:5] 217 | d2 = 
order_sim(torch.Tensor(im2).cuda(), 218 | torch.Tensor(captions).cuda()) 219 | d2 = d2.cpu().numpy() 220 | d = d2[index % bs] 221 | else: 222 | d = numpy.dot(im, captions.T).flatten() 223 | inds = numpy.argsort(d)[::-1] 224 | index_list.append(inds[0]) 225 | 226 | # Score 227 | rank = 1e20 228 | for i in range(5 * index, 5 * index + 5, 1): 229 | tmp = numpy.where(inds == i)[0][0] 230 | if tmp < rank: 231 | rank = tmp 232 | ranks[index] = rank 233 | top1[index] = inds[0] 234 | 235 | # Compute metrics 236 | r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) 237 | r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks) 238 | r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks) 239 | medr = numpy.floor(numpy.median(ranks)) + 1 240 | meanr = ranks.mean() + 1 241 | if return_ranks: 242 | return (r1, r5, r10, medr, meanr), (ranks, top1) 243 | else: 244 | return (r1, r5, r10, medr, meanr) 245 | 246 | 247 | def t2i(images, captions, npts=None, measure='cosine', return_ranks=False): 248 | """ 249 | Text->Images (Image Search) 250 | Images: (5N, K) matrix of images 251 | Captions: (5N, K) matrix of captions 252 | """ 253 | if npts is None: 254 | npts = images.shape[0] / 5 255 | ims = numpy.array([images[i] for i in range(0, len(images), 5)]) 256 | 257 | ranks = numpy.zeros(5 * npts) 258 | top1 = numpy.zeros(5 * npts) 259 | for index in range(npts): 260 | 261 | # Get query captions 262 | queries = captions[5 * index:5 * index + 5] 263 | 264 | # Compute scores 265 | if measure == 'order': 266 | bs = 100 267 | if 5 * index % bs == 0: 268 | mx = min(captions.shape[0], 5 * index + bs) 269 | q2 = captions[5 * index:mx] 270 | d2 = order_sim(torch.Tensor(ims).cuda(), 271 | torch.Tensor(q2).cuda()) 272 | d2 = d2.cpu().numpy() 273 | 274 | d = d2[:, (5 * index) % bs:(5 * index) % bs + 5].T 275 | else: 276 | d = numpy.dot(queries, ims.T) 277 | inds = numpy.zeros(d.shape) 278 | for i in range(len(inds)): 279 | inds[i] = numpy.argsort(d[i])[::-1] 280 | ranks[5 * index + i] = numpy.where(inds[i] == index)[0][0] 281 | top1[5 * index + i] = inds[i][0] 282 | 283 | # Compute metrics 284 | r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) 285 | r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks) 286 | r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks) 287 | medr = numpy.floor(numpy.median(ranks)) + 1 288 | meanr = ranks.mean() + 1 289 | if return_ranks: 290 | return (r1, r5, r10, medr, meanr), (ranks, top1) 291 | else: 292 | return (r1, r5, r10, medr, meanr) 293 | -------------------------------------------------------------------------------- /fig/Q_i2t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/fig/Q_i2t.png -------------------------------------------------------------------------------- /fig/Q_t2i_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/fig/Q_t2i_2.png -------------------------------------------------------------------------------- /fig/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/fig/model.png -------------------------------------------------------------------------------- /fig/teaser.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/fig/teaser.png -------------------------------------------------------------------------------- /misc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /misc/cocoeval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Wrapper for evaluation on CIDEr, ROUGE_L, METEOR and Bleu_N 3 | using coco-caption repo https://github.com/tylin/coco-caption 4 | 5 | class COCOScorer is taken from https://github.com/yaoli/arctic-capgen-vid 6 | ''' 7 | 8 | import json 9 | import os 10 | import sys 11 | sys.path.append('coco-caption') 12 | 13 | from pycocoevalcap.bleu.bleu import Bleu 14 | from pycocoevalcap.rouge.rouge import Rouge 15 | from pycocoevalcap.cider.cider import Cider 16 | from pycocoevalcap.meteor.meteor import Meteor 17 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 18 | # Define a context manager to suppress stdout and stderr. 19 | 20 | 21 | class suppress_stdout_stderr: 22 | ''' 23 | A context manager for doing a "deep suppression" of stdout and stderr in 24 | Python, i.e. will suppress all print, even if the print originates in a 25 | compiled C/Fortran sub-function. 26 | This will not suppress raised exceptions, since exceptions are printed 27 | to stderr just before a script exits, and after the context manager has 28 | exited (at least, I think that is why it lets exceptions through). 29 | 30 | ''' 31 | 32 | def __init__(self): 33 | # Open a pair of null files 34 | self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] 35 | # Save the actual stdout (1) and stderr (2) file descriptors. 36 | self.save_fds = (os.dup(1), os.dup(2)) 37 | 38 | def __enter__(self): 39 | # Assign the null pointers to stdout and stderr. 40 | os.dup2(self.null_fds[0], 1) 41 | os.dup2(self.null_fds[1], 2) 42 | 43 | def __exit__(self, *_): 44 | # Re-assign the real stdout/stderr back to (1) and (2) 45 | os.dup2(self.save_fds[0], 1) 46 | os.dup2(self.save_fds[1], 2) 47 | # Close the null files 48 | os.close(self.null_fds[0]) 49 | os.close(self.null_fds[1]) 50 | 51 | 52 | class COCOScorer(object): 53 | def __init__(self): 54 | print('init COCO-EVAL scorer') 55 | 56 | def score(self, GT, RES, IDs): 57 | self.eval = {} 58 | self.imgToEval = {} 59 | gts = {} 60 | res = {} 61 | for ID in IDs: 62 | # print ID 63 | gts[ID] = GT[ID] 64 | res[ID] = RES[ID] 65 | print('tokenization...') 66 | tokenizer = PTBTokenizer() 67 | gts = tokenizer.tokenize(gts) 68 | res = tokenizer.tokenize(res) 69 | 70 | # ================================================= 71 | # Set up scorers 72 | # ================================================= 73 | print('setting up scorers...') 74 | scorers = [ 75 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 76 | (Meteor(),"METEOR"), 77 | (Rouge(), "ROUGE_L"), 78 | (Cider(), "CIDEr"), 79 | #(Spice(), "SPICE") 80 | ] 81 | 82 | # ================================================= 83 | # Compute scores 84 | # ================================================= 85 | eval = {} 86 | for scorer, method in scorers: 87 | print('computing %s score...' 
% (scorer.method())) 88 | score, scores = scorer.compute_score(gts, res) 89 | if type(method) == list: 90 | for sc, scs, m in zip(score, scores, method): 91 | self.setEval(sc, m) 92 | self.setImgToEvalImgs(scs, IDs, m) 93 | print("%s: %0.3f" % (m, sc)) 94 | else: 95 | self.setEval(score, method) 96 | self.setImgToEvalImgs(scores, IDs, method) 97 | print("%s: %0.3f" % (method, score)) 98 | 99 | # for metric, score in self.eval.items(): 100 | # print '%s: %.3f'%(metric, score) 101 | return self.eval 102 | 103 | def setEval(self, score, method): 104 | self.eval[method] = score 105 | 106 | def setImgToEvalImgs(self, scores, imgIds, method): 107 | for imgId, score in zip(imgIds, scores): 108 | if imgId not in self.imgToEval: 109 | self.imgToEval[imgId] = {} 110 | self.imgToEval[imgId]["image_id"] = imgId 111 | self.imgToEval[imgId][method] = score 112 | 113 | 114 | def score(ref, sample): 115 | # ref and sample are both dict 116 | scorers = [ 117 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 118 | (Rouge(), "ROUGE_L"), 119 | (Cider(), "CIDEr") 120 | ] 121 | final_scores = {} 122 | for scorer, method in scorers: 123 | print('computing %s score with COCO-EVAL...' % (scorer.method())) 124 | score, scores = scorer.compute_score(ref, sample) 125 | if type(score) == list: 126 | for m, s in zip(method, score): 127 | final_scores[m] = s 128 | else: 129 | final_scores[method] = score 130 | return final_scores 131 | 132 | 133 | def test_cocoscorer(): 134 | '''gts = { 135 | 184321:[ 136 | {u'image_id': 184321, u'id': 352188, u'caption': u'A train traveling down-tracks next to lights.'}, 137 | {u'image_id': 184321, u'id': 356043, u'caption': u"A blue and silver train next to train's station and trees."}, 138 | {u'image_id': 184321, u'id': 356382, u'caption': u'A blue train is next to a sidewalk on the rails.'}, 139 | {u'image_id': 184321, u'id': 361110, u'caption': u'A passenger train pulls into a train station.'}, 140 | {u'image_id': 184321, u'id': 362544, u'caption': u'A train coming down the tracks arriving at a station.'}], 141 | 81922: [ 142 | {u'image_id': 81922, u'id': 86779, u'caption': u'A large jetliner flying over a traffic filled street.'}, 143 | {u'image_id': 81922, u'id': 90172, u'caption': u'An airplane flies low in the sky over a city street. 
'}, 144 | {u'image_id': 81922, u'id': 91615, u'caption': u'An airplane flies over a street with many cars.'}, 145 | {u'image_id': 81922, u'id': 92689, u'caption': u'An airplane comes in to land over a road full of cars'}, 146 | {u'image_id': 81922, u'id': 823814, u'caption': u'The plane is flying over top of the cars'}] 147 | } 148 | 149 | samples = { 150 | 184321: [{u'image_id': 184321, 'id': 111, u'caption': u'train traveling down a track in front of a road'}], 151 | 81922: [{u'image_id': 81922, 'id': 219, u'caption': u'plane is flying through the sky'}], 152 | } 153 | ''' 154 | gts = { 155 | '184321': [ 156 | {u'image_id': '184321', u'cap_id': 0, u'caption': u'A train traveling down tracks next to lights.', 157 | 'tokenized': 'a train traveling down tracks next to lights'}, 158 | {u'image_id': '184321', u'cap_id': 1, u'caption': u'A train coming down the tracks arriving at a station.', 159 | 'tokenized': 'a train coming down the tracks arriving at a station'}], 160 | '81922': [ 161 | {u'image_id': '81922', u'cap_id': 0, u'caption': u'A large jetliner flying over a traffic filled street.', 162 | 'tokenized': 'a large jetliner flying over a traffic filled street'}, 163 | {u'image_id': '81922', u'cap_id': 1, u'caption': u'The plane is flying over top of the cars', 164 | 'tokenized': 'the plan is flying over top of the cars'}, ] 165 | } 166 | 167 | samples = { 168 | '184321': [{u'image_id': '184321', u'caption': u'train traveling down a track in front of a road'}], 169 | '81922': [{u'image_id': '81922', u'caption': u'plane is flying through the sky'}], 170 | } 171 | IDs = ['184321', '81922'] 172 | scorer = COCOScorer() 173 | scorer.score(gts, samples, IDs) 174 | 175 | 176 | if __name__ == '__main__': 177 | test_cocoscorer() 178 | -------------------------------------------------------------------------------- /misc/rewards.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | import torch 4 | import sys 5 | sys.path.append("coco-caption") 6 | from pyciderevalcap.ciderD.ciderD import CiderD 7 | 8 | CiderD_scorer = None 9 | # CiderD_scorer = CiderD(df='corpus') 10 | 11 | 12 | def init_cider_scorer(cached_tokens): 13 | global CiderD_scorer 14 | CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens) 15 | 16 | 17 | def array_to_str(arr): 18 | out = '' 19 | for i in range(len(arr)): 20 | out += str(arr[i]) + ' ' 21 | if arr[i] == 0: 22 | break 23 | return out.strip() 24 | 25 | 26 | def get_self_critical_reward(model, fc_feats, data, gen_result): 27 | batch_size = gen_result.size(0) 28 | 29 | # get greedy decoding baseline 30 | _, greedy_res = model(fc_feats, mode='inference') 31 | 32 | res = OrderedDict() 33 | 34 | gen_result = gen_result.cpu().data.numpy() 35 | greedy_res = greedy_res.cpu().data.numpy() 36 | for i in range(batch_size): 37 | res[i] = [array_to_str(gen_result[i])] 38 | for i in range(batch_size): 39 | res[batch_size + i] = [array_to_str(greedy_res[i])] 40 | 41 | gts = OrderedDict() 42 | for i in range(data['gts'].size(0)): 43 | gts[i] = [array_to_str(data['gts'][i][j]) 44 | for j in range(data['gts'].size(1))] 45 | 46 | res = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)] 47 | gts = {i: gts[i % batch_size] for i in range(2 * batch_size)} 48 | _, scores = CiderD_scorer.compute_score(gts, res) 49 | print('Cider scores:', _) 50 | 51 | scores = scores[:batch_size] - scores[batch_size:] 52 | 53 | rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1) 54 
| 55 | return rewards 56 | -------------------------------------------------------------------------------- /misc/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | 6 | # Input: seq, N*D numpy array, with element 0 .. vocab_size. 0 is END token. 7 | def decode_sequence(ix_to_word, seq): 8 | seq = seq.cpu() 9 | N, D = seq.size() 10 | out = [] 11 | for i in range(N): 12 | txt = '' 13 | for j in range(D): 14 | ix = seq[i, j].item() 15 | if ix > 0: 16 | if j >= 1: 17 | txt = txt + ' ' 18 | txt = txt + ix_to_word[str(ix)] 19 | else: 20 | break 21 | out.append(txt) 22 | return out 23 | 24 | 25 | class RewardCriterion(nn.Module): 26 | 27 | def __init__(self): 28 | super(RewardCriterion, self).__init__() 29 | 30 | def forward(self, input, seq, reward): 31 | input = input.contiguous().view(-1) 32 | reward = reward.contiguous().view(-1) 33 | mask = (seq > 0).float() 34 | mask = torch.cat([mask.new(mask.size(0), 1).fill_(1).cuda(), 35 | mask[:, :-1]], 1).contiguous().view(-1) 36 | output = - input * reward * mask 37 | output = torch.sum(output) / torch.sum(mask) 38 | 39 | return output 40 | 41 | 42 | class LanguageModelCriterion(nn.Module): 43 | 44 | def __init__(self): 45 | super(LanguageModelCriterion, self).__init__() 46 | self.loss_fn = nn.NLLLoss(reduce=False) 47 | 48 | def forward(self, logits, target, mask): 49 | """ 50 | logits: shape of (N, seq_len, vocab_size) 51 | target: shape of (N, seq_len) 52 | mask: shape of (N, seq_len) 53 | """ 54 | # truncate to the same size 55 | batch_size = logits.shape[0] 56 | target = target[:, :logits.shape[1]] 57 | mask = mask[:, :logits.shape[1]] 58 | logits = logits.contiguous().view(-1, logits.shape[2]) 59 | target = target.contiguous().view(-1) 60 | mask = mask.contiguous().view(-1) 61 | loss = self.loss_fn(logits, target) 62 | output = torch.sum(loss * mask) / batch_size 63 | return output -------------------------------------------------------------------------------- /models/Attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Attention(nn.Module): 7 | """ 8 | Applies an attention mechanism on the output features from the decoder. 
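A shape sketch with made-up sizes: for dim=512, batch_size=2 and
    seq_len=5,

        attn = Attention(512)                          # hypothetical sizes
        context = attn(hidden_state, encoder_outputs)  # (2, 512), (2, 5, 512) -> (2, 512)

    the forward pass below scores each encoder step with
    linear2(tanh(linear1([hidden; step]))), softmax-normalizes the scores
    over the 5 steps, and returns the attention-weighted sum of
    encoder_outputs as the context vector.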
9 | """ 10 | 11 | def __init__(self, dim): 12 | super(Attention, self).__init__() 13 | self.dim = dim 14 | self.linear1 = nn.Linear(dim * 2, dim) 15 | self.linear2 = nn.Linear(dim, 1, bias=False) 16 | #self._init_hidden() 17 | 18 | def _init_hidden(self): 19 | nn.init.xavier_normal_(self.linear1.weight) 20 | nn.init.xavier_normal_(self.linear2.weight) 21 | 22 | def forward(self, hidden_state, encoder_outputs): 23 | """ 24 | Arguments: 25 | hidden_state {Variable} -- batch_size x dim 26 | encoder_outputs {Variable} -- batch_size x seq_len x dim 27 | 28 | Returns: 29 | Variable -- context vector of size batch_size x dim 30 | """ 31 | batch_size, seq_len, _ = encoder_outputs.size() 32 | hidden_state = hidden_state.unsqueeze(1).repeat(1, seq_len, 1) 33 | inputs = torch.cat((encoder_outputs, hidden_state), 34 | 2).view(-1, self.dim * 2) 35 | o = self.linear2(F.tanh(self.linear1(inputs))) 36 | e = o.view(batch_size, seq_len) 37 | alpha = F.softmax(e, dim=1) 38 | context = torch.bmm(alpha.unsqueeze(1), encoder_outputs).squeeze(1) 39 | return context 40 | -------------------------------------------------------------------------------- /models/DecoderRNN.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from .Attention import Attention 8 | 9 | 10 | class DecoderRNN(nn.Module): 11 | """ 12 | Provides functionality for decoding in a seq2seq framework, with an option for attention. 13 | Args: 14 | vocab_size (int): size of the vocabulary 15 | max_len (int): a maximum allowed length for the sequence to be processed 16 | dim_hidden (int): the number of features in the hidden state `h` 17 | n_layers (int, optional): number of recurrent layers (default: 1) 18 | rnn_cell (str, optional): type of RNN cell (default: gru) 19 | bidirectional (bool, optional): if the encoder is bidirectional (default False) 20 | input_dropout_p (float, optional): dropout probability for the input sequence (default: 0) 21 | rnn_dropout_p (float, optional): dropout probability for the output sequence (default: 0) 22 | 23 | """ 24 | 25 | def __init__(self, 26 | vocab_size, 27 | max_len, 28 | dim_hidden, 29 | dim_word, 30 | n_layers=1, 31 | rnn_cell='gru', 32 | bidirectional=False, 33 | input_dropout_p=0.1, 34 | rnn_dropout_p=0.1): 35 | super(DecoderRNN, self).__init__() 36 | 37 | self.bidirectional_encoder = bidirectional 38 | 39 | self.dim_output = vocab_size 40 | self.dim_hidden = dim_hidden * 2 if bidirectional else dim_hidden 41 | self.dim_word = dim_word 42 | self.max_length = max_len 43 | self.sos_id = 1 44 | self.eos_id = 0 45 | self.input_dropout = nn.Dropout(input_dropout_p) 46 | self.embedding = nn.Embedding(self.dim_output, dim_word) 47 | self.attention = Attention(self.dim_hidden) 48 | if rnn_cell.lower() == 'lstm': 49 | self.rnn_cell = nn.LSTM 50 | elif rnn_cell.lower() == 'gru': 51 | self.rnn_cell = nn.GRU 52 | self.rnn = self.rnn_cell( 53 | self.dim_hidden + dim_word, 54 | self.dim_hidden, 55 | n_layers, 56 | batch_first=True, 57 | dropout=rnn_dropout_p) 58 | 59 | self.out = nn.Linear(self.dim_hidden, self.dim_output) 60 | 61 | self._init_weights() 62 | 63 | def forward(self, 64 | encoder_outputs, 65 | encoder_hidden, 66 | targets=None, 67 | mode='train', 68 | opt={}): 69 | """ 70 | 71 | Inputs: inputs, encoder_hidden, encoder_outputs, function, teacher_forcing_ratio 72 | - **encoder_hidden** (num_layers * num_directions, batch_size, dim_hidden): tensor containing the 
features in the 73 | hidden state `h` of encoder. Used as the initial hidden state of the decoder. (default `None`) 74 | - **encoder_outputs** (batch, seq_len, dim_hidden * num_directions): (default is `None`). 75 | - **targets** (batch, max_length): targets labels of the ground truth sentences 76 | 77 | Outputs: seq_probs, 78 | - **seq_logprobs** (batch_size, max_length, vocab_size): tensors containing the outputs of the decoding function. 79 | - **seq_preds** (batch_size, max_length): predicted symbols 80 | """ 81 | sample_max = opt.get('sample_max', 1) 82 | beam_size = opt.get('beam_size', 1) 83 | temperature = opt.get('temperature', 1.0) 84 | 85 | batch_size, _, _ = encoder_outputs.size() 86 | decoder_hidden = self._init_rnn_state(encoder_hidden) 87 | 88 | seq_logprobs = [] 89 | seq_preds = [] 90 | self.rnn.flatten_parameters() 91 | if mode == 'train': 92 | # use targets as rnn inputs 93 | # print(targets) 94 | targets_emb = self.embedding(targets) 95 | for i in range(self.max_length - 1): 96 | current_words = targets_emb[:, i, :] 97 | context = self.attention(decoder_hidden.squeeze(0), encoder_outputs) 98 | decoder_input = torch.cat([current_words, context], dim=1) 99 | decoder_input = self.input_dropout(decoder_input).unsqueeze(1) 100 | decoder_output, decoder_hidden = self.rnn( 101 | decoder_input, decoder_hidden) 102 | logprobs = F.log_softmax( 103 | self.out(decoder_output.squeeze(1)), dim=1) 104 | seq_logprobs.append(logprobs.unsqueeze(1)) 105 | 106 | seq_logprobs = torch.cat(seq_logprobs, 1) 107 | 108 | elif mode == 'inference': 109 | if beam_size > 1: 110 | return self.sample_beam(encoder_outputs, decoder_hidden, opt) 111 | 112 | for t in range(self.max_length - 1): 113 | context = self.attention( 114 | decoder_hidden.squeeze(0), encoder_outputs) 115 | 116 | if t == 0: # input 117 | it = torch.LongTensor([self.sos_id] * batch_size).cuda() 118 | elif sample_max: 119 | sampleLogprobs, it = torch.max(logprobs, 1) 120 | seq_logprobs.append(sampleLogprobs.view(-1, 1)) 121 | it = it.view(-1).long() 122 | 123 | else: 124 | # sample according to distribuition 125 | if temperature == 1.0: 126 | prob_prev = torch.exp(logprobs) 127 | else: 128 | # scale logprobs by temperature 129 | prob_prev = torch.exp(torch.div(logprobs, temperature)) 130 | it = torch.multinomial(prob_prev, 1).cuda() 131 | sampleLogprobs = logprobs.gather(1, it) 132 | seq_logprobs.append(sampleLogprobs.view(-1, 1)) 133 | it = it.view(-1).long() 134 | 135 | seq_preds.append(it.view(-1, 1)) 136 | 137 | xt = self.embedding(it) 138 | decoder_input = torch.cat([xt, context], dim=1) 139 | decoder_input = self.input_dropout(decoder_input).unsqueeze(1) 140 | decoder_output, decoder_hidden = self.rnn( 141 | decoder_input, decoder_hidden) 142 | logprobs = F.log_softmax( 143 | self.out(decoder_output.squeeze(1)), dim=1) 144 | 145 | seq_logprobs = torch.cat(seq_logprobs, 1) 146 | seq_preds = torch.cat(seq_preds[1:], 1) 147 | 148 | return seq_logprobs, seq_preds 149 | 150 | def _init_weights(self): 151 | """ init the weight of some layers 152 | """ 153 | nn.init.xavier_normal_(self.out.weight) 154 | 155 | def _init_rnn_state(self, encoder_hidden): 156 | """ Initialize the encoder hidden state. 
""" 157 | if encoder_hidden is None: 158 | return None 159 | if isinstance(encoder_hidden, tuple): 160 | encoder_hidden = tuple( 161 | [self._cat_directions(h) for h in encoder_hidden]) 162 | else: 163 | encoder_hidden = self._cat_directions(encoder_hidden) 164 | return encoder_hidden 165 | 166 | def _cat_directions(self, h): 167 | """ If the encoder is bidirectional, do the following transformation. 168 | (#directions * #layers, #batch, dim_hidden) -> (#layers, #batch, #directions * dim_hidden) 169 | """ 170 | if self.bidirectional_encoder: 171 | h = torch.cat([h[0:h.size(0):2], h[1:h.size(0):2]], 2) 172 | return h 173 | -------------------------------------------------------------------------------- /models/EncoderRNN.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class EncoderRNN(nn.Module): 5 | def __init__(self, dim_vid, dim_hidden, input_dropout_p=0.2, rnn_dropout_p=0.5, 6 | n_layers=1, bidirectional=False, rnn_cell='gru'): 7 | """ 8 | 9 | Args: 10 | hidden_dim (int): dim of hidden state of rnn 11 | input_dropout_p (int): dropout probability for the input sequence 12 | dropout_p (float): dropout probability for the output sequence 13 | n_layers (int): number of rnn layers 14 | rnn_cell (str): type of RNN cell ('LSTM'/'GRU') 15 | """ 16 | super(EncoderRNN, self).__init__() 17 | self.dim_vid = dim_vid 18 | self.dim_hidden = dim_hidden 19 | self.input_dropout_p = input_dropout_p 20 | self.rnn_dropout_p = rnn_dropout_p 21 | self.n_layers = n_layers 22 | self.bidirectional = bidirectional 23 | self.rnn_cell = rnn_cell 24 | 25 | self.vid2hid = nn.Linear(dim_vid, dim_hidden) 26 | self.input_dropout = nn.Dropout(input_dropout_p) 27 | 28 | if rnn_cell.lower() == 'lstm': 29 | self.rnn_cell = nn.LSTM 30 | elif rnn_cell.lower() == 'gru': 31 | self.rnn_cell = nn.GRU 32 | 33 | self.rnn = self.rnn_cell(dim_hidden, dim_hidden, n_layers, batch_first=True, 34 | bidirectional=bidirectional, dropout=self.rnn_dropout_p) 35 | 36 | self._init_hidden() 37 | 38 | def _init_hidden(self): 39 | nn.init.xavier_normal_(self.vid2hid.weight) 40 | 41 | def forward(self, vid_feats): 42 | """ 43 | Applies a multi-layer RNN to an input sequence. 44 | Args: 45 | input_var (batch, seq_len): tensor containing the features of the input sequence. 
--------------------------------------------------------------------------------
/models/S2VTAttModel.py:
--------------------------------------------------------------------------------
import torch.nn as nn


class S2VTAttModel(nn.Module):
    def __init__(self, encoder, decoder):
        """
        Args:
            encoder (nn.Module): Encoder rnn
            decoder (nn.Module): Decoder rnn
        """
        super(S2VTAttModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, vid_feats, target_variable=None,
                mode='train', opt={}):
        """
        Args:
            vid_feats (Variable): video feats of shape [batch_size, seq_len, dim_vid]
            target_variable (None, optional): ground-truth labels

        Returns:
            seq_prob: Variable of shape [batch_size, max_len-1, vocab_size]
            seq_preds: [] or Variable of shape [batch_size, max_len-1]
        """
        encoder_outputs, encoder_hidden = self.encoder(vid_feats)
        seq_prob, seq_preds = self.decoder(
            encoder_outputs, encoder_hidden, target_variable, mode, opt)
        return seq_prob, seq_preds
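Since S2VTAttModel only routes tensors between its two modules, the wiring can be sketched with a stub in place of the full DecoderRNN (whose constructor is defined earlier in models/DecoderRNN.py and not repeated here); the stub merely honors the (encoder_outputs, encoder_hidden, targets, mode, opt) interface:

import torch
import torch.nn as nn
from models import EncoderRNN, S2VTAttModel


class StubDecoder(nn.Module):
    """Stand-in that only demonstrates the interface S2VTAttModel expects."""

    def forward(self, encoder_outputs, encoder_hidden, targets, mode, opt):
        batch_size = encoder_outputs.size(0)
        # (seq_prob, seq_preds), here for max_len=28 and a toy 100-word vocab
        return torch.zeros(batch_size, 27, 100), []


model = S2VTAttModel(EncoderRNN(dim_vid=2048, dim_hidden=512), StubDecoder())
seq_prob, seq_preds = model(torch.randn(8, 40, 2048), mode='train')
print(seq_prob.shape)  # (8, 27, 100)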
--------------------------------------------------------------------------------
/models/S2VTModel.py:
--------------------------------------------------------------------------------
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable


class S2VTModel(nn.Module):
    def __init__(self, vocab_size, max_len, dim_hidden, dim_word, dim_vid=2048, sos_id=1, eos_id=0,
                 n_layers=1, rnn_cell='gru', rnn_dropout_p=0.2):
        super(S2VTModel, self).__init__()
        if rnn_cell.lower() == 'lstm':
            self.rnn_cell = nn.LSTM
        elif rnn_cell.lower() == 'gru':
            self.rnn_cell = nn.GRU
        self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers,
                                  batch_first=True, dropout=rnn_dropout_p)
        self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers,
                                  batch_first=True, dropout=rnn_dropout_p)

        self.dim_vid = dim_vid
        self.dim_output = vocab_size
        self.dim_hidden = dim_hidden
        self.dim_word = dim_word
        self.max_length = max_len
        self.sos_id = sos_id
        self.eos_id = eos_id
        self.embedding = nn.Embedding(self.dim_output, self.dim_word)

        self.out = nn.Linear(self.dim_hidden, self.dim_output)

    def forward(self, vid_feats, target_variable=None,
                mode='train', opt={}):
        batch_size, n_frames, _ = vid_feats.shape
        # zero tensors that pad the word channel during encoding
        # and the frame channel during decoding
        padding_words = Variable(vid_feats.data.new(batch_size, n_frames, self.dim_word)).zero_()
        padding_frames = Variable(vid_feats.data.new(batch_size, 1, self.dim_vid)).zero_()
        state1 = None
        state2 = None
        # encoding stage: feed the video features, padded with zero word vectors
        output1, state1 = self.rnn1(vid_feats, state1)
        input2 = torch.cat((output1, padding_words), dim=2)
        output2, state2 = self.rnn2(input2, state2)

        seq_probs = []
        seq_preds = []
        if mode == 'train':
            for i in range(self.max_length - 1):
                # the final <eos> token is not fed back into the network
                current_words = self.embedding(target_variable[:, i])
                self.rnn1.flatten_parameters()
                self.rnn2.flatten_parameters()
                output1, state1 = self.rnn1(padding_frames, state1)
                input2 = torch.cat(
                    (output1, current_words.unsqueeze(1)), dim=2)
                output2, state2 = self.rnn2(input2, state2)
                logits = self.out(output2.squeeze(1))
                logits = F.log_softmax(logits, dim=1)
                seq_probs.append(logits.unsqueeze(1))
            seq_probs = torch.cat(seq_probs, 1)
        else:
            current_words = self.embedding(
                Variable(torch.LongTensor([self.sos_id] * batch_size)).cuda())
            for i in range(self.max_length - 1):
                self.rnn1.flatten_parameters()
                self.rnn2.flatten_parameters()
                output1, state1 = self.rnn1(padding_frames, state1)
                input2 = torch.cat(
                    (output1, current_words.unsqueeze(1)), dim=2)
                output2, state2 = self.rnn2(input2, state2)
                logits = self.out(output2.squeeze(1))
                logits = F.log_softmax(logits, dim=1)
                seq_probs.append(logits.unsqueeze(1))
                _, preds = torch.max(logits, 1)
                current_words = self.embedding(preds)
                seq_preds.append(preds.unsqueeze(1))
            seq_probs = torch.cat(seq_probs, 1)
            seq_preds = torch.cat(seq_preds, 1)
        return seq_probs, seq_preds
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
from .EncoderRNN import EncoderRNN
from .DecoderRNN import DecoderRNN
from .S2VTAttModel import S2VTAttModel
from .S2VTModel import S2VTModel
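S2VTModel implements the classic two-layer S2VT scheme: rnn1 reads frames while the word channel is zero-padded, then rnn2 reads words while the frame channel is zero-padded. A minimal CPU sketch of the training-mode call (not repository code; assumes a toy 100-word vocabulary and random features):

import torch
from models import S2VTModel

model = S2VTModel(vocab_size=100, max_len=28, dim_hidden=512, dim_word=512)
vid_feats = torch.randn(8, 40, 2048)      # (batch, n_frames, dim_vid)
targets = torch.randint(0, 100, (8, 28))  # ground-truth word ids

seq_probs, seq_preds = model(vid_feats, targets, mode='train')
print(seq_probs.shape)  # (8, 27, 100): one distribution per generated step
print(seq_preds)        # [] in train mode; filled only during inference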
--------------------------------------------------------------------------------
/opts.py:
--------------------------------------------------------------------------------
import argparse


def parse_opt():
    parser = argparse.ArgumentParser()
    # Data input settings
    parser.add_argument(
        '--input_json',
        type=str,
        default='data/videodatainfo_2017.json',
        help='path to the json file containing video info')
    parser.add_argument(
        '--info_json',
        type=str,
        default='data/info.json',
        help='path to the json file containing additional info and vocab')
    parser.add_argument(
        '--caption_json',
        type=str,
        default='data/caption.json',
        help='path to the processed video caption json')
    parser.add_argument(
        '--feats_dir',
        nargs='*',
        type=str,
        default=['data/feats/resnet152/'],
        help='path to the directory containing the preprocessed fc feats')
    parser.add_argument('--c3d_feats_dir', type=str, default='data/c3d_feats')
    parser.add_argument(
        '--with_c3d', type=int, default=0, help='whether to use c3d features')
    parser.add_argument(
        '--cached_tokens',
        type=str,
        default='msr-all-idxs',
        help='Cached token file for calculating cider score \
                during self critical training.')

    # Model settings
    parser.add_argument(
        "--model", type=str, default='S2VTModel', help="which model to use")
    parser.add_argument(
        "--max_len",
        type=int,
        default=28,
        help='max length of captions (including <sos> and <eos>)')
    parser.add_argument(
        "--bidirectional",
        type=int,
        default=0,
        help="0 to disable, 1 to enable a bidirectional encoder/decoder")
    parser.add_argument(
        '--dim_hidden',
        type=int,
        default=512,
        help='size of the rnn hidden layer')
    parser.add_argument(
        '--num_layers', type=int, default=1, help='number of layers in the RNN')
    parser.add_argument(
        '--input_dropout_p',
        type=float,
        default=0.2,
        help='strength of dropout on the inputs of the language model RNN')
    parser.add_argument(
        '--rnn_type', type=str, default='gru', help='lstm or gru')
    parser.add_argument(
        '--rnn_dropout_p',
        type=float,
        default=0.5,
        help='strength of dropout inside the language model RNN')
    parser.add_argument(
        '--dim_word',
        type=int,
        default=512,
        help='the encoding size of each token in the vocabulary, and the video')
    parser.add_argument(
        '--dim_vid',
        type=int,
        default=2048,
        help='dim of features of video frames')

    # Optimization: General
    parser.add_argument(
        '--epochs', type=int, default=6001, help='number of epochs')
    parser.add_argument(
        '--batch_size', type=int, default=128, help='minibatch size')
    parser.add_argument(
        '--grad_clip',
        type=float,
        default=5,
        help='clip gradients at this value')
    parser.add_argument(
        '--self_crit_after',
        type=int,
        default=-1,
        help='After what epoch do we start self-critical training? \
                (-1 = disable, 0 = use from the start)')
    parser.add_argument(
        '--learning_rate', type=float, default=4e-4, help='learning rate')
    parser.add_argument(
        '--learning_rate_decay_every',
        type=int,
        default=200,
        help='decay the learning rate every this many epochs')
    parser.add_argument('--learning_rate_decay_rate', type=float, default=0.8)
    parser.add_argument(
        '--optim_alpha', type=float, default=0.9, help='alpha for adam')
    parser.add_argument(
        '--optim_beta', type=float, default=0.999, help='beta used for adam')
    parser.add_argument(
        '--optim_epsilon',
        type=float,
        default=1e-8,
        help='epsilon that goes into denominator for smoothing')
    parser.add_argument(
        '--weight_decay',
        type=float,
        default=5e-4,
        help='weight decay; strength of weight regularization')
    parser.add_argument(
        '--save_checkpoint_every',
        type=int,
        default=50,
        help='how often to save a model checkpoint (in epochs)')
    parser.add_argument(
        '--checkpoint_path',
        type=str,
        default='save',
        help='directory to store checkpointed models')
    parser.add_argument(
        '--gpu', type=str, default='0', help='gpu device number')

    args = parser.parse_args()

    return args
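parse_opt() takes everything from the command line, so a training script only needs to read the returned namespace. A hypothetical invocation (the script name train.py and the chosen flag values are illustrative, not fixed by this file):

# python train.py --model S2VTAttModel --dim_hidden 512 --batch_size 128 \
#                 --epochs 6001 --learning_rate 4e-4 --gpu 0
import os
from opts import parse_opt

opt = parse_opt()
os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu  # '--gpu' selects the device
print(opt.model, opt.dim_hidden, opt.max_len)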
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
Cython 0.29.2
dask 1.1.1
decorator 4.3.2
enum34 1.1.6
grpcio 1.18.0
h5py 2.9.0
lmdb 0.94
matplotlib 2.2.3 py27hb69df0a_0
mkl 2019.0 118
networkx 2.2
nltk 3.4
numpy 1.15.2 py27h1d66e8a_1
opencv 3.4.2 py27h6fd60c2_1
pillow 5.3.0 py27h34e0f95_0
protobuf 3.6.1
py-opencv 3.4.2 py27hb342d67_1
python 2.7.15 h1571d57_0
pytorch 0.4.1 py27__9.0.176_7.1.2_2
scikit-image 0.14.2
scipy 1.2.0
setuptools 40.4.3 py27_0
tensorboard 1.12.2
tensorboard-logger 0.1.0
torchvision 0.2.1 py27_1
--------------------------------------------------------------------------------
/vocab.py:
--------------------------------------------------------------------------------
# Create a vocabulary wrapper
import nltk
import pickle
from collections import Counter
from pycocotools.coco import COCO
import json
import argparse
import os

annotations = {
    'coco_precomp': ['train_caps.txt', 'dev_caps.txt'],
    'coco': ['annotations/captions_train2014.json',
             'annotations/captions_val2014.json'],
    'f8k_precomp': ['train_caps.txt', 'dev_caps.txt'],
    '10crop_precomp': ['train_caps.txt', 'dev_caps.txt'],
    'f30k_precomp': ['train_caps.txt', 'dev_caps.txt'],
    'f8k': ['dataset_flickr8k.json'],
    'f30k': ['dataset_flickr30k.json'],
}


class Vocabulary(object):
    """Simple vocabulary wrapper."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)


def from_coco_json(path):
    coco = COCO(path)
    ids = coco.anns.keys()
    captions = []
    for i, idx in enumerate(ids):
        captions.append(str(coco.anns[idx]['caption']))

    return captions


def from_flickr_json(path):
    dataset = json.load(open(path, 'r'))['images']
    captions = []
    for i, d in enumerate(dataset):
        captions += [str(x['raw']) for x in d['sentences']]

    return captions


def from_txt(txt):
    captions = []
    with open(txt, 'rb') as f:
        for line in f:
            captions.append(line.strip())
    return captions


def build_vocab(data_path, data_name, jsons, threshold):
    """Build a simple vocabulary wrapper."""
    counter = Counter()
    for path in jsons[data_name]:
        full_path = os.path.join(os.path.join(data_path, data_name), path)
        if data_name == 'coco':
            captions = from_coco_json(full_path)
        elif data_name == 'f8k' or data_name == 'f30k':
            captions = from_flickr_json(full_path)
        else:
            captions = from_txt(full_path)
        for i, caption in enumerate(captions):
            tokens = nltk.tokenize.word_tokenize(
                caption.lower().decode('utf-8'))
            counter.update(tokens)

            if i % 1000 == 0:
                print("[%d/%d] tokenized the captions." % (i, len(captions)))

    # Discard words whose occurrence count is less than the threshold.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab


def main(data_path, data_name):
    vocab = build_vocab(data_path, data_name, jsons=annotations, threshold=4)
    with open('./vocab/%s_vocab.pkl' % data_name, 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)
    print("Saved vocabulary file to ", './vocab/%s_vocab.pkl' % data_name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/w/31/faghri/vsepp_data/')
    parser.add_argument('--data_name', default='coco',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    opt = parser.parse_args()
    main(opt.data_path, opt.data_name)
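A sketch of building and reloading a vocabulary with the helpers above (the data path is a placeholder; the script targets the repo's Python 2.7 environment from requirement.txt, where str.decode exists):

import pickle
from vocab import Vocabulary, build_vocab, annotations

# Build from e.g. /path/to/data/f30k/dataset_flickr30k.json (placeholder path).
vocab = build_vocab('/path/to/data', 'f30k', jsons=annotations, threshold=4)
print(len(vocab), vocab('dog'), vocab('zzz-unseen-word'))  # unseen words map to <unk>

# Reload a pickled vocabulary the way the training code would;
# importing Vocabulary above makes the class resolvable during unpickling.
with open('./vocab/f30k_vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)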
--------------------------------------------------------------------------------
/vocab/10crop_precomp_vocab.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/vocab/10crop_precomp_vocab.pkl
--------------------------------------------------------------------------------
/vocab/coco_precomp_vocab.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/vocab/coco_precomp_vocab.pkl
--------------------------------------------------------------------------------
/vocab/coco_vocab.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/vocab/coco_vocab.pkl
--------------------------------------------------------------------------------
/vocab/f30k_precomp_vocab.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/vocab/f30k_precomp_vocab.pkl
--------------------------------------------------------------------------------
/vocab/f30k_vocab.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/vocab/f30k_vocab.pkl
--------------------------------------------------------------------------------
/vocab/f8k_precomp_vocab.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/vocab/f8k_precomp_vocab.pkl
--------------------------------------------------------------------------------
/vocab/f8k_vocab.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunpengLi1994/VSRN/777ae74326fdb6abe69dbd3911d0e545322520d1/vocab/f8k_vocab.pkl
--------------------------------------------------------------------------------