├── README.md ├── __init__.py ├── bleu ├── LICENSE ├── __init__.py ├── bleu.py └── bleu_scorer.py ├── cider ├── __init__.py ├── cider.py └── cider_scorer.py ├── eval.py ├── license.txt ├── meteor ├── __init__.py ├── data │ └── paraphrase-en.gz ├── meteor-1.5.jar └── meteor.py ├── rouge ├── __init__.py └── rouge.py └── tokenizer ├── __init__.py ├── ptbtokenizer.py └── stanford-corenlp-3.4.1.jar /README.md: -------------------------------------------------------------------------------- 1 | Microsoft COCO Caption Evaluation Tools
2 | --- 3 | 4 | The original code has been modified to work with Python 3.
5 | 6 | ### Requirements 7 | * Python 3.x 8 | * Java 1.8 9 | * pycocotools 10 | 11 | --- 12 | 13 | ### Tested on 14 | * Windows 10, Python 3.5. 15 | 16 | --- 17 | ### To fix Windows JVM memory error:
18 | Add the following entry under System Variables:
19 |     Variable name : _JAVA_OPTIONS
20 |     Variable value : -Xmx1024M
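If you prefer not to edit system settings, the same limit can be applied per process. This is an untested sketch: `_JAVA_OPTIONS` set in `os.environ` is inherited by the Java subprocess that the METEOR wrapper launches, so it must run before the `Meteor` scorer is constructed.

    import os

    # Cap the JVM heap for any Java process started from this Python process.
    os.environ['_JAVA_OPTIONS'] = '-Xmx1024M'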
21 | 22 | --- 23 | Original code : https://github.com/tylin/coco-caption
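---
### Usage

A minimal sketch of running the full evaluation (Bleu, METEOR, ROUGE-L, CIDEr). It assumes this repository is importable as a package named `pycocoevalcap`, and `annotations.json` / `results.json` are placeholder paths to COCO-format ground-truth and result files:

    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap  # package name assumed

    # Ground-truth captions and generated captions in COCO format (placeholder paths).
    coco = COCO('annotations.json')
    coco_res = coco.loadRes('results.json')

    # COCOEvalCap tokenizes both sets with the PTB tokenizer and runs every scorer in eval.py.
    coco_eval = COCOEvalCap(coco, coco_res)
    coco_eval.evaluate()

    # Corpus-level score for each metric (Bleu_1..Bleu_4, METEOR, ROUGE_L, CIDEr).
    for metric, score in coco_eval.eval.items():
        print('%s: %0.3f' % (metric, score))

Per-image scores are available in `coco_eval.imgToEval` after `evaluate()` returns.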
24 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' -------------------------------------------------------------------------------- /bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' -------------------------------------------------------------------------------- /bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 12 | # By Sabarish Sivanath 13 | # To support Python 3 14 | 15 | from .bleu_scorer import BleuScorer 16 | 17 | 18 | class Bleu: 19 | def __init__(self, n=4): 20 | # default compute Blue score up to 4 21 | self._n = n 22 | self._hypo_for_image = {} 23 | self.ref_for_image = {} 24 | 25 | def compute_score(self, gts, res, score_option = 'closest', verbose = 1): 26 | ''' 27 | Inputs: 28 | gts - ground truths 29 | res - predictions 30 | score_option - {shortest, closest, average} 31 | verbose - 1 or 0 32 | Outputs: 33 | Blue scores 34 | ''' 35 | assert(gts.keys() == res.keys()) 36 | imgIds = gts.keys() 37 | 38 | bleu_scorer = BleuScorer(n=self._n) 39 | for id in imgIds: 40 | hypo = res[id] 41 | ref = gts[id] 42 | 43 | # Sanity check. 
44 | assert(type(hypo) is list) 45 | assert(len(hypo) == 1) 46 | assert(type(ref) is list) 47 | #assert(len(ref) >= 1) 48 | 49 | bleu_scorer += (hypo[0], ref) 50 | 51 | score, scores = bleu_scorer.compute_score(option = score_option, verbose =verbose) 52 | 53 | # return (bleu, bleu_info) 54 | return score, scores 55 | 56 | def method(self): 57 | return "Bleu" 58 | -------------------------------------------------------------------------------- /bleu/bleu_scorer.py: -------------------------------------------------------------------------------- 1 | # bleu_scorer.py 2 | # David Chiang 3 | 4 | # Copyright (c) 2004-2006 University of Maryland. All rights 5 | # reserved. Do not redistribute without permission from the 6 | # author. Not for commercial use. 7 | 8 | # Modified by: 9 | # Hao Fang 10 | # Tsung-Yi Lin 11 | 12 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 13 | # By Sabarish Sivanath 14 | # To support Python 3 15 | 16 | '''Provides: 17 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 18 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 19 | ''' 20 | 21 | import copy 22 | import sys, math, re 23 | from collections import defaultdict 24 | 25 | def precook(s, n=4, out=False): 26 | """Takes a string as input and returns an object that can be given to 27 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 28 | can take string arguments as well.""" 29 | words = s.split() 30 | counts = defaultdict(int) 31 | for k in range(1,n+1): 32 | for i in range(len(words)-k+1): 33 | ngram = tuple(words[i:i+k]) 34 | counts[ngram] += 1 35 | return (len(words), counts) 36 | 37 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 38 | '''Takes a list of reference sentences for a single segment 39 | and returns an object that encapsulates everything that BLEU 40 | needs to know about them.''' 41 | 42 | reflen = [] 43 | maxcounts = {} 44 | for ref in refs: 45 | rl, counts = precook(ref, n) 46 | reflen.append(rl) 47 | for (ngram,count) in counts.items(): 48 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 49 | 50 | # Calculate effective reference sentence length. 51 | if eff == "shortest": 52 | reflen = min(reflen) 53 | elif eff == "average": 54 | reflen = float(sum(reflen))/len(reflen) 55 | 56 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 57 | 58 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 59 | 60 | return (reflen, maxcounts) 61 | 62 | def cook_test(test, refs , eff=None, n=4): 63 | '''Takes a test sentence and returns an object that 64 | encapsulates everything that BLEU needs to know about it.''' 65 | 66 | reflen = refs[0] 67 | refmaxcounts = refs[1] 68 | 69 | testlen, counts = precook(test, n, True) 70 | 71 | result = {} 72 | 73 | # Calculate effective reference sentence length. 74 | 75 | if eff == "closest": 76 | result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] 77 | else: ## i.e., "average" or "shortest" or None 78 | result["reflen"] = reflen 79 | 80 | result["testlen"] = testlen 81 | 82 | result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)] 83 | 84 | result['correct'] = [0]*n 85 | for (ngram, count) in counts.items(): 86 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) 87 | 88 | return result 89 | 90 | class BleuScorer(object): 91 | """Bleu scorer. 
92 | """ 93 | 94 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 95 | # special_reflen is used in oracle (proportional effective ref len for a node). 96 | 97 | def copy(self): 98 | ''' copy the refs.''' 99 | new = BleuScorer(n=self.n) 100 | new.ctest = copy.copy(self.ctest) 101 | new.crefs = copy.copy(self.crefs) 102 | new._score = None 103 | return new 104 | 105 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 106 | ''' singular instance ''' 107 | 108 | self.n = n 109 | self.crefs = [] 110 | self.ctest = [] 111 | self.cook_append(test, refs) 112 | self.special_reflen = special_reflen 113 | 114 | def cook_append(self, test, refs): 115 | '''called by constructor and __iadd__ to avoid creating new instances.''' 116 | 117 | if refs is not None: 118 | self.crefs.append(cook_refs(refs)) 119 | if test is not None: 120 | cooked_test = cook_test(test, self.crefs[-1]) 121 | self.ctest.append(cooked_test) ## N.B.: -1 122 | else: 123 | self.ctest.append(None) # lens of crefs and ctest have to match 124 | 125 | self._score = None ## need to recompute 126 | 127 | def ratio(self, option=None): 128 | self.compute_score(option=option) 129 | return self._ratio 130 | 131 | def score_ratio(self, option=None): 132 | '''return (bleu, len_ratio) pair''' 133 | return (self.fscore(option=option), self.ratio(option=option)) 134 | 135 | def score_ratio_str(self, option=None): 136 | return "%.4f (%.2f)" % self.score_ratio(option) 137 | 138 | def reflen(self, option=None): 139 | self.compute_score(option=option) 140 | return self._reflen 141 | 142 | def testlen(self, option=None): 143 | self.compute_score(option=option) 144 | return self._testlen 145 | 146 | def retest(self, new_test): 147 | if type(new_test) is str: 148 | new_test = [new_test] 149 | assert len(new_test) == len(self.crefs), new_test 150 | self.ctest = [] 151 | for t, rs in zip(new_test, self.crefs): 152 | self.ctest.append(cook_test(t, rs)) 153 | self._score = None 154 | 155 | return self 156 | 157 | def rescore(self, new_test): 158 | ''' replace test(s) with new test(s), and returns the new score.''' 159 | 160 | return self.retest(new_test).compute_score() 161 | 162 | def size(self): 163 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 164 | return len(self.crefs) 165 | 166 | def __iadd__(self, other): 167 | '''add an instance (e.g., from another sentence).''' 168 | 169 | if type(other) is tuple: 170 | ## avoid creating new BleuScorer instances 171 | self.cook_append(other[0], other[1]) 172 | else: 173 | assert self.compatible(other), "incompatible BLEUs." 
174 | self.ctest.extend(other.ctest) 175 | self.crefs.extend(other.crefs) 176 | self._score = None ## need to recompute 177 | 178 | return self 179 | 180 | def compatible(self, other): 181 | return isinstance(other, BleuScorer) and self.n == other.n 182 | 183 | def single_reflen(self, option="average"): 184 | return self._single_reflen(self.crefs[0][0], option) 185 | 186 | def _single_reflen(self, reflens, option=None, testlen=None): 187 | 188 | if option == "shortest": 189 | reflen = min(reflens) 190 | elif option == "average": 191 | reflen = float(sum(reflens))/len(reflens) 192 | elif option == "closest": 193 | reflen = min((abs(l-testlen), l) for l in reflens)[1] 194 | else: 195 | assert False, "unsupported reflen option %s" % option 196 | 197 | return reflen 198 | 199 | def recompute_score(self, option=None, verbose=0): 200 | self._score = None 201 | return self.compute_score(option, verbose) 202 | 203 | def compute_score(self, option=None, verbose=0): 204 | n = self.n 205 | small = 1e-9 206 | tiny = 1e-15 ## so that if guess is 0 still return 0 207 | bleu_list = [[] for _ in range(n)] 208 | 209 | if self._score is not None: 210 | return self._score 211 | 212 | if option is None: 213 | option = "average" if len(self.crefs) == 1 else "closest" 214 | 215 | self._testlen = 0 216 | self._reflen = 0 217 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 218 | 219 | # for each sentence 220 | for comps in self.ctest: 221 | testlen = comps['testlen'] 222 | self._testlen += testlen 223 | 224 | if self.special_reflen is None: ## need computation 225 | reflen = self._single_reflen(comps['reflen'], option, testlen) 226 | else: 227 | reflen = self.special_reflen 228 | 229 | self._reflen += reflen 230 | 231 | for key in ['guess','correct']: 232 | for k in range(n): 233 | totalcomps[key][k] += comps[key][k] 234 | 235 | # append per image bleu score 236 | bleu = 1. 237 | for k in range(n): 238 | bleu *= (float(comps['correct'][k]) + tiny) \ 239 | /(float(comps['guess'][k]) + small) 240 | bleu_list[k].append(bleu ** (1./(k+1))) 241 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 242 | if ratio < 1: 243 | for k in range(n): 244 | bleu_list[k][-1] *= math.exp(1 - 1/ratio) 245 | 246 | if verbose > 1: 247 | print(comps, reflen) 248 | 249 | totalcomps['reflen'] = self._reflen 250 | totalcomps['testlen'] = self._testlen 251 | 252 | bleus = [] 253 | bleu = 1. 
254 | for k in range(n): 255 | bleu *= float(totalcomps['correct'][k] + tiny) \ 256 | / (totalcomps['guess'][k] + small) 257 | bleus.append(bleu ** (1./(k+1))) 258 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 259 | if ratio < 1: 260 | for k in range(n): 261 | bleus[k] *= math.exp(1 - 1/ratio) 262 | 263 | if verbose > 0: 264 | print(totalcomps) 265 | print("ratio:", ratio) 266 | 267 | self._score = bleus 268 | return self._score, bleu_list 269 | -------------------------------------------------------------------------------- /cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | 11 | from .cider_scorer import CiderScorer 12 | import pdb 13 | 14 | class Cider: 15 | """ 16 | Main Class to compute the CIDEr metric 17 | 18 | """ 19 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 20 | # set cider to sum over 1 to 4-grams 21 | self._n = n 22 | # set the standard deviation parameter for gaussian penalty 23 | self._sigma = sigma 24 | 25 | def compute_score(self, gts, res): 26 | """ 27 | Main function to compute CIDEr score 28 | :param hypo_for_image (dict) : dictionary with key and value 29 | ref_for_image (dict) : dictionary with key and value 30 | :return: cider (float) : computed CIDEr score for the corpus 31 | """ 32 | 33 | assert(gts.keys() == res.keys()) 34 | imgIds = gts.keys() 35 | 36 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 37 | 38 | for id in imgIds: 39 | hypo = res[id] 40 | ref = gts[id] 41 | 42 | # Sanity check. 43 | assert(type(hypo) is list) 44 | assert(len(hypo) == 1) 45 | assert(type(ref) is list) 46 | assert(len(ref) > 0) 47 | 48 | cider_scorer += (hypo[0], ref) 49 | 50 | (score, scores) = cider_scorer.compute_score() 51 | 52 | return score, scores 53 | 54 | def method(self): 55 | return "CIDEr" -------------------------------------------------------------------------------- /cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | 6 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 7 | # By Sabarish Sivanath 8 | # To support Python 3 9 | 10 | import copy 11 | from collections import defaultdict 12 | import numpy as np 13 | import pdb 14 | import math 15 | 16 | def precook(s, n=4, out=False): 17 | """ 18 | Takes a string as input and returns an object that can be given to 19 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 20 | can take string arguments as well. 
21 | :param s: string : sentence to be converted into ngrams 22 | :param n: int : number of ngrams for which representation is calculated 23 | :return: term frequency vector for occuring ngrams 24 | """ 25 | words = s.split() 26 | counts = defaultdict(int) 27 | for k in range(1,n+1): 28 | for i in range(len(words)-k+1): 29 | ngram = tuple(words[i:i+k]) 30 | counts[ngram] += 1 31 | return counts 32 | 33 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 34 | '''Takes a list of reference sentences for a single segment 35 | and returns an object that encapsulates everything that BLEU 36 | needs to know about them. 37 | :param refs: list of string : reference sentences for some image 38 | :param n: int : number of ngrams for which (ngram) representation is calculated 39 | :return: result (list of dict) 40 | ''' 41 | return [precook(ref, n) for ref in refs] 42 | 43 | def cook_test(test, n=4): 44 | '''Takes a test sentence and returns an object that 45 | encapsulates everything that BLEU needs to know about it. 46 | :param test: list of string : hypothesis sentence for some image 47 | :param n: int : number of ngrams for which (ngram) representation is calculated 48 | :return: result (dict) 49 | ''' 50 | return precook(test, n, True) 51 | 52 | class CiderScorer(object): 53 | """CIDEr scorer. 54 | """ 55 | 56 | def copy(self): 57 | ''' copy the refs.''' 58 | new = CiderScorer(n=self.n) 59 | new.ctest = copy.copy(self.ctest) 60 | new.crefs = copy.copy(self.crefs) 61 | return new 62 | 63 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 64 | ''' singular instance ''' 65 | self.n = n 66 | self.sigma = sigma 67 | self.crefs = [] 68 | self.ctest = [] 69 | self.document_frequency = defaultdict(float) 70 | self.cook_append(test, refs) 71 | self.ref_len = None 72 | 73 | def cook_append(self, test, refs): 74 | '''called by constructor and __iadd__ to avoid creating new instances.''' 75 | 76 | if refs is not None: 77 | self.crefs.append(cook_refs(refs)) 78 | if test is not None: 79 | self.ctest.append(cook_test(test)) ## N.B.: -1 80 | else: 81 | self.ctest.append(None) # lens of crefs and ctest have to match 82 | 83 | def size(self): 84 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 85 | return len(self.crefs) 86 | 87 | def __iadd__(self, other): 88 | '''add an instance (e.g., from another sentence).''' 89 | 90 | if type(other) is tuple: 91 | ## avoid creating new CiderScorer instances 92 | self.cook_append(other[0], other[1]) 93 | else: 94 | self.ctest.extend(other.ctest) 95 | self.crefs.extend(other.crefs) 96 | 97 | return self 98 | def compute_doc_freq(self): 99 | ''' 100 | Compute term frequency for reference data. 101 | This will be used to compute idf (inverse document frequency later) 102 | The term frequency is stored in the object 103 | :return: None 104 | ''' 105 | for refs in self.crefs: 106 | # refs, k ref captions of one image 107 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 108 | self.document_frequency[ngram] += 1 109 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 110 | 111 | def compute_cider(self): 112 | def counts2vec(cnts): 113 | """ 114 | Function maps counts of ngram to vector of tfidf weights. 115 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 116 | The n-th entry of array denotes length of n-grams. 
117 | :param cnts: 118 | :return: vec (array of dict), norm (array of float), length (int) 119 | """ 120 | vec = [defaultdict(float) for _ in range(self.n)] 121 | length = 0 122 | norm = [0.0 for _ in range(self.n)] 123 | for (ngram,term_freq) in cnts.items(): 124 | # give word count 1 if it doesn't appear in reference corpus 125 | df = np.log(max(1.0, self.document_frequency[ngram])) 126 | # ngram index 127 | n = len(ngram)-1 128 | # tf (term_freq) * idf (precomputed idf) for n-grams 129 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 130 | # compute norm for the vector. the norm will be used for computing similarity 131 | norm[n] += pow(vec[n][ngram], 2) 132 | 133 | if n == 1: 134 | length += term_freq 135 | norm = [np.sqrt(n) for n in norm] 136 | return vec, norm, length 137 | 138 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 139 | ''' 140 | Compute the cosine similarity of two vectors. 141 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 142 | :param vec_ref: array of dictionary for vector corresponding to reference 143 | :param norm_hyp: array of float for vector corresponding to hypothesis 144 | :param norm_ref: array of float for vector corresponding to reference 145 | :param length_hyp: int containing length of hypothesis 146 | :param length_ref: int containing length of reference 147 | :return: array of score for each n-grams cosine similarity 148 | ''' 149 | delta = float(length_hyp - length_ref) 150 | # measure consine similarity 151 | val = np.array([0.0 for _ in range(self.n)]) 152 | for n in range(self.n): 153 | # ngram 154 | for (ngram,count) in vec_hyp[n].items(): 155 | # vrama91 : added clipping 156 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 157 | 158 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 159 | val[n] /= (norm_hyp[n]*norm_ref[n]) 160 | 161 | assert(not math.isnan(val[n])) 162 | # vrama91: added a length based gaussian penalty 163 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 164 | return val 165 | 166 | # compute log reference length 167 | self.ref_len = np.log(float(len(self.crefs))) 168 | 169 | scores = [] 170 | for test, refs in zip(self.ctest, self.crefs): 171 | # compute vector for test captions 172 | vec, norm, length = counts2vec(test) 173 | # compute vector for ref captions 174 | score = np.array([0.0 for _ in range(self.n)]) 175 | for ref in refs: 176 | vec_ref, norm_ref, length_ref = counts2vec(ref) 177 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 178 | # change by vrama91 - mean of ngram scores, instead of sum 179 | score_avg = np.mean(score) 180 | # divide by number of references 181 | score_avg /= len(refs) 182 | # multiply score by 10 183 | score_avg *= 10.0 184 | # append score of an image to the score list 185 | scores.append(score_avg) 186 | return scores 187 | 188 | def compute_score(self, option=None, verbose=0): 189 | # compute idf 190 | self.compute_doc_freq() 191 | # assert to check document frequency 192 | assert(len(self.ctest) >= max(self.document_frequency.values())) 193 | # compute cider score 194 | score = self.compute_cider() 195 | # debug 196 | # print score 197 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from .tokenizer.ptbtokenizer import PTBTokenizer 3 | from .bleu.bleu import Bleu 4 | from .meteor.meteor import 
Meteor 5 | from .rouge.rouge import Rouge 6 | from .cider.cider import Cider 7 | 8 | class COCOEvalCap: 9 | def __init__(self, coco, cocoRes): 10 | self.evalImgs = [] 11 | self.eval = {} 12 | self.imgToEval = {} 13 | self.coco = coco 14 | self.cocoRes = cocoRes 15 | self.params = {'image_id': cocoRes.getImgIds()} 16 | 17 | def evaluate(self): 18 | imgIds = self.params['image_id'] 19 | # imgIds = self.coco.getImgIds() 20 | gts = {} 21 | res = {} 22 | for imgId in imgIds: 23 | gts[imgId] = self.coco.imgToAnns[imgId] 24 | res[imgId] = self.cocoRes.imgToAnns[imgId] 25 | 26 | # ================================================= 27 | # Set up scorers 28 | # ================================================= 29 | print('tokenization...') 30 | tokenizer = PTBTokenizer() 31 | gts = tokenizer.tokenize(gts) 32 | res = tokenizer.tokenize(res) 33 | 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print('setting up scorers...') 38 | scorers = [ 39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 40 | (Meteor(),"METEOR"), 41 | (Rouge(), "ROUGE_L"), 42 | (Cider(), "CIDEr") 43 | ] 44 | 45 | # ================================================= 46 | # Compute scores 47 | # ================================================= 48 | eval = {} 49 | for scorer, method in scorers: 50 | print('computing %s score...'%(scorer.method())) 51 | score, scores = scorer.compute_score(gts, res) 52 | if type(method) == list: 53 | for sc, scs, m in zip(score, scores, method): 54 | self.setEval(sc, m) 55 | self.setImgToEvalImgs(scs, imgIds, m) 56 | print("%s: %0.3f"%(m, sc)) 57 | else: 58 | self.setEval(score, method) 59 | self.setImgToEvalImgs(scores, imgIds, method) 60 | print("%s: %0.3f"%(method, score)) 61 | self.setEvalImgs() 62 | 63 | def setEval(self, score, method): 64 | self.eval[method] = score 65 | 66 | def setImgToEvalImgs(self, scores, imgIds, method): 67 | for imgId, score in zip(imgIds, scores): 68 | if not imgId in self.imgToEval: 69 | self.imgToEval[imgId] = {} 70 | self.imgToEval[imgId]["image_id"] = imgId 71 | self.imgToEval[imgId][method] = score 72 | 73 | def setEvalImgs(self): 74 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] 75 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. -------------------------------------------------------------------------------- /meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /meteor/data/paraphrase-en.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sks3i/pycocoevalcap/e652aac37787c573eef5c4dbd6a53fa9969dd9d1/meteor/data/paraphrase-en.gz -------------------------------------------------------------------------------- /meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sks3i/pycocoevalcap/e652aac37787c573eef5c4dbd6a53fa9969dd9d1/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- /meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | 6 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 7 | # By Sabarish Sivanath 8 | # To support Python 3 9 | 10 | import os 11 | import sys 12 | import subprocess 13 | import threading 14 | 15 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 
16 | METEOR_JAR = 'meteor-1.5.jar' 17 | # print METEOR_JAR 18 | 19 | class Meteor: 20 | 21 | def __init__(self): 22 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 23 | '-', '-', '-stdio', '-l', 'en', '-norm'] 24 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 25 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 26 | stdin=subprocess.PIPE, \ 27 | stdout=subprocess.PIPE, \ 28 | stderr=subprocess.PIPE, 29 | universal_newlines = True, 30 | bufsize = 1) 31 | # Used to guarantee thread safety 32 | self.lock = threading.Lock() 33 | 34 | def compute_score(self, gts, res): 35 | assert(gts.keys() == res.keys()) 36 | imgIds = gts.keys() 37 | scores = [] 38 | 39 | eval_line = 'EVAL' 40 | self.lock.acquire() 41 | for i in imgIds: 42 | assert(len(res[i]) == 1) 43 | stat = self._stat(res[i][0], gts[i]) 44 | eval_line += ' ||| {}'.format(stat) 45 | 46 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 47 | for i in range(0,len(imgIds)): 48 | scores.append(float(self.meteor_p.stdout.readline().strip())) 49 | score = float(self.meteor_p.stdout.readline().strip()) 50 | self.lock.release() 51 | 52 | return score, scores 53 | 54 | def method(self): 55 | return "METEOR" 56 | 57 | def _stat(self, hypothesis_str, reference_list): 58 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 59 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 60 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 61 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 62 | return self.meteor_p.stdout.readline().strip() 63 | 64 | def _score(self, hypothesis_str, reference_list): 65 | self.lock.acquire() 66 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 67 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 68 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 69 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 70 | stats = self.meteor_p.stdout.readline().strip() 71 | eval_line = 'EVAL ||| {}'.format(stats) 72 | # EVAL ||| stats 73 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 74 | score = float(self.meteor_p.stdout.readline().strip()) 75 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice 76 | # thanks for Andrej for pointing this out 77 | score = float(self.meteor_p.stdout.readline().strip()) 78 | self.lock.release() 79 | return score 80 | 81 | def __del__(self): 82 | self.lock.acquire() 83 | self.meteor_p.stdin.close() 84 | self.meteor_p.kill() 85 | self.meteor_p.wait() 86 | self.lock.release() 87 | -------------------------------------------------------------------------------- /rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also 
split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(gts.keys() == res.keys()) 86 | imgIds = gts.keys() 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) > 0) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 
6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | 18 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 19 | # By Sabarish Sivanath 20 | # To support Python 3 21 | 22 | # path to the stanford corenlp jar 23 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 24 | 25 | # punctuations to be removed from the sentences 26 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 27 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 28 | 29 | class PTBTokenizer: 30 | """Python wrapper of Stanford PTBTokenizer""" 31 | 32 | def tokenize(self, captions_for_image): 33 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 34 | 'edu.stanford.nlp.process.PTBTokenizer', \ 35 | '-preserveLines', '-lowerCase'] 36 | 37 | # ====================================================== 38 | # prepare data for PTB Tokenizer 39 | # ====================================================== 40 | final_tokenized_captions_for_image = {} 41 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 42 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 43 | 44 | # ====================================================== 45 | # save sentences to temporary file 46 | # ====================================================== 47 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 48 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 49 | tmp_file.write(sentences.encode('utf-8')) 50 | tmp_file.close() 51 | 52 | # ====================================================== 53 | # tokenize sentence 54 | # ====================================================== 55 | cmd.append(os.path.basename(tmp_file.name)) 56 | p_tokenizer = subprocess.Popen(cmd, 57 | cwd=path_to_jar_dirname, 58 | stdout=subprocess.PIPE, 59 | universal_newlines = True, 60 | bufsize = 1) 61 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 62 | lines = token_lines.split('\n') 63 | # remove temp file 64 | os.remove(tmp_file.name) 65 | 66 | # ====================================================== 67 | # create dictionary for tokenized captions 68 | # ====================================================== 69 | for k, line in zip(image_id, lines): 70 | if not k in final_tokenized_captions_for_image: 71 | final_tokenized_captions_for_image[k] = [] 72 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 73 | if w not in PUNCTUATIONS]) 74 | final_tokenized_captions_for_image[k].append(tokenized_caption) 75 | 76 | return final_tokenized_captions_for_image 77 | -------------------------------------------------------------------------------- /tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sks3i/pycocoevalcap/e652aac37787c573eef5c4dbd6a53fa9969dd9d1/tokenizer/stanford-corenlp-3.4.1.jar --------------------------------------------------------------------------------