├── README.md ├── __init__.py ├── bleu ├── LICENSE ├── __init__.py ├── bleu.py └── bleu_scorer.py ├── cider ├── __init__.py ├── cider.py └── cider_scorer.py ├── eval.py ├── license.txt ├── meteor ├── __init__.py ├── data │ └── paraphrase-en.gz ├── meteor-1.5.jar └── meteor.py ├── rouge ├── __init__.py └── rouge.py └── tokenizer ├── __init__.py ├── ptbtokenizer.py └── stanford-corenlp-3.4.1.jar /README.md: -------------------------------------------------------------------------------- 1 | Microsoft COCO Caption Evaluation Tools
2 | --- 3 | 4 | The original code has been modified to work with Python 3.
5 | 6 | ### Requirements 7 | * Python 3.x 8 | * Java 1.8 9 | * pycocotools 10 | 11 | --- 12 | 13 | ### Tested on 14 | * Windows 10, Python 3.5. 15 | 16 | --- 17 | ### To fix Windows JVM memory error:
18 | Add the following entry under System Variables:
19 |     Variable name : _JAVA_OPTIONS
20 |     Variable value : -Xmx1024M
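If you prefer not to edit system settings, the same limit can be applied per process. This is an untested sketch: `_JAVA_OPTIONS` set in `os.environ` is inherited by the Java subprocess that the METEOR wrapper launches, so it must run before the `Meteor` scorer is constructed.

    import os

    # Cap the JVM heap for any Java process started from this Python process.
    os.environ['_JAVA_OPTIONS'] = '-Xmx1024M'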
21 | 22 | --- 23 | Original code : https://github.com/tylin/coco-caption
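---
### Usage

A minimal sketch of running the full evaluation (Bleu, METEOR, ROUGE-L, CIDEr). It assumes this repository is importable as a package named `pycocoevalcap`, and `annotations.json` / `results.json` are placeholder paths to COCO-format ground-truth and result files:

    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap  # package name assumed

    # Ground-truth captions and generated captions in COCO format (placeholder paths).
    coco = COCO('annotations.json')
    coco_res = coco.loadRes('results.json')

    # COCOEvalCap tokenizes both sets with the PTB tokenizer and runs every scorer in eval.py.
    coco_eval = COCOEvalCap(coco, coco_res)
    coco_eval.evaluate()

    # Corpus-level score for each metric (Bleu_1..Bleu_4, METEOR, ROUGE_L, CIDEr).
    for metric, score in coco_eval.eval.items():
        print('%s: %0.3f' % (metric, score))

Per-image scores are available in `coco_eval.imgToEval` after `evaluate()` returns.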
24 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' -------------------------------------------------------------------------------- /bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' -------------------------------------------------------------------------------- /bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 12 | # By Sabarish Sivanath 13 | # To support Python 3 14 | 15 | from .bleu_scorer import BleuScorer 16 | 17 | 18 | class Bleu: 19 | def __init__(self, n=4): 20 | # default compute Blue score up to 4 21 | self._n = n 22 | self._hypo_for_image = {} 23 | self.ref_for_image = {} 24 | 25 | def compute_score(self, gts, res, score_option = 'closest', verbose = 1): 26 | ''' 27 | Inputs: 28 | gts - ground truths 29 | res - predictions 30 | score_option - {shortest, closest, average} 31 | verbose - 1 or 0 32 | Outputs: 33 | Blue scores 34 | ''' 35 | assert(gts.keys() == res.keys()) 36 | imgIds = gts.keys() 37 | 38 | bleu_scorer = BleuScorer(n=self._n) 39 | for id in imgIds: 40 | hypo = res[id] 41 | ref = gts[id] 42 | 43 | # Sanity check. 
44 | assert(type(hypo) is list) 45 | assert(len(hypo) == 1) 46 | assert(type(ref) is list) 47 | #assert(len(ref) >= 1) 48 | 49 | bleu_scorer += (hypo[0], ref) 50 | 51 | score, scores = bleu_scorer.compute_score(option = score_option, verbose =verbose) 52 | 53 | # return (bleu, bleu_info) 54 | return score, scores 55 | 56 | def method(self): 57 | return "Bleu" 58 | -------------------------------------------------------------------------------- /bleu/bleu_scorer.py: -------------------------------------------------------------------------------- 1 | # bleu_scorer.py 2 | # David Chiang 3 | 4 | # Copyright (c) 2004-2006 University of Maryland. All rights 5 | # reserved. Do not redistribute without permission from the 6 | # author. Not for commercial use. 7 | 8 | # Modified by: 9 | # Hao Fang 10 | # Tsung-Yi Lin 11 | 12 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 13 | # By Sabarish Sivanath 14 | # To support Python 3 15 | 16 | '''Provides: 17 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 18 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 19 | ''' 20 | 21 | import copy 22 | import sys, math, re 23 | from collections import defaultdict 24 | 25 | def precook(s, n=4, out=False): 26 | """Takes a string as input and returns an object that can be given to 27 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 28 | can take string arguments as well.""" 29 | words = s.split() 30 | counts = defaultdict(int) 31 | for k in range(1,n+1): 32 | for i in range(len(words)-k+1): 33 | ngram = tuple(words[i:i+k]) 34 | counts[ngram] += 1 35 | return (len(words), counts) 36 | 37 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 38 | '''Takes a list of reference sentences for a single segment 39 | and returns an object that encapsulates everything that BLEU 40 | needs to know about them.''' 41 | 42 | reflen = [] 43 | maxcounts = {} 44 | for ref in refs: 45 | rl, counts = precook(ref, n) 46 | reflen.append(rl) 47 | for (ngram,count) in counts.items(): 48 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 49 | 50 | # Calculate effective reference sentence length. 51 | if eff == "shortest": 52 | reflen = min(reflen) 53 | elif eff == "average": 54 | reflen = float(sum(reflen))/len(reflen) 55 | 56 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 57 | 58 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 59 | 60 | return (reflen, maxcounts) 61 | 62 | def cook_test(test, refs , eff=None, n=4): 63 | '''Takes a test sentence and returns an object that 64 | encapsulates everything that BLEU needs to know about it.''' 65 | 66 | reflen = refs[0] 67 | refmaxcounts = refs[1] 68 | 69 | testlen, counts = precook(test, n, True) 70 | 71 | result = {} 72 | 73 | # Calculate effective reference sentence length. 74 | 75 | if eff == "closest": 76 | result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] 77 | else: ## i.e., "average" or "shortest" or None 78 | result["reflen"] = reflen 79 | 80 | result["testlen"] = testlen 81 | 82 | result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)] 83 | 84 | result['correct'] = [0]*n 85 | for (ngram, count) in counts.items(): 86 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) 87 | 88 | return result 89 | 90 | class BleuScorer(object): 91 | """Bleu scorer. 
92 | """ 93 | 94 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 95 | # special_reflen is used in oracle (proportional effective ref len for a node). 96 | 97 | def copy(self): 98 | ''' copy the refs.''' 99 | new = BleuScorer(n=self.n) 100 | new.ctest = copy.copy(self.ctest) 101 | new.crefs = copy.copy(self.crefs) 102 | new._score = None 103 | return new 104 | 105 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 106 | ''' singular instance ''' 107 | 108 | self.n = n 109 | self.crefs = [] 110 | self.ctest = [] 111 | self.cook_append(test, refs) 112 | self.special_reflen = special_reflen 113 | 114 | def cook_append(self, test, refs): 115 | '''called by constructor and __iadd__ to avoid creating new instances.''' 116 | 117 | if refs is not None: 118 | self.crefs.append(cook_refs(refs)) 119 | if test is not None: 120 | cooked_test = cook_test(test, self.crefs[-1]) 121 | self.ctest.append(cooked_test) ## N.B.: -1 122 | else: 123 | self.ctest.append(None) # lens of crefs and ctest have to match 124 | 125 | self._score = None ## need to recompute 126 | 127 | def ratio(self, option=None): 128 | self.compute_score(option=option) 129 | return self._ratio 130 | 131 | def score_ratio(self, option=None): 132 | '''return (bleu, len_ratio) pair''' 133 | return (self.fscore(option=option), self.ratio(option=option)) 134 | 135 | def score_ratio_str(self, option=None): 136 | return "%.4f (%.2f)" % self.score_ratio(option) 137 | 138 | def reflen(self, option=None): 139 | self.compute_score(option=option) 140 | return self._reflen 141 | 142 | def testlen(self, option=None): 143 | self.compute_score(option=option) 144 | return self._testlen 145 | 146 | def retest(self, new_test): 147 | if type(new_test) is str: 148 | new_test = [new_test] 149 | assert len(new_test) == len(self.crefs), new_test 150 | self.ctest = [] 151 | for t, rs in zip(new_test, self.crefs): 152 | self.ctest.append(cook_test(t, rs)) 153 | self._score = None 154 | 155 | return self 156 | 157 | def rescore(self, new_test): 158 | ''' replace test(s) with new test(s), and returns the new score.''' 159 | 160 | return self.retest(new_test).compute_score() 161 | 162 | def size(self): 163 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 164 | return len(self.crefs) 165 | 166 | def __iadd__(self, other): 167 | '''add an instance (e.g., from another sentence).''' 168 | 169 | if type(other) is tuple: 170 | ## avoid creating new BleuScorer instances 171 | self.cook_append(other[0], other[1]) 172 | else: 173 | assert self.compatible(other), "incompatible BLEUs." 
174 | self.ctest.extend(other.ctest) 175 | self.crefs.extend(other.crefs) 176 | self._score = None ## need to recompute 177 | 178 | return self 179 | 180 | def compatible(self, other): 181 | return isinstance(other, BleuScorer) and self.n == other.n 182 | 183 | def single_reflen(self, option="average"): 184 | return self._single_reflen(self.crefs[0][0], option) 185 | 186 | def _single_reflen(self, reflens, option=None, testlen=None): 187 | 188 | if option == "shortest": 189 | reflen = min(reflens) 190 | elif option == "average": 191 | reflen = float(sum(reflens))/len(reflens) 192 | elif option == "closest": 193 | reflen = min((abs(l-testlen), l) for l in reflens)[1] 194 | else: 195 | assert False, "unsupported reflen option %s" % option 196 | 197 | return reflen 198 | 199 | def recompute_score(self, option=None, verbose=0): 200 | self._score = None 201 | return self.compute_score(option, verbose) 202 | 203 | def compute_score(self, option=None, verbose=0): 204 | n = self.n 205 | small = 1e-9 206 | tiny = 1e-15 ## so that if guess is 0 still return 0 207 | bleu_list = [[] for _ in range(n)] 208 | 209 | if self._score is not None: 210 | return self._score 211 | 212 | if option is None: 213 | option = "average" if len(self.crefs) == 1 else "closest" 214 | 215 | self._testlen = 0 216 | self._reflen = 0 217 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 218 | 219 | # for each sentence 220 | for comps in self.ctest: 221 | testlen = comps['testlen'] 222 | self._testlen += testlen 223 | 224 | if self.special_reflen is None: ## need computation 225 | reflen = self._single_reflen(comps['reflen'], option, testlen) 226 | else: 227 | reflen = self.special_reflen 228 | 229 | self._reflen += reflen 230 | 231 | for key in ['guess','correct']: 232 | for k in range(n): 233 | totalcomps[key][k] += comps[key][k] 234 | 235 | # append per image bleu score 236 | bleu = 1. 237 | for k in range(n): 238 | bleu *= (float(comps['correct'][k]) + tiny) \ 239 | /(float(comps['guess'][k]) + small) 240 | bleu_list[k].append(bleu ** (1./(k+1))) 241 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 242 | if ratio < 1: 243 | for k in range(n): 244 | bleu_list[k][-1] *= math.exp(1 - 1/ratio) 245 | 246 | if verbose > 1: 247 | print(comps, reflen) 248 | 249 | totalcomps['reflen'] = self._reflen 250 | totalcomps['testlen'] = self._testlen 251 | 252 | bleus = [] 253 | bleu = 1. 
254 | for k in range(n): 255 | bleu *= float(totalcomps['correct'][k] + tiny) \ 256 | / (totalcomps['guess'][k] + small) 257 | bleus.append(bleu ** (1./(k+1))) 258 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 259 | if ratio < 1: 260 | for k in range(n): 261 | bleus[k] *= math.exp(1 - 1/ratio) 262 | 263 | if verbose > 0: 264 | print(totalcomps) 265 | print("ratio:", ratio) 266 | 267 | self._score = bleus 268 | return self._score, bleu_list 269 | -------------------------------------------------------------------------------- /cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | 11 | from .cider_scorer import CiderScorer 12 | import pdb 13 | 14 | class Cider: 15 | """ 16 | Main Class to compute the CIDEr metric 17 | 18 | """ 19 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 20 | # set cider to sum over 1 to 4-grams 21 | self._n = n 22 | # set the standard deviation parameter for gaussian penalty 23 | self._sigma = sigma 24 | 25 | def compute_score(self, gts, res): 26 | """ 27 | Main function to compute CIDEr score 28 | :param hypo_for_image (dict) : dictionary with key and value 29 | ref_for_image (dict) : dictionary with key and value 30 | :return: cider (float) : computed CIDEr score for the corpus 31 | """ 32 | 33 | assert(gts.keys() == res.keys()) 34 | imgIds = gts.keys() 35 | 36 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 37 | 38 | for id in imgIds: 39 | hypo = res[id] 40 | ref = gts[id] 41 | 42 | # Sanity check. 43 | assert(type(hypo) is list) 44 | assert(len(hypo) == 1) 45 | assert(type(ref) is list) 46 | assert(len(ref) > 0) 47 | 48 | cider_scorer += (hypo[0], ref) 49 | 50 | (score, scores) = cider_scorer.compute_score() 51 | 52 | return score, scores 53 | 54 | def method(self): 55 | return "CIDEr" -------------------------------------------------------------------------------- /cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | 6 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 7 | # By Sabarish Sivanath 8 | # To support Python 3 9 | 10 | import copy 11 | from collections import defaultdict 12 | import numpy as np 13 | import pdb 14 | import math 15 | 16 | def precook(s, n=4, out=False): 17 | """ 18 | Takes a string as input and returns an object that can be given to 19 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 20 | can take string arguments as well. 
21 | :param s: string : sentence to be converted into ngrams 22 | :param n: int : number of ngrams for which representation is calculated 23 | :return: term frequency vector for occuring ngrams 24 | """ 25 | words = s.split() 26 | counts = defaultdict(int) 27 | for k in range(1,n+1): 28 | for i in range(len(words)-k+1): 29 | ngram = tuple(words[i:i+k]) 30 | counts[ngram] += 1 31 | return counts 32 | 33 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 34 | '''Takes a list of reference sentences for a single segment 35 | and returns an object that encapsulates everything that BLEU 36 | needs to know about them. 37 | :param refs: list of string : reference sentences for some image 38 | :param n: int : number of ngrams for which (ngram) representation is calculated 39 | :return: result (list of dict) 40 | ''' 41 | return [precook(ref, n) for ref in refs] 42 | 43 | def cook_test(test, n=4): 44 | '''Takes a test sentence and returns an object that 45 | encapsulates everything that BLEU needs to know about it. 46 | :param test: list of string : hypothesis sentence for some image 47 | :param n: int : number of ngrams for which (ngram) representation is calculated 48 | :return: result (dict) 49 | ''' 50 | return precook(test, n, True) 51 | 52 | class CiderScorer(object): 53 | """CIDEr scorer. 54 | """ 55 | 56 | def copy(self): 57 | ''' copy the refs.''' 58 | new = CiderScorer(n=self.n) 59 | new.ctest = copy.copy(self.ctest) 60 | new.crefs = copy.copy(self.crefs) 61 | return new 62 | 63 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 64 | ''' singular instance ''' 65 | self.n = n 66 | self.sigma = sigma 67 | self.crefs = [] 68 | self.ctest = [] 69 | self.document_frequency = defaultdict(float) 70 | self.cook_append(test, refs) 71 | self.ref_len = None 72 | 73 | def cook_append(self, test, refs): 74 | '''called by constructor and __iadd__ to avoid creating new instances.''' 75 | 76 | if refs is not None: 77 | self.crefs.append(cook_refs(refs)) 78 | if test is not None: 79 | self.ctest.append(cook_test(test)) ## N.B.: -1 80 | else: 81 | self.ctest.append(None) # lens of crefs and ctest have to match 82 | 83 | def size(self): 84 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 85 | return len(self.crefs) 86 | 87 | def __iadd__(self, other): 88 | '''add an instance (e.g., from another sentence).''' 89 | 90 | if type(other) is tuple: 91 | ## avoid creating new CiderScorer instances 92 | self.cook_append(other[0], other[1]) 93 | else: 94 | self.ctest.extend(other.ctest) 95 | self.crefs.extend(other.crefs) 96 | 97 | return self 98 | def compute_doc_freq(self): 99 | ''' 100 | Compute term frequency for reference data. 101 | This will be used to compute idf (inverse document frequency later) 102 | The term frequency is stored in the object 103 | :return: None 104 | ''' 105 | for refs in self.crefs: 106 | # refs, k ref captions of one image 107 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 108 | self.document_frequency[ngram] += 1 109 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 110 | 111 | def compute_cider(self): 112 | def counts2vec(cnts): 113 | """ 114 | Function maps counts of ngram to vector of tfidf weights. 115 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 116 | The n-th entry of array denotes length of n-grams. 
117 | :param cnts: 118 | :return: vec (array of dict), norm (array of float), length (int) 119 | """ 120 | vec = [defaultdict(float) for _ in range(self.n)] 121 | length = 0 122 | norm = [0.0 for _ in range(self.n)] 123 | for (ngram,term_freq) in cnts.items(): 124 | # give word count 1 if it doesn't appear in reference corpus 125 | df = np.log(max(1.0, self.document_frequency[ngram])) 126 | # ngram index 127 | n = len(ngram)-1 128 | # tf (term_freq) * idf (precomputed idf) for n-grams 129 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 130 | # compute norm for the vector. the norm will be used for computing similarity 131 | norm[n] += pow(vec[n][ngram], 2) 132 | 133 | if n == 1: 134 | length += term_freq 135 | norm = [np.sqrt(n) for n in norm] 136 | return vec, norm, length 137 | 138 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 139 | ''' 140 | Compute the cosine similarity of two vectors. 141 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 142 | :param vec_ref: array of dictionary for vector corresponding to reference 143 | :param norm_hyp: array of float for vector corresponding to hypothesis 144 | :param norm_ref: array of float for vector corresponding to reference 145 | :param length_hyp: int containing length of hypothesis 146 | :param length_ref: int containing length of reference 147 | :return: array of score for each n-grams cosine similarity 148 | ''' 149 | delta = float(length_hyp - length_ref) 150 | # measure consine similarity 151 | val = np.array([0.0 for _ in range(self.n)]) 152 | for n in range(self.n): 153 | # ngram 154 | for (ngram,count) in vec_hyp[n].items(): 155 | # vrama91 : added clipping 156 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 157 | 158 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 159 | val[n] /= (norm_hyp[n]*norm_ref[n]) 160 | 161 | assert(not math.isnan(val[n])) 162 | # vrama91: added a length based gaussian penalty 163 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 164 | return val 165 | 166 | # compute log reference length 167 | self.ref_len = np.log(float(len(self.crefs))) 168 | 169 | scores = [] 170 | for test, refs in zip(self.ctest, self.crefs): 171 | # compute vector for test captions 172 | vec, norm, length = counts2vec(test) 173 | # compute vector for ref captions 174 | score = np.array([0.0 for _ in range(self.n)]) 175 | for ref in refs: 176 | vec_ref, norm_ref, length_ref = counts2vec(ref) 177 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 178 | # change by vrama91 - mean of ngram scores, instead of sum 179 | score_avg = np.mean(score) 180 | # divide by number of references 181 | score_avg /= len(refs) 182 | # multiply score by 10 183 | score_avg *= 10.0 184 | # append score of an image to the score list 185 | scores.append(score_avg) 186 | return scores 187 | 188 | def compute_score(self, option=None, verbose=0): 189 | # compute idf 190 | self.compute_doc_freq() 191 | # assert to check document frequency 192 | assert(len(self.ctest) >= max(self.document_frequency.values())) 193 | # compute cider score 194 | score = self.compute_cider() 195 | # debug 196 | # print score 197 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from .tokenizer.ptbtokenizer import PTBTokenizer 3 | from .bleu.bleu import Bleu 4 | from .meteor.meteor import 
Meteor 5 | from .rouge.rouge import Rouge 6 | from .cider.cider import Cider 7 | 8 | class COCOEvalCap: 9 | def __init__(self, coco, cocoRes): 10 | self.evalImgs = [] 11 | self.eval = {} 12 | self.imgToEval = {} 13 | self.coco = coco 14 | self.cocoRes = cocoRes 15 | self.params = {'image_id': cocoRes.getImgIds()} 16 | 17 | def evaluate(self): 18 | imgIds = self.params['image_id'] 19 | # imgIds = self.coco.getImgIds() 20 | gts = {} 21 | res = {} 22 | for imgId in imgIds: 23 | gts[imgId] = self.coco.imgToAnns[imgId] 24 | res[imgId] = self.cocoRes.imgToAnns[imgId] 25 | 26 | # ================================================= 27 | # Set up scorers 28 | # ================================================= 29 | print('tokenization...') 30 | tokenizer = PTBTokenizer() 31 | gts = tokenizer.tokenize(gts) 32 | res = tokenizer.tokenize(res) 33 | 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print('setting up scorers...') 38 | scorers = [ 39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 40 | (Meteor(),"METEOR"), 41 | (Rouge(), "ROUGE_L"), 42 | (Cider(), "CIDEr") 43 | ] 44 | 45 | # ================================================= 46 | # Compute scores 47 | # ================================================= 48 | eval = {} 49 | for scorer, method in scorers: 50 | print('computing %s score...'%(scorer.method())) 51 | score, scores = scorer.compute_score(gts, res) 52 | if type(method) == list: 53 | for sc, scs, m in zip(score, scores, method): 54 | self.setEval(sc, m) 55 | self.setImgToEvalImgs(scs, imgIds, m) 56 | print("%s: %0.3f"%(m, sc)) 57 | else: 58 | self.setEval(score, method) 59 | self.setImgToEvalImgs(scores, imgIds, method) 60 | print("%s: %0.3f"%(method, score)) 61 | self.setEvalImgs() 62 | 63 | def setEval(self, score, method): 64 | self.eval[method] = score 65 | 66 | def setImgToEvalImgs(self, scores, imgIds, method): 67 | for imgId, score in zip(imgIds, scores): 68 | if not imgId in self.imgToEval: 69 | self.imgToEval[imgId] = {} 70 | self.imgToEval[imgId]["image_id"] = imgId 71 | self.imgToEval[imgId][method] = score 72 | 73 | def setEvalImgs(self): 74 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] 75 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. -------------------------------------------------------------------------------- /meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /meteor/data/paraphrase-en.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sks3i/pycocoevalcap/e652aac37787c573eef5c4dbd6a53fa9969dd9d1/meteor/data/paraphrase-en.gz -------------------------------------------------------------------------------- /meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sks3i/pycocoevalcap/e652aac37787c573eef5c4dbd6a53fa9969dd9d1/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- /meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | 6 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 7 | # By Sabarish Sivanath 8 | # To support Python 3 9 | 10 | import os 11 | import sys 12 | import subprocess 13 | import threading 14 | 15 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 
16 | METEOR_JAR = 'meteor-1.5.jar' 17 | # print METEOR_JAR 18 | 19 | class Meteor: 20 | 21 | def __init__(self): 22 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 23 | '-', '-', '-stdio', '-l', 'en', '-norm'] 24 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 25 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 26 | stdin=subprocess.PIPE, \ 27 | stdout=subprocess.PIPE, \ 28 | stderr=subprocess.PIPE, 29 | universal_newlines = True, 30 | bufsize = 1) 31 | # Used to guarantee thread safety 32 | self.lock = threading.Lock() 33 | 34 | def compute_score(self, gts, res): 35 | assert(gts.keys() == res.keys()) 36 | imgIds = gts.keys() 37 | scores = [] 38 | 39 | eval_line = 'EVAL' 40 | self.lock.acquire() 41 | for i in imgIds: 42 | assert(len(res[i]) == 1) 43 | stat = self._stat(res[i][0], gts[i]) 44 | eval_line += ' ||| {}'.format(stat) 45 | 46 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 47 | for i in range(0,len(imgIds)): 48 | scores.append(float(self.meteor_p.stdout.readline().strip())) 49 | score = float(self.meteor_p.stdout.readline().strip()) 50 | self.lock.release() 51 | 52 | return score, scores 53 | 54 | def method(self): 55 | return "METEOR" 56 | 57 | def _stat(self, hypothesis_str, reference_list): 58 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 59 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 60 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 61 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 62 | return self.meteor_p.stdout.readline().strip() 63 | 64 | def _score(self, hypothesis_str, reference_list): 65 | self.lock.acquire() 66 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 67 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 68 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 69 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 70 | stats = self.meteor_p.stdout.readline().strip() 71 | eval_line = 'EVAL ||| {}'.format(stats) 72 | # EVAL ||| stats 73 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 74 | score = float(self.meteor_p.stdout.readline().strip()) 75 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice 76 | # thanks for Andrej for pointing this out 77 | score = float(self.meteor_p.stdout.readline().strip()) 78 | self.lock.release() 79 | return score 80 | 81 | def __del__(self): 82 | self.lock.acquire() 83 | self.meteor_p.stdin.close() 84 | self.meteor_p.kill() 85 | self.meteor_p.wait() 86 | self.lock.release() 87 | -------------------------------------------------------------------------------- /rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also 
split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(gts.keys() == res.keys()) 86 | imgIds = gts.keys() 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) > 0) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 
6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | 18 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT 19 | # By Sabarish Sivanath 20 | # To support Python 3 21 | 22 | # path to the stanford corenlp jar 23 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 24 | 25 | # punctuations to be removed from the sentences 26 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 27 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 28 | 29 | class PTBTokenizer: 30 | """Python wrapper of Stanford PTBTokenizer""" 31 | 32 | def tokenize(self, captions_for_image): 33 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 34 | 'edu.stanford.nlp.process.PTBTokenizer', \ 35 | '-preserveLines', '-lowerCase'] 36 | 37 | # ====================================================== 38 | # prepare data for PTB Tokenizer 39 | # ====================================================== 40 | final_tokenized_captions_for_image = {} 41 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 42 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 43 | 44 | # ====================================================== 45 | # save sentences to temporary file 46 | # ====================================================== 47 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 48 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 49 | tmp_file.write(sentences.encode('utf-8')) 50 | tmp_file.close() 51 | 52 | # ====================================================== 53 | # tokenize sentence 54 | # ====================================================== 55 | cmd.append(os.path.basename(tmp_file.name)) 56 | p_tokenizer = subprocess.Popen(cmd, 57 | cwd=path_to_jar_dirname, 58 | stdout=subprocess.PIPE, 59 | universal_newlines = True, 60 | bufsize = 1) 61 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 62 | lines = token_lines.split('\n') 63 | # remove temp file 64 | os.remove(tmp_file.name) 65 | 66 | # ====================================================== 67 | # create dictionary for tokenized captions 68 | # ====================================================== 69 | for k, line in zip(image_id, lines): 70 | if not k in final_tokenized_captions_for_image: 71 | final_tokenized_captions_for_image[k] = [] 72 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 73 | if w not in PUNCTUATIONS]) 74 | final_tokenized_captions_for_image[k].append(tokenized_caption) 75 | 76 | return final_tokenized_captions_for_image 77 | -------------------------------------------------------------------------------- /tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sks3i/pycocoevalcap/e652aac37787c573eef5c4dbd6a53fa9969dd9d1/tokenizer/stanford-corenlp-3.4.1.jar --------------------------------------------------------------------------------