├── README.md
├── __init__.py
├── bleu
│   ├── LICENSE
│   ├── __init__.py
│   ├── bleu.py
│   └── bleu_scorer.py
├── cider
│   ├── __init__.py
│   ├── cider.py
│   └── cider_scorer.py
├── eval.py
├── license.txt
├── meteor
│   ├── __init__.py
│   ├── data
│   │   └── paraphrase-en.gz
│   ├── meteor-1.5.jar
│   └── meteor.py
├── rouge
│   ├── __init__.py
│   └── rouge.py
└── tokenizer
    ├── __init__.py
    ├── ptbtokenizer.py
    └── stanford-corenlp-3.4.1.jar
/README.md:
--------------------------------------------------------------------------------
1 | Microsoft COCO Caption Evaluation Tools
2 | ---
3 |
4 | The original code has been modified to work with Python 3.
5 |
6 | ### Requirements
7 | * Python 3.x
8 | * Java 1.8
9 | * pycocotools
10 |
11 | ---
12 |
13 | ### Tested on
14 | * Windows 10, Python 3.5.
15 |
16 | ---
17 | ### To fix the Windows JVM memory error
18 | Add the following environment variable under System Variables:
19 | * Variable name: `_JAVA_OPTIONS`
20 | * Variable value: `-Xmx1024M`
21 |
22 | ---
23 | Original code: https://github.com/tylin/coco-caption
24 |
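25 | ---
26 | ### Usage
27 | A minimal sketch of running the full evaluation (BLEU, METEOR, ROUGE-L, CIDEr). It assumes this repository is importable as a package named `pycocoevalcap` (adjust the import to your layout) and that COCO-format annotation and result JSON files are available; the file names below are placeholders.
28 | 
29 | ```python
30 | from pycocotools.coco import COCO
31 | from pycocoevalcap.eval import COCOEvalCap
32 | 
33 | annotation_file = 'captions_val2014.json'        # placeholder: ground-truth captions
34 | results_file = 'captions_val2014_results.json'   # placeholder: generated captions
35 | 
36 | coco = COCO(annotation_file)
37 | coco_res = coco.loadRes(results_file)
38 | 
39 | coco_eval = COCOEvalCap(coco, coco_res)
40 | coco_eval.evaluate()
41 | 
42 | # corpus-level scores: Bleu_1..Bleu_4, METEOR, ROUGE_L, CIDEr
43 | for metric, score in coco_eval.eval.items():
44 |     print('%s: %.3f' % (metric, score))
45 | ```
46 | 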
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
--------------------------------------------------------------------------------
/bleu/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/bleu/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
--------------------------------------------------------------------------------
/bleu/bleu.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File Name : bleu.py
4 | #
5 | # Description : Wrapper for BLEU scorer.
6 | #
7 | # Creation Date : 06-01-2015
8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
9 | # Authors : Hao Fang and Tsung-Yi Lin
10 |
11 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT
12 | # By Sabarish Sivanath
13 | # To support Python 3
14 |
15 | from .bleu_scorer import BleuScorer
16 |
17 |
18 | class Bleu:
19 | def __init__(self, n=4):
20 |         # Compute BLEU scores up to 4-grams by default
21 | self._n = n
22 | self._hypo_for_image = {}
23 | self.ref_for_image = {}
24 |
25 |     def compute_score(self, gts, res, score_option='closest', verbose=1):
26 |         '''
27 |         Inputs:
28 |             gts - ground truth captions, {image_id: [caption, ...]}
29 |             res - predicted captions, {image_id: [caption]}
30 |             score_option - {shortest, closest, average}
31 |             verbose - 1 or 0
32 |         Outputs:
33 |             BLEU scores
34 |         '''
35 | assert(gts.keys() == res.keys())
36 | imgIds = gts.keys()
37 |
38 | bleu_scorer = BleuScorer(n=self._n)
39 | for id in imgIds:
40 | hypo = res[id]
41 | ref = gts[id]
42 |
43 | # Sanity check.
44 | assert(type(hypo) is list)
45 | assert(len(hypo) == 1)
46 | assert(type(ref) is list)
47 | #assert(len(ref) >= 1)
48 |
49 | bleu_scorer += (hypo[0], ref)
50 |
51 |         score, scores = bleu_scorer.compute_score(option=score_option, verbose=verbose)
52 |
53 | # return (bleu, bleu_info)
54 | return score, scores
55 |
56 | def method(self):
57 | return "Bleu"
58 |
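59 | # Usage sketch (illustrative data; gts maps an image id to its reference
60 | # captions and res maps the same id to a single-element list with the
61 | # predicted caption):
62 | #
63 | #     gts = {0: ['a cat sits on a mat', 'a cat is on the mat']}
64 | #     res = {0: ['a cat on a mat']}
65 | #     score, scores = Bleu(4).compute_score(gts, res)
66 | #     # score  -> [BLEU-1, BLEU-2, BLEU-3, BLEU-4] corpus-level values
67 | #     # scores -> a list of per-image scores for each n-gram order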
--------------------------------------------------------------------------------
/bleu/bleu_scorer.py:
--------------------------------------------------------------------------------
1 | # bleu_scorer.py
2 | # David Chiang
3 |
4 | # Copyright (c) 2004-2006 University of Maryland. All rights
5 | # reserved. Do not redistribute without permission from the
6 | # author. Not for commercial use.
7 |
8 | # Modified by:
9 | # Hao Fang
10 | # Tsung-Yi Lin
11 |
12 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT
13 | # By Sabarish Sivanath
14 | # To support Python 3
15 |
16 | '''Provides:
17 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
18 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
19 | '''
20 |
21 | import copy
22 | import sys, math, re
23 | from collections import defaultdict
24 |
25 | def precook(s, n=4, out=False):
26 | """Takes a string as input and returns an object that can be given to
27 | either cook_refs or cook_test. This is optional: cook_refs and cook_test
28 | can take string arguments as well."""
29 | words = s.split()
30 | counts = defaultdict(int)
31 | for k in range(1,n+1):
32 | for i in range(len(words)-k+1):
33 | ngram = tuple(words[i:i+k])
34 | counts[ngram] += 1
35 | return (len(words), counts)
36 |
37 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
38 | '''Takes a list of reference sentences for a single segment
39 | and returns an object that encapsulates everything that BLEU
40 | needs to know about them.'''
41 |
42 | reflen = []
43 | maxcounts = {}
44 | for ref in refs:
45 | rl, counts = precook(ref, n)
46 | reflen.append(rl)
47 | for (ngram,count) in counts.items():
48 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
49 |
50 | # Calculate effective reference sentence length.
51 | if eff == "shortest":
52 | reflen = min(reflen)
53 | elif eff == "average":
54 | reflen = float(sum(reflen))/len(reflen)
55 |
56 |     ## lhuang: N.B.: leave reflen computation to the very end!!
57 |
58 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)
59 |
60 | return (reflen, maxcounts)
61 |
62 | def cook_test(test, refs, eff=None, n=4):
63 | '''Takes a test sentence and returns an object that
64 | encapsulates everything that BLEU needs to know about it.'''
65 |
66 | reflen = refs[0]
67 | refmaxcounts = refs[1]
68 |
69 | testlen, counts = precook(test, n, True)
70 |
71 | result = {}
72 |
73 | # Calculate effective reference sentence length.
74 |
75 | if eff == "closest":
76 | result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
77 | else: ## i.e., "average" or "shortest" or None
78 | result["reflen"] = reflen
79 |
80 | result["testlen"] = testlen
81 |
82 | result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]
83 |
84 | result['correct'] = [0]*n
85 | for (ngram, count) in counts.items():
86 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
87 |
88 | return result
89 |
90 | class BleuScorer(object):
91 | """Bleu scorer.
92 | """
93 |
94 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
95 | # special_reflen is used in oracle (proportional effective ref len for a node).
96 |
97 | def copy(self):
98 | ''' copy the refs.'''
99 | new = BleuScorer(n=self.n)
100 | new.ctest = copy.copy(self.ctest)
101 | new.crefs = copy.copy(self.crefs)
102 | new._score = None
103 | return new
104 |
105 | def __init__(self, test=None, refs=None, n=4, special_reflen=None):
106 | ''' singular instance '''
107 |
108 | self.n = n
109 | self.crefs = []
110 | self.ctest = []
111 | self.cook_append(test, refs)
112 | self.special_reflen = special_reflen
113 |
114 | def cook_append(self, test, refs):
115 | '''called by constructor and __iadd__ to avoid creating new instances.'''
116 |
117 | if refs is not None:
118 | self.crefs.append(cook_refs(refs))
119 | if test is not None:
120 | cooked_test = cook_test(test, self.crefs[-1])
121 | self.ctest.append(cooked_test) ## N.B.: -1
122 | else:
123 | self.ctest.append(None) # lens of crefs and ctest have to match
124 |
125 | self._score = None ## need to recompute
126 |
127 | def ratio(self, option=None):
128 | self.compute_score(option=option)
129 | return self._ratio
130 |
131 | def score_ratio(self, option=None):
132 | '''return (bleu, len_ratio) pair'''
133 | return (self.fscore(option=option), self.ratio(option=option))
134 |
135 | def score_ratio_str(self, option=None):
136 | return "%.4f (%.2f)" % self.score_ratio(option)
137 |
138 | def reflen(self, option=None):
139 | self.compute_score(option=option)
140 | return self._reflen
141 |
142 | def testlen(self, option=None):
143 | self.compute_score(option=option)
144 | return self._testlen
145 |
146 | def retest(self, new_test):
147 | if type(new_test) is str:
148 | new_test = [new_test]
149 | assert len(new_test) == len(self.crefs), new_test
150 | self.ctest = []
151 | for t, rs in zip(new_test, self.crefs):
152 | self.ctest.append(cook_test(t, rs))
153 | self._score = None
154 |
155 | return self
156 |
157 | def rescore(self, new_test):
158 | ''' replace test(s) with new test(s), and returns the new score.'''
159 |
160 | return self.retest(new_test).compute_score()
161 |
162 | def size(self):
163 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
164 | return len(self.crefs)
165 |
166 | def __iadd__(self, other):
167 | '''add an instance (e.g., from another sentence).'''
168 |
169 | if type(other) is tuple:
170 | ## avoid creating new BleuScorer instances
171 | self.cook_append(other[0], other[1])
172 | else:
173 | assert self.compatible(other), "incompatible BLEUs."
174 | self.ctest.extend(other.ctest)
175 | self.crefs.extend(other.crefs)
176 | self._score = None ## need to recompute
177 |
178 | return self
179 |
180 | def compatible(self, other):
181 | return isinstance(other, BleuScorer) and self.n == other.n
182 |
183 | def single_reflen(self, option="average"):
184 | return self._single_reflen(self.crefs[0][0], option)
185 |
186 | def _single_reflen(self, reflens, option=None, testlen=None):
187 |
188 | if option == "shortest":
189 | reflen = min(reflens)
190 | elif option == "average":
191 | reflen = float(sum(reflens))/len(reflens)
192 | elif option == "closest":
193 | reflen = min((abs(l-testlen), l) for l in reflens)[1]
194 | else:
195 | assert False, "unsupported reflen option %s" % option
196 |
197 | return reflen
198 |
199 | def recompute_score(self, option=None, verbose=0):
200 | self._score = None
201 | return self.compute_score(option, verbose)
202 |
203 | def compute_score(self, option=None, verbose=0):
204 | n = self.n
205 | small = 1e-9
206 | tiny = 1e-15 ## so that if guess is 0 still return 0
207 | bleu_list = [[] for _ in range(n)]
208 |
209 | if self._score is not None:
210 | return self._score
211 |
212 | if option is None:
213 | option = "average" if len(self.crefs) == 1 else "closest"
214 |
215 | self._testlen = 0
216 | self._reflen = 0
217 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
218 |
219 | # for each sentence
220 | for comps in self.ctest:
221 | testlen = comps['testlen']
222 | self._testlen += testlen
223 |
224 | if self.special_reflen is None: ## need computation
225 | reflen = self._single_reflen(comps['reflen'], option, testlen)
226 | else:
227 | reflen = self.special_reflen
228 |
229 | self._reflen += reflen
230 |
231 | for key in ['guess','correct']:
232 | for k in range(n):
233 | totalcomps[key][k] += comps[key][k]
234 |
235 | # append per image bleu score
236 | bleu = 1.
237 | for k in range(n):
238 | bleu *= (float(comps['correct'][k]) + tiny) \
239 | /(float(comps['guess'][k]) + small)
240 | bleu_list[k].append(bleu ** (1./(k+1)))
241 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
242 | if ratio < 1:
243 | for k in range(n):
244 | bleu_list[k][-1] *= math.exp(1 - 1/ratio)
245 |
246 | if verbose > 1:
247 | print(comps, reflen)
248 |
249 | totalcomps['reflen'] = self._reflen
250 | totalcomps['testlen'] = self._testlen
251 |
252 | bleus = []
253 | bleu = 1.
254 | for k in range(n):
255 | bleu *= float(totalcomps['correct'][k] + tiny) \
256 | / (totalcomps['guess'][k] + small)
257 | bleus.append(bleu ** (1./(k+1)))
258 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
259 | if ratio < 1:
260 | for k in range(n):
261 | bleus[k] *= math.exp(1 - 1/ratio)
262 |
263 | if verbose > 0:
264 | print(totalcomps)
265 | print("ratio:", ratio)
266 |
267 | self._score = bleus
268 | return self._score, bleu_list
269 |
--------------------------------------------------------------------------------
/cider/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/cider/cider.py:
--------------------------------------------------------------------------------
1 | # Filename: cider.py
2 | #
3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
5 | #
6 | # Creation Date: Sun Feb 8 14:16:54 2015
7 | #
8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin
9 |
10 |
11 | from .cider_scorer import CiderScorer
12 | import pdb
13 |
14 | class Cider:
15 | """
16 | Main Class to compute the CIDEr metric
17 |
18 | """
19 | def __init__(self, test=None, refs=None, n=4, sigma=6.0):
20 | # set cider to sum over 1 to 4-grams
21 | self._n = n
22 | # set the standard deviation parameter for gaussian penalty
23 | self._sigma = sigma
24 |
25 | def compute_score(self, gts, res):
26 | """
27 | Main function to compute CIDEr score
28 |         :param gts (dict) : ground truth captions, keyed by image id
29 |         :param res (dict) : candidate captions, keyed by image id
30 |         :return: score (float), scores (np.ndarray) : corpus-level CIDEr score and per-image scores
31 | """
32 |
33 | assert(gts.keys() == res.keys())
34 | imgIds = gts.keys()
35 |
36 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)
37 |
38 | for id in imgIds:
39 | hypo = res[id]
40 | ref = gts[id]
41 |
42 | # Sanity check.
43 | assert(type(hypo) is list)
44 | assert(len(hypo) == 1)
45 | assert(type(ref) is list)
46 | assert(len(ref) > 0)
47 |
48 | cider_scorer += (hypo[0], ref)
49 |
50 | (score, scores) = cider_scorer.compute_score()
51 |
52 | return score, scores
53 |
54 | def method(self):
55 | return "CIDEr"
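56 | 
57 | # Usage sketch (illustrative data; same gts/res format as the Bleu wrapper).
58 | # n is the maximum n-gram order and sigma the width of the length penalty:
59 | #
60 | #     gts = {0: ['a cat sits on a mat', 'a cat is on the mat']}
61 | #     res = {0: ['a cat on a mat']}
62 | #     score, scores = Cider(n=4, sigma=6.0).compute_score(gts, res)
63 | #     # score is the corpus-level CIDEr value; scores holds one value per image.
64 | #     # Note: idf statistics are computed from all reference sets, so meaningful
65 | #     # values require evaluating over many images, not a single example.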
--------------------------------------------------------------------------------
/cider/cider_scorer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Tsung-Yi Lin
3 | # Ramakrishna Vedantam
4 |
5 |
6 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT
7 | # By Sabarish Sivanath
8 | # To support Python 3
9 |
10 | import copy
11 | from collections import defaultdict
12 | import numpy as np
13 | import pdb
14 | import math
15 |
16 | def precook(s, n=4, out=False):
17 | """
18 | Takes a string as input and returns an object that can be given to
19 | either cook_refs or cook_test. This is optional: cook_refs and cook_test
20 | can take string arguments as well.
21 | :param s: string : sentence to be converted into ngrams
22 | :param n: int : number of ngrams for which representation is calculated
23 |     :return: term frequency vector for occurring ngrams
24 | """
25 | words = s.split()
26 | counts = defaultdict(int)
27 | for k in range(1,n+1):
28 | for i in range(len(words)-k+1):
29 | ngram = tuple(words[i:i+k])
30 | counts[ngram] += 1
31 | return counts
32 |
33 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
34 | '''Takes a list of reference sentences for a single segment
35 |     and returns an object that encapsulates everything that CIDEr
36 | needs to know about them.
37 | :param refs: list of string : reference sentences for some image
38 | :param n: int : number of ngrams for which (ngram) representation is calculated
39 | :return: result (list of dict)
40 | '''
41 | return [precook(ref, n) for ref in refs]
42 |
43 | def cook_test(test, n=4):
44 | '''Takes a test sentence and returns an object that
45 |     encapsulates everything that CIDEr needs to know about it.
46 |     :param test: string : hypothesis sentence for some image
47 | :param n: int : number of ngrams for which (ngram) representation is calculated
48 | :return: result (dict)
49 | '''
50 | return precook(test, n, True)
51 |
52 | class CiderScorer(object):
53 | """CIDEr scorer.
54 | """
55 |
56 | def copy(self):
57 | ''' copy the refs.'''
58 | new = CiderScorer(n=self.n)
59 | new.ctest = copy.copy(self.ctest)
60 | new.crefs = copy.copy(self.crefs)
61 | return new
62 |
63 | def __init__(self, test=None, refs=None, n=4, sigma=6.0):
64 | ''' singular instance '''
65 | self.n = n
66 | self.sigma = sigma
67 | self.crefs = []
68 | self.ctest = []
69 | self.document_frequency = defaultdict(float)
70 | self.cook_append(test, refs)
71 | self.ref_len = None
72 |
73 | def cook_append(self, test, refs):
74 | '''called by constructor and __iadd__ to avoid creating new instances.'''
75 |
76 | if refs is not None:
77 | self.crefs.append(cook_refs(refs))
78 | if test is not None:
79 | self.ctest.append(cook_test(test)) ## N.B.: -1
80 | else:
81 | self.ctest.append(None) # lens of crefs and ctest have to match
82 |
83 | def size(self):
84 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
85 | return len(self.crefs)
86 |
87 | def __iadd__(self, other):
88 | '''add an instance (e.g., from another sentence).'''
89 |
90 | if type(other) is tuple:
91 | ## avoid creating new CiderScorer instances
92 | self.cook_append(other[0], other[1])
93 | else:
94 | self.ctest.extend(other.ctest)
95 | self.crefs.extend(other.crefs)
96 |
97 | return self
98 | def compute_doc_freq(self):
99 | '''
100 |         Compute document frequency for the reference data.
101 |         This will be used to compute idf (inverse document frequency) later.
102 |         The document frequency is stored in the object.
103 | :return: None
104 | '''
105 | for refs in self.crefs:
106 | # refs, k ref captions of one image
107 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]):
108 | self.document_frequency[ngram] += 1
109 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
110 |
111 | def compute_cider(self):
112 | def counts2vec(cnts):
113 | """
114 | Function maps counts of ngram to vector of tfidf weights.
115 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights.
116 | The n-th entry of array denotes length of n-grams.
117 | :param cnts:
118 | :return: vec (array of dict), norm (array of float), length (int)
119 | """
120 | vec = [defaultdict(float) for _ in range(self.n)]
121 | length = 0
122 | norm = [0.0 for _ in range(self.n)]
123 | for (ngram,term_freq) in cnts.items():
124 | # give word count 1 if it doesn't appear in reference corpus
125 | df = np.log(max(1.0, self.document_frequency[ngram]))
126 | # ngram index
127 | n = len(ngram)-1
128 | # tf (term_freq) * idf (precomputed idf) for n-grams
129 | vec[n][ngram] = float(term_freq)*(self.ref_len - df)
130 | # compute norm for the vector. the norm will be used for computing similarity
131 | norm[n] += pow(vec[n][ngram], 2)
132 |
133 | if n == 1:
134 | length += term_freq
135 | norm = [np.sqrt(n) for n in norm]
136 | return vec, norm, length
137 |
138 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
139 | '''
140 | Compute the cosine similarity of two vectors.
141 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis
142 | :param vec_ref: array of dictionary for vector corresponding to reference
143 | :param norm_hyp: array of float for vector corresponding to hypothesis
144 | :param norm_ref: array of float for vector corresponding to reference
145 | :param length_hyp: int containing length of hypothesis
146 | :param length_ref: int containing length of reference
147 | :return: array of score for each n-grams cosine similarity
148 | '''
149 | delta = float(length_hyp - length_ref)
150 |             # measure cosine similarity
151 | val = np.array([0.0 for _ in range(self.n)])
152 | for n in range(self.n):
153 | # ngram
154 | for (ngram,count) in vec_hyp[n].items():
155 | # vrama91 : added clipping
156 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
157 |
158 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
159 | val[n] /= (norm_hyp[n]*norm_ref[n])
160 |
161 | assert(not math.isnan(val[n]))
162 | # vrama91: added a length based gaussian penalty
163 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2))
164 | return val
165 |
166 | # compute log reference length
167 | self.ref_len = np.log(float(len(self.crefs)))
168 |
169 | scores = []
170 | for test, refs in zip(self.ctest, self.crefs):
171 | # compute vector for test captions
172 | vec, norm, length = counts2vec(test)
173 | # compute vector for ref captions
174 | score = np.array([0.0 for _ in range(self.n)])
175 | for ref in refs:
176 | vec_ref, norm_ref, length_ref = counts2vec(ref)
177 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
178 | # change by vrama91 - mean of ngram scores, instead of sum
179 | score_avg = np.mean(score)
180 | # divide by number of references
181 | score_avg /= len(refs)
182 | # multiply score by 10
183 | score_avg *= 10.0
184 | # append score of an image to the score list
185 | scores.append(score_avg)
186 | return scores
187 |
188 | def compute_score(self, option=None, verbose=0):
189 | # compute idf
190 | self.compute_doc_freq()
191 | # assert to check document frequency
192 | assert(len(self.ctest) >= max(self.document_frequency.values()))
193 | # compute cider score
194 | score = self.compute_cider()
195 | # debug
196 | # print score
197 | return np.mean(np.array(score)), np.array(score)
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | from .tokenizer.ptbtokenizer import PTBTokenizer
3 | from .bleu.bleu import Bleu
4 | from .meteor.meteor import Meteor
5 | from .rouge.rouge import Rouge
6 | from .cider.cider import Cider
7 |
8 | class COCOEvalCap:
9 | def __init__(self, coco, cocoRes):
10 | self.evalImgs = []
11 | self.eval = {}
12 | self.imgToEval = {}
13 | self.coco = coco
14 | self.cocoRes = cocoRes
15 | self.params = {'image_id': cocoRes.getImgIds()}
16 |
17 | def evaluate(self):
18 | imgIds = self.params['image_id']
19 | # imgIds = self.coco.getImgIds()
20 | gts = {}
21 | res = {}
22 | for imgId in imgIds:
23 | gts[imgId] = self.coco.imgToAnns[imgId]
24 | res[imgId] = self.cocoRes.imgToAnns[imgId]
25 |
26 | # =================================================
27 |         # Tokenize ground truth and result captions
28 | # =================================================
29 | print('tokenization...')
30 | tokenizer = PTBTokenizer()
31 | gts = tokenizer.tokenize(gts)
32 | res = tokenizer.tokenize(res)
33 |
34 | # =================================================
35 | # Set up scorers
36 | # =================================================
37 | print('setting up scorers...')
38 | scorers = [
39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
40 |             (Meteor(), "METEOR"),
41 | (Rouge(), "ROUGE_L"),
42 | (Cider(), "CIDEr")
43 | ]
44 |
45 | # =================================================
46 | # Compute scores
47 | # =================================================
48 | eval = {}
49 | for scorer, method in scorers:
50 | print('computing %s score...'%(scorer.method()))
51 | score, scores = scorer.compute_score(gts, res)
52 | if type(method) == list:
53 | for sc, scs, m in zip(score, scores, method):
54 | self.setEval(sc, m)
55 | self.setImgToEvalImgs(scs, imgIds, m)
56 | print("%s: %0.3f"%(m, sc))
57 | else:
58 | self.setEval(score, method)
59 | self.setImgToEvalImgs(scores, imgIds, method)
60 | print("%s: %0.3f"%(method, score))
61 | self.setEvalImgs()
62 |
63 | def setEval(self, score, method):
64 | self.eval[method] = score
65 |
66 | def setImgToEvalImgs(self, scores, imgIds, method):
67 | for imgId, score in zip(imgIds, scores):
68 |             if imgId not in self.imgToEval:
69 | self.imgToEval[imgId] = {}
70 | self.imgToEval[imgId]["image_id"] = imgId
71 | self.imgToEval[imgId][method] = score
72 |
73 | def setEvalImgs(self):
74 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()]
75 |
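76 | # After evaluate() has run, results are exposed on the instance
77 | # (a sketch of the resulting structures):
78 | #
79 | #     self.eval      -> {'Bleu_1': ..., 'Bleu_2': ..., 'Bleu_3': ..., 'Bleu_4': ...,
80 | #                        'METEOR': ..., 'ROUGE_L': ..., 'CIDEr': ...}
81 | #     self.imgToEval -> {image_id: {'image_id': image_id, 'Bleu_1': ..., 'CIDEr': ...}}
82 | #     self.evalImgs  -> list of the per-image dicts above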
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 |
24 | The views and conclusions contained in the software and documentation are those
25 | of the authors and should not be interpreted as representing official policies,
26 | either expressed or implied, of the FreeBSD Project.
--------------------------------------------------------------------------------
/meteor/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/meteor/data/paraphrase-en.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sks3i/pycocoevalcap/e652aac37787c573eef5c4dbd6a53fa9969dd9d1/meteor/data/paraphrase-en.gz
--------------------------------------------------------------------------------
/meteor/meteor-1.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sks3i/pycocoevalcap/e652aac37787c573eef5c4dbd6a53fa9969dd9d1/meteor/meteor-1.5.jar
--------------------------------------------------------------------------------
/meteor/meteor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Python wrapper for METEOR implementation, by Xinlei Chen
4 | # Acknowledge Michael Denkowski for the generous discussion and help
5 |
6 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT
7 | # By Sabarish Sivanath
8 | # To support Python 3
9 |
10 | import os
11 | import sys
12 | import subprocess
13 | import threading
14 |
15 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed.
16 | METEOR_JAR = 'meteor-1.5.jar'
17 | # print METEOR_JAR
18 |
19 | class Meteor:
20 |
21 | def __init__(self):
22 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \
23 | '-', '-', '-stdio', '-l', 'en', '-norm']
24 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \
25 | cwd=os.path.dirname(os.path.abspath(__file__)), \
26 | stdin=subprocess.PIPE, \
27 | stdout=subprocess.PIPE, \
28 | stderr=subprocess.PIPE,
29 | universal_newlines = True,
30 | bufsize = 1)
31 | # Used to guarantee thread safety
32 | self.lock = threading.Lock()
33 |
34 | def compute_score(self, gts, res):
35 | assert(gts.keys() == res.keys())
36 | imgIds = gts.keys()
37 | scores = []
38 |
39 | eval_line = 'EVAL'
40 | self.lock.acquire()
41 | for i in imgIds:
42 | assert(len(res[i]) == 1)
43 | stat = self._stat(res[i][0], gts[i])
44 | eval_line += ' ||| {}'.format(stat)
45 |
46 | self.meteor_p.stdin.write('{}\n'.format(eval_line))
47 | for i in range(0,len(imgIds)):
48 | scores.append(float(self.meteor_p.stdout.readline().strip()))
49 | score = float(self.meteor_p.stdout.readline().strip())
50 | self.lock.release()
51 |
52 | return score, scores
53 |
54 | def method(self):
55 | return "METEOR"
56 |
57 | def _stat(self, hypothesis_str, reference_list):
58 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
59 |         hypothesis_str = hypothesis_str.replace('|||','').replace('  ',' ')
60 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
61 | self.meteor_p.stdin.write('{}\n'.format(score_line))
62 | return self.meteor_p.stdout.readline().strip()
63 |
64 | def _score(self, hypothesis_str, reference_list):
65 | self.lock.acquire()
66 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
67 |         hypothesis_str = hypothesis_str.replace('|||','').replace('  ',' ')
68 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
69 | self.meteor_p.stdin.write('{}\n'.format(score_line))
70 | stats = self.meteor_p.stdout.readline().strip()
71 | eval_line = 'EVAL ||| {}'.format(stats)
72 | # EVAL ||| stats
73 | self.meteor_p.stdin.write('{}\n'.format(eval_line))
74 | score = float(self.meteor_p.stdout.readline().strip())
75 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice
76 |         # thanks to Andrej for pointing this out
77 | score = float(self.meteor_p.stdout.readline().strip())
78 | self.lock.release()
79 | return score
80 |
81 | def __del__(self):
82 | self.lock.acquire()
83 | self.meteor_p.stdin.close()
84 | self.meteor_p.kill()
85 | self.meteor_p.wait()
86 | self.lock.release()
87 |
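88 | # Usage sketch (illustrative data; requires Java on the PATH and meteor-1.5.jar
89 | # next to this file; same gts/res format as the other scorers):
90 | #
91 | #     gts = {0: ['a cat sits on a mat', 'a cat is on the mat']}
92 | #     res = {0: ['a cat on a mat']}
93 | #     score, scores = Meteor().compute_score(gts, res)
94 | #     # score is the aggregate METEOR value; scores holds one value per image.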
--------------------------------------------------------------------------------
/rouge/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'vrama91'
2 |
--------------------------------------------------------------------------------
/rouge/rouge.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File Name : rouge.py
4 | #
5 | # Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
6 | #
7 | # Creation Date : 2015-01-07 06:03
8 | # Author : Ramakrishna Vedantam
9 |
10 | import numpy as np
11 | import pdb
12 |
13 | def my_lcs(string, sub):
14 | """
15 | Calculates longest common subsequence for a pair of tokenized strings
16 | :param string : list of str : tokens from a string split using whitespace
17 | :param sub : list of str : shorter string, also split using whitespace
18 |     :returns: length (int) : length of the longest common subsequence between the two strings
19 |
20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
21 | """
22 | if(len(string)< len(sub)):
23 | sub, string = string, sub
24 |
25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]
26 |
27 | for j in range(1,len(sub)+1):
28 | for i in range(1,len(string)+1):
29 | if(string[i-1] == sub[j-1]):
30 | lengths[i][j] = lengths[i-1][j-1] + 1
31 | else:
32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])
33 |
34 | return lengths[len(string)][len(sub)]
35 |
36 | class Rouge():
37 | '''
38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
39 |
40 | '''
41 | def __init__(self):
42 |         # vrama91: updated the value below based on discussion with Hovy
43 | self.beta = 1.2
44 |
45 | def calc_score(self, candidate, refs):
46 | """
47 | Compute ROUGE-L score given one candidate and references for an image
48 | :param candidate: str : candidate sentence to be evaluated
49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated
50 |     :returns score: float (ROUGE-L score for the candidate evaluated against references)
51 | """
52 | assert(len(candidate)==1)
53 | assert(len(refs)>0)
54 | prec = []
55 | rec = []
56 |
57 | # split into tokens
58 | token_c = candidate[0].split(" ")
59 |
60 | for reference in refs:
61 | # split into tokens
62 | token_r = reference.split(" ")
63 | # compute the longest common subsequence
64 | lcs = my_lcs(token_r, token_c)
65 | prec.append(lcs/float(len(token_c)))
66 | rec.append(lcs/float(len(token_r)))
67 |
68 | prec_max = max(prec)
69 | rec_max = max(rec)
70 |
71 | if(prec_max!=0 and rec_max !=0):
72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
73 | else:
74 | score = 0.0
75 | return score
76 |
77 | def compute_score(self, gts, res):
78 | """
79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset
80 | Invoked by evaluate_captions.py
81 |         :param gts: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
82 |         :param res: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
84 | """
85 | assert(gts.keys() == res.keys())
86 | imgIds = gts.keys()
87 |
88 | score = []
89 | for id in imgIds:
90 | hypo = res[id]
91 | ref = gts[id]
92 |
93 | score.append(self.calc_score(hypo, ref))
94 |
95 | # Sanity check.
96 | assert(type(hypo) is list)
97 | assert(len(hypo) == 1)
98 | assert(type(ref) is list)
99 | assert(len(ref) > 0)
100 |
101 | average_score = np.mean(np.array(score))
102 | return average_score, np.array(score)
103 |
104 | def method(self):
105 | return "Rouge"
106 |
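107 | # Usage sketch (illustrative data; the candidate is a one-element list and refs
108 | # a list of reference sentences, all whitespace-tokenized strings):
109 | #
110 | #     score = Rouge().calc_score(['a cat on a mat'],
111 | #                                ['a cat sits on a mat', 'a cat is on the mat'])
112 | #     # compute_score(gts, res) applies calc_score over {image_id: [caption, ...]} dicts.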
--------------------------------------------------------------------------------
/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'hfang'
2 |
--------------------------------------------------------------------------------
/tokenizer/ptbtokenizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File Name : ptbtokenizer.py
4 | #
5 | # Description : Do the PTB Tokenization and remove punctuation.
6 | #
7 | # Creation Date : 29-12-2014
8 | # Last Modified : Thu Mar 19 09:53:35 2015
9 | # Authors : Hao Fang and Tsung-Yi Lin
10 |
11 | import os
12 | import sys
13 | import subprocess
14 | import tempfile
15 | import itertools
16 |
17 |
18 | # Last modified : Wed 22 May 2019 08:10:00 PM EDT
19 | # By Sabarish Sivanath
20 | # To support Python 3
21 |
22 | # path to the stanford corenlp jar
23 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'
24 |
25 | # punctuations to be removed from the sentences
26 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \
27 | ".", "?", "!", ",", ":", "-", "--", "...", ";"]
28 |
29 | class PTBTokenizer:
30 | """Python wrapper of Stanford PTBTokenizer"""
31 |
32 | def tokenize(self, captions_for_image):
33 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \
34 | 'edu.stanford.nlp.process.PTBTokenizer', \
35 | '-preserveLines', '-lowerCase']
36 |
37 | # ======================================================
38 | # prepare data for PTB Tokenizer
39 | # ======================================================
40 | final_tokenized_captions_for_image = {}
41 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
42 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v])
43 |
44 | # ======================================================
45 | # save sentences to temporary file
46 | # ======================================================
47 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__))
48 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
49 | tmp_file.write(sentences.encode('utf-8'))
50 | tmp_file.close()
51 |
52 | # ======================================================
53 | # tokenize sentence
54 | # ======================================================
55 | cmd.append(os.path.basename(tmp_file.name))
56 | p_tokenizer = subprocess.Popen(cmd,
57 | cwd=path_to_jar_dirname,
58 | stdout=subprocess.PIPE,
59 | universal_newlines = True,
60 | bufsize = 1)
61 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0]
62 | lines = token_lines.split('\n')
63 | # remove temp file
64 | os.remove(tmp_file.name)
65 |
66 | # ======================================================
67 | # create dictionary for tokenized captions
68 | # ======================================================
69 | for k, line in zip(image_id, lines):
70 |             if k not in final_tokenized_captions_for_image:
71 | final_tokenized_captions_for_image[k] = []
72 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \
73 | if w not in PUNCTUATIONS])
74 | final_tokenized_captions_for_image[k].append(tokenized_caption)
75 |
76 | return final_tokenized_captions_for_image
77 |
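78 | # Usage sketch (illustrative data; requires Java on the PATH and
79 | # stanford-corenlp-3.4.1.jar next to this file). The input maps each image id to
80 | # a list of {'caption': ...} dicts; the output maps it to lowercased,
81 | # punctuation-free tokenized strings:
82 | #
83 | #     captions = {0: [{'caption': 'A cat sits on the mat.'}]}
84 | #     tokenized = PTBTokenizer().tokenize(captions)
85 | #     # tokenized -> {0: ['a cat sits on the mat']}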
--------------------------------------------------------------------------------
/tokenizer/stanford-corenlp-3.4.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sks3i/pycocoevalcap/e652aac37787c573eef5c4dbd6a53fa9969dd9d1/tokenizer/stanford-corenlp-3.4.1.jar
--------------------------------------------------------------------------------