import spacy
import copy
from collections import defaultdict
import numpy as np
import math
"""
Code adapted from: https://github.com/INK-USC/CommonGen/tree/master/evaluation/Traditional/eval_metrics
"""
[docs]def precook(s, n=4, out=False):
"""
Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well.
:param s: string : sentence to be converted into ngrams
:param n: int : number of ngrams for which representation is calculated
:return: term frequency vector for occuring ngrams
"""
words = s.split()
counts = defaultdict(int)
for k in range(1, n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return counts
[docs]def cook_refs(refs, n=4): # lhuang: oracle will call with "average"
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.
:param refs: list of string : reference sentences for some image
:param n: int : number of ngrams for which (ngram) representation is calculated
:return: result (list of dict)
'''
return [precook(ref, n) for ref in refs]
[docs]def cook_test(test, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.
:param test: list of string : hypothesis sentence for some image
:param n: int : number of ngrams for which (ngram) representation is calculated
:return: result (dict)
'''
return precook(test, n, True)
[docs]class CiderScorer(object):
"""CIDEr scorer.
"""
[docs] def copy(self):
''' copy the refs.'''
new = CiderScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
return new
[docs] def __init__(self, test=None, refs=None, n=4, sigma=6.0):
''' singular instance '''
self.n = n
self.sigma = sigma
self.crefs = []
self.ctest = []
self.document_frequency = defaultdict(float)
self.cook_append(test, refs)
self.ref_len = None
[docs] def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
self.ctest.append(cook_test(test)) # N.B.: -1
else:
# lens of crefs and ctest have to match
self.ctest.append(None)
[docs] def size(self):
assert len(self.crefs) == len(
self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)
def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''
if type(other) is tuple:
# avoid creating new CiderScorer instances
self.cook_append(other[0], other[1])
else:
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
return self
[docs] def compute_doc_freq(self):
'''
Compute term frequency for reference data.
This will be used to compute idf (inverse document frequency later)
The term frequency is stored in the object
:return: None
'''
for refs in self.crefs:
# refs, k ref captions of one image
for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]):
self.document_frequency[ngram] += 1
# maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
[docs] def compute_cider(self):
def counts2vec(cnts):
"""
Function maps counts of ngram to vector of tfidf weights.
The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights.
The n-th entry of array denotes length of n-grams.
:param cnts:
:return: vec (array of dict), norm (array of float), length (int)
"""
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram, term_freq) in cnts.items():
# give word count 1 if it doesn't appear in reference corpus
df = np.log(max(1.0, self.document_frequency[ngram]))
# ngram index
n = len(ngram)-1
# tf (term_freq) * idf (precomputed idf) for n-grams
vec[n][ngram] = float(term_freq)*(self.ref_len - df)
# compute norm for the vector. the norm will be used for computing similarity
norm[n] += pow(vec[n][ngram], 2)
if n == 1:
length += term_freq
norm = [np.sqrt(n) for n in norm]
return vec, norm, length
def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
'''
Compute the cosine similarity of two vectors.
:param vec_hyp: array of dictionary for vector corresponding to hypothesis
:param vec_ref: array of dictionary for vector corresponding to reference
:param norm_hyp: array of float for vector corresponding to hypothesis
:param norm_ref: array of float for vector corresponding to reference
:param length_hyp: int containing length of hypothesis
:param length_ref: int containing length of reference
:return: array of score for each n-grams cosine similarity
'''
delta = float(length_hyp - length_ref)
# measure consine similarity
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
# ngram
for (ngram, count) in vec_hyp[n].items():
# vrama91 : added clipping
val[n] += min(vec_hyp[n][ngram], vec_ref[n]
[ngram]) * vec_ref[n][ngram]
if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
val[n] /= (norm_hyp[n]*norm_ref[n])
assert(not math.isnan(val[n]))
# vrama91: added a length based gaussian penalty
val[n] *= np.e**(-(delta**2)/(2*self.sigma**2))
return val
# compute log reference length
self.ref_len = np.log(float(len(self.crefs)))
scores = []
for test, refs in zip(self.ctest, self.crefs):
# compute vector for test captions
vec, norm, length = counts2vec(test)
# compute vector for ref captions
score = np.array([0.0 for _ in range(self.n)])
for ref in refs:
vec_ref, norm_ref, length_ref = counts2vec(ref)
score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
# change by vrama91 - mean of ngram scores, instead of sum
score_avg = np.mean(score)
# divide by number of references
score_avg /= len(refs)
# multiply score by 10
score_avg *= 10.0
# append score of an image to the score list
scores.append(score_avg)
return scores
[docs] def compute_score(self, option=None, verbose=0):
# compute idf
self.compute_doc_freq()
# assert to check document frequency
assert(len(self.ctest) >= max(self.document_frequency.values()))
# compute cider score
score = self.compute_cider()
# debug
# print score
return np.mean(np.array(score)), np.array(score)
[docs]class Cider:
"""
Main Class to compute the CIDEr metric
"""
[docs] def __init__(self, test=None, refs=None, n=4, sigma=6.0):
# set cider to sum over 1 to 4-grams
self._n = n
# set the standard deviation parameter for gaussian penalty
self._sigma = sigma
self._nlp = spacy.load("en_core_web_sm")
# keep only tagger
for pipe in ["tok2vec", "parser", "ner", "attribute_ruler", "lemmatizer"]:
self._nlp.remove_pipe(pipe)
[docs] def tokenize(self, dict):
for key in dict:
new_sentence_list = []
for sentence in dict[key]:
a = ''
for token in self._nlp(str(sentence)):
a += token.text
a += ' '
new_sentence_list.append(a.rstrip())
dict[key] = new_sentence_list
return dict
[docs] def compute_score(self, gts, res):
# tokenize
gts = self.tokenize(gts)
res = self.tokenize(res)
assert(gts.keys() == res.keys())
imgIds = gts.keys()
cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)
for id in imgIds:
hypo = res[id]
ref = gts[id]
# Sanity check.
assert(type(hypo) is list)
assert(len(hypo) == 1)
assert(type(ref) is list)
assert(len(ref) > 0)
cider_scorer += (hypo[0], ref)
(score, scores) = cider_scorer.compute_score()
individual_scores = {image_id: score for image_id,
score in zip(imgIds, scores)}
return score, individual_scores
[docs] def method(self):
return "CIDEr"
if __name__ == '__main__':
gts = {"cat#dog#boy": ["The dog is the boy's cat.", "The dog eats the cat of the boy."],
"apple#tree#boy": ["A boy is picking apples from trees."]}
res = {"cat#dog#boy": ["The dog is the boy's cat."],
"apple#tree#boy": ["A boy is picking apples from trees and put them into bags."]}
metric = Cider()
print(metric.compute_score(gts, res))