# Source code for rl4lms.envs.text_generation.caption_metrics.cider


import spacy

import copy
from collections import defaultdict
import numpy as np
import math

"""
Code adapted from: https://github.com/INK-USC/CommonGen/tree/master/evaluation/Traditional/eval_metrics
"""


def precook(s, n=4, out=False):
    """Count every n-gram of order 1..n in a whitespace-tokenized sentence.

    The result can be handed to the scorer directly; ``cook_refs`` and
    ``cook_test`` are thin wrappers around this function.

    :param s: string : sentence to be converted into n-grams
    :param n: int : highest n-gram order to count
    :param out: unused; accepted only so existing callers keep working
    :return: defaultdict(int) mapping n-gram tuples to their occurrence count
    """
    tokens = s.split()
    ngram_counts = defaultdict(int)
    for order in range(1, n + 1):
        # Zipping `order` progressively shifted views of the token list yields
        # each contiguous window of that length, left to right.
        shifted_views = [tokens[offset:] for offset in range(order)]
        for window in zip(*shifted_views):
            ngram_counts[window] += 1
    return ngram_counts
def cook_refs(refs, n=4):
    """Convert the reference sentences of one segment into n-gram counts.

    :param refs: list of string : reference sentences for a single segment
    :param n: int : highest n-gram order to count
    :return: list with one ``precook`` result per reference, in input order
    """
    cooked = []
    for reference in refs:
        cooked.append(precook(reference, n))
    return cooked
def cook_test(test, n=4):
    """Convert a single hypothesis sentence into n-gram counts.

    :param test: string : hypothesis sentence (it is passed straight to
        ``precook``, which splits it on whitespace)
    :param n: int : highest n-gram order to count
    :return: the ``precook`` result for the sentence
    """
    return precook(test, n=n, out=True)
class CiderScorer(object):
    """Accumulate hypothesis/reference pairs and compute CIDEr over them.

    Entries are added through the constructor, ``cook_append`` or ``+=``.
    ``compute_score`` then derives document frequencies from all stored
    references and scores every stored hypothesis against its references.
    """

    def copy(self):
        """Return a new scorer with the same settings and shallow copies of the entry lists."""
        # sigma is forwarded as well so the copy applies the same length penalty.
        new = CiderScorer(n=self.n, sigma=self.sigma)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        return new

    def __init__(self, test=None, refs=None, n=4, sigma=6.0):
        """Create a scorer, optionally seeded with one hypothesis/reference entry.

        :param test: string or None : hypothesis sentence
        :param refs: list of string or None : reference sentences for the hypothesis
        :param n: int : highest n-gram order that is scored
        :param sigma: float : standard deviation of the gaussian length penalty
        """
        self.n = n
        self.sigma = sigma
        self.crefs = []
        self.ctest = []
        self.document_frequency = defaultdict(float)
        self.ref_len = None
        self.cook_append(test, refs)

    def cook_append(self, test, refs):
        """Cook one entry and store it; called by the constructor and ``__iadd__``.

        Nothing is stored when ``refs`` is None. When ``refs`` is given but
        ``test`` is None, a ``None`` placeholder is stored so that ``crefs``
        and ``ctest`` keep the same length.
        """
        if refs is not None:
            # Cook with this scorer's own n-gram order: the tf-idf vectors built
            # later have exactly self.n slots, one per order.
            self.crefs.append(cook_refs(refs, n=self.n))
            if test is not None:
                self.ctest.append(cook_test(test, n=self.n))
            else:
                self.ctest.append(None)

    def size(self):
        """Return the number of stored entries, asserting refs and tests are aligned."""
        assert len(self.crefs) == len(
            self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
        return len(self.crefs)

    def __iadd__(self, other):
        """Add an entry given as a ``(test, refs)`` tuple, or merge another scorer's entries."""
        if isinstance(other, tuple):
            # avoid creating new CiderScorer instances
            self.cook_append(other[0], other[1])
        else:
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)
        return self

    def compute_doc_freq(self):
        """Count, for every n-gram, the number of entries whose references contain it.

        The counts are rebuilt from scratch on each call and stored in
        ``self.document_frequency``; they are used later for the idf term.

        :return: None
        """
        # Start from empty counts so repeated scoring does not double-count.
        self.document_frequency.clear()
        for refs in self.crefs:
            # refs: the cooked reference captions of one entry; each n-gram
            # counts once per entry no matter how many references contain it.
            for ngram in {ngram for ref in refs for ngram in ref}:
                self.document_frequency[ngram] += 1

    def _counts2vec(self, cnts):
        """Map n-gram counts to tf-idf vectors, one dictionary per n-gram order.

        :param cnts: dict mapping n-gram tuples to term frequencies
        :return: vec (list of dict), norm (list of float), length (int)
        """
        vec = [defaultdict(float) for _ in range(self.n)]
        length = 0
        norm = [0.0 for _ in range(self.n)]
        for ngram, term_freq in cnts.items():
            # An n-gram absent from the references is treated as having a
            # document count of 1. `.get` keeps the lookup from inserting it.
            df = np.log(max(1.0, self.document_frequency.get(ngram, 0.0)))
            order = len(ngram) - 1
            # tf * idf, where idf = log(number of entries) - log(document count)
            vec[order][ngram] = float(term_freq) * (self.ref_len - df)
            norm[order] += pow(vec[order][ngram], 2)
            # NOTE: "length" sums the counts stored at index 1 (the bigrams).
            # This is kept exactly as in the code this file was adapted from so
            # that scores stay comparable.
            if order == 1:
                length += term_freq
        norm = [np.sqrt(value) for value in norm]
        return vec, norm, length

    def _sim(self, vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
        """Compute the clipped, length-penalized cosine similarity per n-gram order.

        :param vec_hyp: list of dict : tf-idf vectors of the hypothesis
        :param vec_ref: list of dict : tf-idf vectors of the reference
        :param norm_hyp: list of float : vector norms of the hypothesis
        :param norm_ref: list of float : vector norms of the reference
        :param length_hyp: int : length of the hypothesis
        :param length_ref: int : length of the reference
        :return: numpy array with one similarity per n-gram order
        """
        delta = float(length_hyp - length_ref)
        # The gaussian length penalty is the same for every n-gram order.
        penalty = np.e ** (-(delta ** 2) / (2 * self.sigma ** 2))
        val = np.zeros(self.n)
        for order in range(self.n):
            for ngram, hyp_weight in vec_hyp[order].items():
                ref_weight = vec_ref[order].get(ngram, 0.0)
                # clip the hypothesis weight at the reference weight
                val[order] += min(hyp_weight, ref_weight) * ref_weight
            if (norm_hyp[order] != 0) and (norm_ref[order] != 0):
                val[order] /= (norm_hyp[order] * norm_ref[order])
            assert(not math.isnan(val[order]))
            val[order] *= penalty
        return val

    def compute_cider(self):
        """Score every stored hypothesis against its references.

        Expects ``compute_doc_freq`` to have been run beforehand.

        :return: list with one score per stored entry
        """
        # log of the number of entries, used as the first term of the idf
        self.ref_len = np.log(float(len(self.crefs)))

        scores = []
        for test, refs in zip(self.ctest, self.crefs):
            vec, norm, length = self._counts2vec(test)
            score = np.zeros(self.n)
            for ref in refs:
                vec_ref, norm_ref, length_ref = self._counts2vec(ref)
                score += self._sim(vec, vec_ref, norm, norm_ref, length, length_ref)
            # mean over n-gram orders, averaged over references, scaled by 10
            score_avg = np.mean(score)
            score_avg /= len(refs)
            score_avg *= 10.0
            scores.append(score_avg)
        return scores

    def compute_score(self, option=None, verbose=0):
        """Compute document frequencies and score all stored entries.

        :param option: unused
        :param verbose: unused
        :return: (mean score over all entries, numpy array of per-entry scores)
        """
        self.compute_doc_freq()
        # an n-gram cannot occur in more entries than there are entries
        assert(len(self.ctest) >= max(self.document_frequency.values()))
        score = self.compute_cider()
        return np.mean(np.array(score)), np.array(score)
class Cider:
    """
    Main Class to compute the CIDEr metric
    """

    def __init__(self, test=None, refs=None, n=4, sigma=6.0):
        """Set up the metric and load the spaCy pipeline used for tokenization.

        :param test: unused
        :param refs: unused
        :param n: int : highest n-gram order that is scored
        :param sigma: float : standard deviation of the gaussian length penalty
        """
        # set cider to sum over 1 to 4-grams
        self._n = n
        # set the standard deviation parameter for gaussian penalty
        self._sigma = sigma
        self._nlp = spacy.load("en_core_web_sm")
        # keep only tagger
        for pipe in ["tok2vec", "parser", "ner", "attribute_ruler", "lemmatizer"]:
            # Skip components the loaded model does not provide instead of
            # failing on remove_pipe.
            if pipe in self._nlp.pipe_names:
                self._nlp.remove_pipe(pipe)

    def tokenize(self, dict):
        """Tokenize every sentence and re-join its tokens with single spaces.

        The mapping is updated in place and also returned.

        :param dict: mapping from id to a list of sentences
        :return: the same mapping, with each sentence replaced by its
            space-joined tokens
        """
        for key in dict:
            dict[key] = [
                " ".join(token.text for token in self._nlp(str(sentence))).rstrip()
                for sentence in dict[key]
            ]
        return dict

    def compute_score(self, gts, res):
        """Compute CIDEr for a set of hypotheses against their references.

        :param gts: mapping from id to a non-empty list of reference sentences
        :param res: mapping from id to a list holding exactly one hypothesis
        :return: (mean score, dict mapping each id to its individual score)
        """
        # Tokenize shallow copies: tokenize() rebinds each key to a new list,
        # so the caller's mappings and sentence lists are left untouched.
        gts = self.tokenize(dict(gts))
        res = self.tokenize(dict(res))

        assert(gts.keys() == res.keys())
        image_ids = list(gts.keys())

        cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)

        for image_id in image_ids:
            hypo = res[image_id]
            ref = gts[image_id]

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) > 0)

            cider_scorer += (hypo[0], ref)

        (score, scores) = cider_scorer.compute_score()

        # per-entry scores come back in the order the entries were added
        individual_scores = dict(zip(image_ids, scores))

        return score, individual_scores

    def method(self):
        """Return the display name of the metric."""
        return "CIDEr"
if __name__ == '__main__':
    # Small smoke run: two ids, scored against one or two references each.
    references = {
        "cat#dog#boy": [
            "The dog is the boy's cat.",
            "The dog eats the cat of the boy.",
        ],
        "apple#tree#boy": [
            "A boy is picking apples from trees.",
        ],
    }
    hypotheses = {
        "cat#dog#boy": [
            "The dog is the boy's cat.",
        ],
        "apple#tree#boy": [
            "A boy is picking apples from trees and put them into bags.",
        ],
    }

    cider = Cider()
    result = cider.compute_score(references, hypotheses)
    print(result)