326 lines
13 KiB
Python
326 lines
13 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Natural Language Toolkit: RIBES Score
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
|
|
# Mark Byers, ekhumoro, P. Ortiz
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
""" RIBES score implementation """
|
|
|
|
from itertools import islice
|
|
import math
|
|
|
|
from nltk.util import ngrams, choose
|
|
|
|
|
|
def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10):
|
|
"""
|
|
The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from
|
|
Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and
|
|
Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for
|
|
Distant Language Pairs". In Proceedings of EMNLP.
|
|
http://www.aclweb.org/anthology/D/D10/D10-1092.pdf
|
|
|
|
The generic RIBES scores used in shared task, e.g. Workshop for
|
|
Asian Translation (WAT) uses the following RIBES calculations:
|
|
|
|
RIBES = kendall_tau * (alpha**p1) * (beta**bp)
|
|
|
|
Please note that this re-implementation differs from the official
|
|
RIBES implementation and though it emulates the results as describe
|
|
in the original paper, there are further optimization implemented
|
|
in the official RIBES script.
|
|
|
|
Users are encouraged to use the official RIBES script instead of this
|
|
implementation when evaluating your machine translation system. Refer
|
|
to http://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script.
|
|
|
|
:param references: a list of reference sentences
|
|
:type reference: list(list(str))
|
|
:param hypothesis: a hypothesis sentence
|
|
:type hypothesis: list(str)
|
|
:param alpha: hyperparameter used as a prior for the unigram precision.
|
|
:type alpha: float
|
|
:param beta: hyperparameter used as a prior for the brevity penalty.
|
|
:type beta: float
|
|
:return: The best ribes score from one of the references.
|
|
:rtype: float
|
|
"""
|
|
best_ribes = -1.0
|
|
# Calculates RIBES for each reference and returns the best score.
|
|
for reference in references:
|
|
# Collects the *worder* from the ranked correlation alignments.
|
|
worder = word_rank_alignment(reference, hypothesis)
|
|
nkt = kendall_tau(worder)
|
|
|
|
# Calculates the brevity penalty
|
|
bp = min(1.0, math.exp(1.0 - len(reference) / len(hypothesis)))
|
|
|
|
# Calculates the unigram precision, *p1*
|
|
p1 = len(worder) / len(hypothesis)
|
|
|
|
_ribes = nkt * (p1 ** alpha) * (bp ** beta)
|
|
|
|
if _ribes > best_ribes: # Keeps the best score.
|
|
best_ribes = _ribes
|
|
|
|
return best_ribes
|
|
|
|
|
|
def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10):
|
|
"""
|
|
This function "calculates RIBES for a system output (hypothesis) with
|
|
multiple references, and returns "best" score among multi-references and
|
|
individual scores. The scores are corpus-wise, i.e., averaged by the number
|
|
of sentences." (c.f. RIBES version 1.03.1 code).
|
|
|
|
Different from BLEU's micro-average precision, RIBES calculates the
|
|
macro-average precision by averaging the best RIBES score for each pair of
|
|
hypothesis and its corresponding references
|
|
|
|
>>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
|
|
... 'ensures', 'that', 'the', 'military', 'always',
|
|
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
|
|
>>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
|
|
... 'ensures', 'that', 'the', 'military', 'will', 'forever',
|
|
... 'heed', 'Party', 'commands']
|
|
>>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
|
|
... 'guarantees', 'the', 'military', 'forces', 'always',
|
|
... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
|
|
>>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
|
|
... 'army', 'always', 'to', 'heed', 'the', 'directions',
|
|
... 'of', 'the', 'party']
|
|
|
|
>>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
|
|
... 'interested', 'in', 'world', 'history']
|
|
>>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
|
|
... 'because', 'he', 'read', 'the', 'book']
|
|
|
|
>>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
|
|
>>> hypotheses = [hyp1, hyp2]
|
|
>>> round(corpus_ribes(list_of_references, hypotheses),4)
|
|
0.3597
|
|
|
|
:param references: a corpus of lists of reference sentences, w.r.t. hypotheses
|
|
:type references: list(list(list(str)))
|
|
:param hypotheses: a list of hypothesis sentences
|
|
:type hypotheses: list(list(str))
|
|
:param alpha: hyperparameter used as a prior for the unigram precision.
|
|
:type alpha: float
|
|
:param beta: hyperparameter used as a prior for the brevity penalty.
|
|
:type beta: float
|
|
:return: The best ribes score from one of the references.
|
|
:rtype: float
|
|
"""
|
|
corpus_best_ribes = 0.0
|
|
# Iterate through each hypothesis and their corresponding references.
|
|
for references, hypothesis in zip(list_of_references, hypotheses):
|
|
corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta)
|
|
return corpus_best_ribes / len(hypotheses)
|
|
|
|
|
|
def position_of_ngram(ngram, sentence):
|
|
"""
|
|
This function returns the position of the first instance of the ngram
|
|
appearing in a sentence.
|
|
|
|
Note that one could also use string as follows but the code is a little
|
|
convoluted with type casting back and forth:
|
|
|
|
char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))]
|
|
word_pos = char_pos.count(' ')
|
|
|
|
Another way to conceive this is:
|
|
|
|
return next(i for i, ng in enumerate(ngrams(sentence, len(ngram)))
|
|
if ng == ngram)
|
|
|
|
:param ngram: The ngram that needs to be searched
|
|
:type ngram: tuple
|
|
:param sentence: The list of tokens to search from.
|
|
:type sentence: list(str)
|
|
"""
|
|
# Iterates through the ngrams in sentence.
|
|
for i, sublist in enumerate(ngrams(sentence, len(ngram))):
|
|
# Returns the index of the word when ngram matches.
|
|
if ngram == sublist:
|
|
return i
|
|
|
|
|
|
def word_rank_alignment(reference, hypothesis, character_based=False):
|
|
"""
|
|
This is the word rank alignment algorithm described in the paper to produce
|
|
the *worder* list, i.e. a list of word indices of the hypothesis word orders
|
|
w.r.t. the list of reference words.
|
|
|
|
Below is (H0, R0) example from the Isozaki et al. 2010 paper,
|
|
note the examples are indexed from 1 but the results here are indexed from 0:
|
|
|
|
>>> ref = str('he was interested in world history because he '
|
|
... 'read the book').split()
|
|
>>> hyp = str('he read the book because he was interested in world '
|
|
... 'history').split()
|
|
>>> word_rank_alignment(ref, hyp)
|
|
[7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
|
|
|
|
The (H1, R1) example from the paper, note the 0th index:
|
|
|
|
>>> ref = 'John hit Bob yesterday'.split()
|
|
>>> hyp = 'Bob hit John yesterday'.split()
|
|
>>> word_rank_alignment(ref, hyp)
|
|
[2, 1, 0, 3]
|
|
|
|
Here is the (H2, R2) example from the paper, note the 0th index here too:
|
|
|
|
>>> ref = 'the boy read the book'.split()
|
|
>>> hyp = 'the book was read by the boy'.split()
|
|
>>> word_rank_alignment(ref, hyp)
|
|
[3, 4, 2, 0, 1]
|
|
|
|
:param reference: a reference sentence
|
|
:type reference: list(str)
|
|
:param hypothesis: a hypothesis sentence
|
|
:type hypothesis: list(str)
|
|
"""
|
|
worder = []
|
|
hyp_len = len(hypothesis)
|
|
# Stores a list of possible ngrams from the reference sentence.
|
|
# This is used for matching context window later in the algorithm.
|
|
ref_ngrams = []
|
|
hyp_ngrams = []
|
|
for n in range(1, len(reference) + 1):
|
|
for ng in ngrams(reference, n):
|
|
ref_ngrams.append(ng)
|
|
for ng in ngrams(hypothesis, n):
|
|
hyp_ngrams.append(ng)
|
|
for i, h_word in enumerate(hypothesis):
|
|
# If word is not in the reference, continue.
|
|
if h_word not in reference:
|
|
continue
|
|
# If we can determine one-to-one word correspondence for unigrams that
|
|
# only appear once in both the reference and hypothesis.
|
|
elif hypothesis.count(h_word) == reference.count(h_word) == 1:
|
|
worder.append(reference.index(h_word))
|
|
else:
|
|
max_window_size = max(i, hyp_len - i + 1)
|
|
for window in range(1, max_window_size):
|
|
if i + window < hyp_len: # If searching the right context is possible.
|
|
# Retrieve the right context window.
|
|
right_context_ngram = tuple(islice(hypothesis, i, i + window + 1))
|
|
num_times_in_ref = ref_ngrams.count(right_context_ngram)
|
|
num_times_in_hyp = hyp_ngrams.count(right_context_ngram)
|
|
# If ngram appears only once in both ref and hyp.
|
|
if num_times_in_ref == num_times_in_hyp == 1:
|
|
# Find the position of ngram that matched the reference.
|
|
pos = position_of_ngram(right_context_ngram, reference)
|
|
worder.append(pos) # Add the positions of the ngram.
|
|
break
|
|
if window <= i: # If searching the left context is possible.
|
|
# Retrieve the left context window.
|
|
left_context_ngram = tuple(islice(hypothesis, i - window, i + 1))
|
|
num_times_in_ref = ref_ngrams.count(left_context_ngram)
|
|
num_times_in_hyp = hyp_ngrams.count(left_context_ngram)
|
|
if num_times_in_ref == num_times_in_hyp == 1:
|
|
# Find the position of ngram that matched the reference.
|
|
pos = position_of_ngram(left_context_ngram, reference)
|
|
# Add the positions of the ngram.
|
|
worder.append(pos + len(left_context_ngram) - 1)
|
|
break
|
|
return worder
|
|
|
|
|
|
def find_increasing_sequences(worder):
|
|
"""
|
|
Given the *worder* list, this function groups monotonic +1 sequences.
|
|
|
|
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
|
|
>>> list(find_increasing_sequences(worder))
|
|
[(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]
|
|
|
|
:param worder: The worder list output from word_rank_alignment
|
|
:param type: list(int)
|
|
"""
|
|
items = iter(worder)
|
|
a, b = None, next(items, None)
|
|
result = [b]
|
|
while b is not None:
|
|
a, b = b, next(items, None)
|
|
if b is not None and a + 1 == b:
|
|
result.append(b)
|
|
else:
|
|
if len(result) > 1:
|
|
yield tuple(result)
|
|
result = [b]
|
|
|
|
|
|
def kendall_tau(worder, normalize=True):
|
|
"""
|
|
Calculates the Kendall's Tau correlation coefficient given the *worder*
|
|
list of word alignments from word_rank_alignment(), using the formula:
|
|
|
|
tau = 2 * num_increasing_pairs / num_possible pairs -1
|
|
|
|
Note that the no. of increasing pairs can be discontinuous in the *worder*
|
|
list and each each increasing sequence can be tabulated as choose(len(seq), 2)
|
|
no. of increasing pairs, e.g.
|
|
|
|
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
|
|
>>> number_possible_pairs = choose(len(worder), 2)
|
|
>>> round(kendall_tau(worder, normalize=False),3)
|
|
-0.236
|
|
>>> round(kendall_tau(worder),3)
|
|
0.382
|
|
|
|
:param worder: The worder list output from word_rank_alignment
|
|
:type worder: list(int)
|
|
:param normalize: Flag to indicate normalization
|
|
:type normalize: boolean
|
|
:return: The Kendall's Tau correlation coefficient.
|
|
:rtype: float
|
|
"""
|
|
worder_len = len(worder)
|
|
# Extract the groups of increasing/monotonic sequences.
|
|
increasing_sequences = find_increasing_sequences(worder)
|
|
# Calculate no. of increasing_pairs in *worder* list.
|
|
num_increasing_pairs = sum(choose(len(seq), 2) for seq in increasing_sequences)
|
|
# Calculate no. of possible pairs.
|
|
num_possible_pairs = choose(worder_len, 2)
|
|
# Kendall's Tau computation.
|
|
tau = 2 * num_increasing_pairs / num_possible_pairs - 1
|
|
if normalize: # If normalized, the tau output falls between 0.0 to 1.0
|
|
return (tau + 1) / 2
|
|
else: # Otherwise, the tau outputs falls between -1.0 to +1.0
|
|
return tau
|
|
|
|
|
|
def spearman_rho(worder, normalize=True):
|
|
"""
|
|
Calculates the Spearman's Rho correlation coefficient given the *worder*
|
|
list of word alignment from word_rank_alignment(), using the formula:
|
|
|
|
rho = 1 - sum(d**2) / choose(len(worder)+1, 3)
|
|
|
|
Given that d is the sum of difference between the *worder* list of indices
|
|
and the original word indices from the reference sentence.
|
|
|
|
Using the (H0,R0) and (H5, R5) example from the paper
|
|
|
|
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
|
|
>>> round(spearman_rho(worder, normalize=False), 3)
|
|
-0.591
|
|
>>> round(spearman_rho(worder), 3)
|
|
0.205
|
|
|
|
:param worder: The worder list output from word_rank_alignment
|
|
:param type: list(int)
|
|
"""
|
|
worder_len = len(worder)
|
|
sum_d_square = sum((wi - i) ** 2 for wi, i in zip(worder, range(worder_len)))
|
|
rho = 1 - sum_d_square / choose(worder_len + 1, 3)
|
|
|
|
if normalize: # If normalized, the rho output falls between 0.0 to 1.0
|
|
return (rho + 1) / 2
|
|
else: # Otherwise, the rho outputs falls between -1.0 to +1.0
|
|
return rho
|