mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-03 01:14:15 +03:00
the script now calculates the p-value and confidence intervals not only using BLEU, but also the NIST score;
improved confidence interval representation (avg+-stddev); fixed bugs git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3345 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
a21c9bff68
commit
1f6e9b488b
@ -8,7 +8,9 @@
|
||||
#
|
||||
# Author: Mark Fishel, fishel@ut.ee
|
||||
#
|
||||
# 22.10: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
|
||||
# 22.10.2008: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
|
||||
#
|
||||
# 23.01.2010: added NIST p-value and interval computation
|
||||
###############################################
|
||||
|
||||
use strict;
|
||||
@ -16,7 +18,7 @@ use strict;
|
||||
#constants
|
||||
my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
|
||||
my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
|
||||
my $MAX_NGRAMS_FOR_BLEU = 4;
|
||||
my $MAX_NGRAMS = 4;
|
||||
|
||||
#checking cmdline argument consistency
|
||||
if (@ARGV < 3) {
|
||||
@ -34,6 +36,7 @@ print "reading data; " . `date`;
|
||||
#read all data
|
||||
my $data = readAllData(@ARGV);
|
||||
|
||||
my $verbose = $ARGV[3];
|
||||
|
||||
#calculate each sentence's contribution to BP and ngram precision
|
||||
print "performing preliminary calculations (hypothesis 1); " . `date`;
|
||||
@ -43,74 +46,113 @@ print "performing preliminary calculations (hypothesis 2); " . `date`;
|
||||
preEvalHypo($data, "hyp2");
|
||||
|
||||
#start comparing
|
||||
print "comparing hypotheses; " . `date`;
|
||||
print "comparing hypotheses -- this may take some time; " . `date`;
|
||||
|
||||
my @subSampleBleuDiffArr;
|
||||
my @subSampleBleu1Arr;
|
||||
my @subSampleBleu2Arr;
|
||||
bootstrap_report("BLEU", \&getBleu);
|
||||
bootstrap_report("NIST", \&getNist);
|
||||
|
||||
#applying sampling
|
||||
for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
|
||||
#####
|
||||
#
|
||||
#####
|
||||
sub bootstrap_report {
|
||||
my $title = shift;
|
||||
my $proc = shift;
|
||||
|
||||
my ($subSampleScoreDiffArr, $subSampleScore1Arr, $subSampleScore2Arr) = bootstrap_pass($proc);
|
||||
|
||||
my $realScore1 = &$proc($data->{refs}, $data->{hyp1});
|
||||
my $realScore2 = &$proc($data->{refs}, $data->{hyp2});
|
||||
|
||||
my $scorePValue = bootstrap_pvalue($subSampleScoreDiffArr, $realScore1, $realScore2);
|
||||
|
||||
my ($scoreAvg1, $scoreVar1) = bootstrap_interval($subSampleScore1Arr);
|
||||
my ($scoreAvg2, $scoreVar2) = bootstrap_interval($subSampleScore2Arr);
|
||||
|
||||
print "\n---=== $title score ===---\n";
|
||||
|
||||
print "actual score of hypothesis 1: $realScore1\n";
|
||||
print "95% confidence interval for hypothesis 1 score: $scoreAvg1 +- $scoreVar1\n-----\n";
|
||||
print "actual score of hypothesis 1: $realScore2\n";
|
||||
print "95% confidence interval for hypothesis 2 score: $scoreAvg2 +- $scoreVar2\n-----\n";
|
||||
print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n";
|
||||
print "the probability of actually getting them (p-value) is: $scorePValue.\n";
|
||||
}
|
||||
|
||||
#####
|
||||
#
|
||||
#####
|
||||
sub bootstrap_pass {
|
||||
my $scoreFunc = shift;
|
||||
|
||||
my @subSampleDiffArr;
|
||||
my @subSample1Arr;
|
||||
my @subSample2Arr;
|
||||
|
||||
#applying sampling
|
||||
for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
|
||||
my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
|
||||
|
||||
my $bleu1 = getBleu($data->{refs}, $data->{hyp1}, $subSampleIndices);
|
||||
my $bleu2 = getBleu($data->{refs}, $data->{hyp2}, $subSampleIndices);
|
||||
my $score1 = &$scoreFunc($data->{refs}, $data->{hyp1}, $subSampleIndices);
|
||||
my $score2 = &$scoreFunc($data->{refs}, $data->{hyp2}, $subSampleIndices);
|
||||
|
||||
push @subSampleBleuDiffArr, abs($bleu2 - $bleu1);
|
||||
push @subSampleBleu1Arr, $bleu1;
|
||||
push @subSampleBleu2Arr, $bleu2;
|
||||
|
||||
if ($_ % int($TIMES_TO_REPEAT_SUBSAMPLING / 100) == 0) {
|
||||
print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING " . `date`;
|
||||
push @subSampleDiffArr, abs($score2 - $score1);
|
||||
push @subSample1Arr, $score1;
|
||||
push @subSample2Arr, $score2;
|
||||
}
|
||||
|
||||
return (\@subSampleDiffArr, \@subSample1Arr, \@subSample2Arr);
|
||||
}
|
||||
|
||||
#get subsample bleu difference mean
|
||||
my $averageSubSampleBleuDiff = 0;
|
||||
#####
|
||||
#
|
||||
#####
|
||||
sub bootstrap_pvalue {
|
||||
my $subSampleDiffArr = shift;
|
||||
my $realScore1 = shift;
|
||||
my $realScore2 = shift;
|
||||
|
||||
for my $subSampleDiff (@subSampleBleuDiffArr) {
|
||||
$averageSubSampleBleuDiff += $subSampleDiff;
|
||||
}
|
||||
my $realDiff = abs($realScore2 - $realScore1);
|
||||
|
||||
$averageSubSampleBleuDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
|
||||
#get subsample difference mean
|
||||
my $averageSubSampleDiff = 0;
|
||||
|
||||
print "average subsample bleu: $averageSubSampleBleuDiff " . `date`;
|
||||
for my $subSampleDiff (@$subSampleDiffArr) {
|
||||
$averageSubSampleDiff += $subSampleDiff;
|
||||
}
|
||||
|
||||
#calculating p-value
|
||||
my $count = 0;
|
||||
$averageSubSampleDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
|
||||
|
||||
my $realBleu1 = getBleu($data->{refs}, $data->{hyp1});
|
||||
my $realBleu2 = getBleu($data->{refs}, $data->{hyp2});
|
||||
#calculating p-value
|
||||
my $count = 0;
|
||||
|
||||
print "actual BLEU of hypothesis 1: $realBleu1\n";
|
||||
print "actual BLEU of hypothesis 1: $realBleu2\n";
|
||||
my $realScoreDiff = abs($realScore2 - $realScore1);
|
||||
|
||||
my $realBleuDiff = abs($realBleu2 - $realBleu1);
|
||||
|
||||
for my $subSampleDiff (@subSampleBleuDiffArr) {
|
||||
my $op;
|
||||
|
||||
if ($subSampleDiff - $averageSubSampleBleuDiff >= $realBleuDiff) {
|
||||
for my $subSampleDiff (@$subSampleDiffArr) {
|
||||
if ($subSampleDiff - $averageSubSampleDiff >= $realDiff) {
|
||||
$count++;
|
||||
$op = ">=";
|
||||
}
|
||||
else {
|
||||
$op = "< ";
|
||||
}
|
||||
|
||||
#print "$subSampleDiff - $averageSubSampleBleuDiff $op $realBleuDiff\n";
|
||||
return $count / $TIMES_TO_REPEAT_SUBSAMPLING;
|
||||
}
|
||||
|
||||
my $result = $count / $TIMES_TO_REPEAT_SUBSAMPLING;
|
||||
#####
|
||||
#
|
||||
#####
|
||||
sub bootstrap_interval {
|
||||
my $subSampleArr = shift;
|
||||
|
||||
print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n";
|
||||
print "the probability of actually getting them (p-value) is: $result.\n";
|
||||
my @sorted = sort @$subSampleArr;
|
||||
|
||||
my @sorted1 = sort @subSampleBleu1Arr;
|
||||
my @sorted2 = sort @subSampleBleu2Arr;
|
||||
my $lowerIdx = int($TIMES_TO_REPEAT_SUBSAMPLING / 40);
|
||||
my $higherIdx = $TIMES_TO_REPEAT_SUBSAMPLING - $lowerIdx - 1;
|
||||
|
||||
print "95% confidence interval for hypothesis 1: " . $sorted1[25] . " -- " . $sorted1[924] . "\n";
|
||||
print "95% confidence interval for hypothesis 2: " . $sorted2[25] . " -- " . $sorted2[924] . "\n";
|
||||
my $lower = $sorted[$lowerIdx];
|
||||
my $higher = $sorted[$higherIdx];
|
||||
my $diff = $higher - $lower;
|
||||
|
||||
return ($lower + 0.5 * $diff, 0.5 * $diff);
|
||||
}
|
||||
|
||||
#####
|
||||
# read 2 hyp and 1 to \infty ref data files
|
||||
@ -131,6 +173,7 @@ sub readAllData {
|
||||
|
||||
#reading reference(s) and checking for matching sizes
|
||||
$result{refs} = [];
|
||||
$result{ngramCounts} = { };
|
||||
my $i = 0;
|
||||
|
||||
for my $refFile (@refFiles) {
|
||||
@ -141,12 +184,49 @@ sub readAllData {
|
||||
die ("ERROR: ref set $i size doesn't match the size of hyp sets");
|
||||
}
|
||||
|
||||
updateCounts($result{ngramCounts}, $refDataX);
|
||||
|
||||
push @{$result{refs}}, $refDataX;
|
||||
}
|
||||
|
||||
return \%result;
|
||||
}
|
||||
|
||||
#####
|
||||
#
|
||||
#####
|
||||
sub updateCounts {
|
||||
my ($countHash, $refData) = @_;
|
||||
|
||||
for my $snt(@$refData) {
|
||||
my $size = scalar @{$snt->{words}};
|
||||
$countHash->{""} += $size;
|
||||
|
||||
for my $order(1..$MAX_NGRAMS) {
|
||||
my $ngram;
|
||||
|
||||
for my $i (0..($size-$order)) {
|
||||
$ngram = join(" ", @{$snt->{words}}[$i..($i + $order - 1)]);
|
||||
|
||||
$countHash->{$ngram}++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#####
|
||||
#
|
||||
#####
|
||||
sub ngramInfo {
|
||||
my ($data, $ngram) = @_;
|
||||
|
||||
my @nwords = split(/ /, $ngram);
|
||||
pop @nwords;
|
||||
my $smallGram = join(" ", @nwords);
|
||||
|
||||
return log($data->{ngramCounts}->{$smallGram} / $data->{ngramCounts}->{$ngram}) / log(2.0);
|
||||
}
|
||||
|
||||
#####
|
||||
# read sentences from file
|
||||
#####
|
||||
@ -172,10 +252,21 @@ sub preEvalHypo {
|
||||
my $data = shift;
|
||||
my $hypId = shift;
|
||||
|
||||
for my $lineIdx (0..($data->{size} - 1)) {
|
||||
preEvalHypoSnt($data, $hypId, $lineIdx);
|
||||
}
|
||||
}
|
||||
|
||||
#####
|
||||
#
|
||||
#####
|
||||
sub preEvalHypoSnt {
|
||||
my ($data, $hypId, $lineIdx) = @_;
|
||||
|
||||
my ($correctNgramCounts, $totalNgramCounts);
|
||||
my ($refNgramCounts, $hypNgramCounts);
|
||||
my ($coocNgramInfoSum, $totalNgramAmt);
|
||||
|
||||
for my $lineIdx (0..($data->{size} - 1)) {
|
||||
my $hypSnt = $data->{$hypId}->[$lineIdx];
|
||||
|
||||
#update total hyp len
|
||||
@ -183,12 +274,13 @@ sub preEvalHypo {
|
||||
|
||||
#update total ref len with closest current ref len
|
||||
$hypSnt->{reflen} = getClosestLength($data->{refs}, $lineIdx, $hypSnt->{hyplen});
|
||||
$hypSnt->{avgreflen} = getAvgLength($data->{refs}, $lineIdx);
|
||||
|
||||
$hypSnt->{correctNgrams} = [];
|
||||
$hypSnt->{totalNgrams} = [];
|
||||
|
||||
#update ngram precision for each n-gram order
|
||||
for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
|
||||
for my $order (1..$MAX_NGRAMS) {
|
||||
#hyp ngrams
|
||||
$hypNgramCounts = groupNgrams($hypSnt, $order);
|
||||
|
||||
@ -197,16 +289,27 @@ sub preEvalHypo {
|
||||
|
||||
$correctNgramCounts = 0;
|
||||
$totalNgramCounts = 0;
|
||||
$coocNgramInfoSum = 0;
|
||||
$totalNgramAmt = 0;
|
||||
my $coocUpd;
|
||||
|
||||
#correct, total
|
||||
for my $ngram (keys %$hypNgramCounts) {
|
||||
$correctNgramCounts += min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
|
||||
$coocUpd = min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
|
||||
$correctNgramCounts += $coocUpd;
|
||||
$totalNgramCounts += $hypNgramCounts->{$ngram};
|
||||
|
||||
if ($coocUpd > 0) {
|
||||
$coocNgramInfoSum += ngramInfo($data, $ngram);
|
||||
}
|
||||
|
||||
$totalNgramAmt++;
|
||||
}
|
||||
|
||||
$hypSnt->{correctNgrams}->[$order] = $correctNgramCounts;
|
||||
$hypSnt->{totalNgrams}->[$order] = $totalNgramCounts;
|
||||
}
|
||||
$hypSnt->{ngramNistInfoSum}->[$order] = $coocNgramInfoSum;
|
||||
$hypSnt->{ngramNistCount}->[$order] = $totalNgramAmt;
|
||||
}
|
||||
}
|
||||
|
||||
@ -225,6 +328,56 @@ sub drawWithReplacement {
|
||||
return \@result;
|
||||
}
|
||||
|
||||
#####
|
||||
#
|
||||
#####
|
||||
sub getNist {
|
||||
my ($refs, $hyp, $idxs) = @_;
|
||||
|
||||
#default value for $idxs
|
||||
unless (defined($idxs)) {
|
||||
$idxs = [0..((scalar @$hyp) - 1)];
|
||||
}
|
||||
|
||||
#vars
|
||||
my ($hypothesisLength, $referenceLength) = (0, 0);
|
||||
my (@infosum, @totalamt);
|
||||
|
||||
#gather info from each line
|
||||
for my $lineIdx (@$idxs) {
|
||||
|
||||
my $hypSnt = $hyp->[$lineIdx];
|
||||
|
||||
#update total hyp len
|
||||
$hypothesisLength += $hypSnt->{hyplen};
|
||||
|
||||
#update total ref len with closest current ref len
|
||||
$referenceLength += $hypSnt->{avgreflen};
|
||||
|
||||
#update ngram precision for each n-gram order
|
||||
for my $order (1..$MAX_NGRAMS) {
|
||||
$infosum[$order] += $hypSnt->{ngramNistInfoSum}->[$order];
|
||||
$totalamt[$order] += $hypSnt->{ngramNistCount}->[$order];
|
||||
}
|
||||
}
|
||||
|
||||
my $toplog = log($hypothesisLength / $referenceLength);
|
||||
my $btmlog = log(2.0/3.0);
|
||||
|
||||
#compose nist score
|
||||
my $brevityPenalty = ($hypothesisLength > $referenceLength)? 1.0: exp(log(0.5) * $toplog * $toplog / ($btmlog * $btmlog));
|
||||
|
||||
my $sum = 0;
|
||||
|
||||
for my $order (1..$MAX_NGRAMS) {
|
||||
$sum += $infosum[$order]/$totalamt[$order];
|
||||
}
|
||||
|
||||
my $result = $sum * $brevityPenalty;
|
||||
|
||||
return $result;
|
||||
}
|
||||
|
||||
#####
|
||||
# refs: arrayref of different references, reference = array of lines, line = array of words, word = string
|
||||
# hyp: arrayref of lines of hypothesis translation, line = array of words, word = string
|
||||
@ -254,7 +407,7 @@ sub getBleu {
|
||||
$referenceLength += $hypSnt->{reflen};
|
||||
|
||||
#update ngram precision for each n-gram order
|
||||
for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
|
||||
for my $order (1..$MAX_NGRAMS) {
|
||||
$correctNgramCounts[$order] += $hypSnt->{correctNgrams}->[$order];
|
||||
$totalNgramCounts[$order] += $hypSnt->{totalNgrams}->[$order];
|
||||
}
|
||||
@ -265,11 +418,28 @@ sub getBleu {
|
||||
|
||||
my $logsum = 0;
|
||||
|
||||
for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
|
||||
for my $order (1..$MAX_NGRAMS) {
|
||||
$logsum += safeLog($correctNgramCounts[$order] / $totalNgramCounts[$order]);
|
||||
}
|
||||
|
||||
return $brevityPenalty * exp($logsum / $MAX_NGRAMS_FOR_BLEU);
|
||||
return $brevityPenalty * exp($logsum / $MAX_NGRAMS);
|
||||
}
|
||||
|
||||
#####
|
||||
#
|
||||
#####
|
||||
sub getAvgLength {
|
||||
my ($refs, $lineIdx) = @_;
|
||||
|
||||
my $result = 0;
|
||||
my $count = 0;
|
||||
|
||||
for my $ref (@$refs) {
|
||||
$result += scalar @{$ref->[$lineIdx]->{words}};
|
||||
$count++;
|
||||
}
|
||||
|
||||
return $result / $count;
|
||||
}
|
||||
|
||||
#####
|
||||
@ -366,3 +536,9 @@ sub max {
|
||||
|
||||
return ($a > $b)? $a: $b;
|
||||
}
|
||||
|
||||
sub poww {
|
||||
my ($a, $b) = @_;
|
||||
|
||||
return exp($b * log($a));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user