mosesdecoder/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
mphi 1f6e9b488b the script now calculates the p-value and confidence intervals not only using BLEU, but also the NIST score;
improved confidence interval representation (avg+-stddev);

fixed bugs



git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3345 1f5c12ca-751b-0410-a591-d2e778427230
2010-06-22 20:17:42 +00:00

545 lines
12 KiB
Perl
Executable File

#!/usr/bin/perl
###############################################
# An implementation of paired bootstrap resampling for testing the statistical
# significance of the difference between two systems from (Koehn 2004 @ EMNLP)
#
# Usage: ./compare-hypotheses-with-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ]
#
# Author: Mark Fishel, fishel@ut.ee
#
# 22.10.2008: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
#
# 23.01.2010: added NIST p-value and interval computation
###############################################
use strict;
#constants
my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
my $MAX_NGRAMS = 4;
#checking cmdline argument consistency
if (@ARGV < 3) {
print STDERR "Usage: ./bootstrap-hypothesis-difference-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ]\n";
unless ($ARGV[0] =~ /^(--help|-help|-h|-\?|\/\?|--usage|-usage)$/) {
die("\nERROR: not enough arguments");
}
exit 1;
}
print "reading data; " . `date`;
#read all data
my $data = readAllData(@ARGV);
my $verbose = $ARGV[3];
#calculate each sentence's contribution to BP and ngram precision
print "performing preliminary calculations (hypothesis 1); " . `date`;
preEvalHypo($data, "hyp1");
print "performing preliminary calculations (hypothesis 2); " . `date`;
preEvalHypo($data, "hyp2");
#start comparing
print "comparing hypotheses -- this may take some time; " . `date`;
bootstrap_report("BLEU", \&getBleu);
bootstrap_report("NIST", \&getNist);
#####
#
#####
sub bootstrap_report {
my $title = shift;
my $proc = shift;
my ($subSampleScoreDiffArr, $subSampleScore1Arr, $subSampleScore2Arr) = bootstrap_pass($proc);
my $realScore1 = &$proc($data->{refs}, $data->{hyp1});
my $realScore2 = &$proc($data->{refs}, $data->{hyp2});
my $scorePValue = bootstrap_pvalue($subSampleScoreDiffArr, $realScore1, $realScore2);
my ($scoreAvg1, $scoreVar1) = bootstrap_interval($subSampleScore1Arr);
my ($scoreAvg2, $scoreVar2) = bootstrap_interval($subSampleScore2Arr);
print "\n---=== $title score ===---\n";
print "actual score of hypothesis 1: $realScore1\n";
print "95% confidence interval for hypothesis 1 score: $scoreAvg1 +- $scoreVar1\n-----\n";
print "actual score of hypothesis 1: $realScore2\n";
print "95% confidence interval for hypothesis 2 score: $scoreAvg2 +- $scoreVar2\n-----\n";
print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n";
print "the probability of actually getting them (p-value) is: $scorePValue.\n";
}
#####
#
#####
sub bootstrap_pass {
my $scoreFunc = shift;
my @subSampleDiffArr;
my @subSample1Arr;
my @subSample2Arr;
#applying sampling
for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
my $score1 = &$scoreFunc($data->{refs}, $data->{hyp1}, $subSampleIndices);
my $score2 = &$scoreFunc($data->{refs}, $data->{hyp2}, $subSampleIndices);
push @subSampleDiffArr, abs($score2 - $score1);
push @subSample1Arr, $score1;
push @subSample2Arr, $score2;
}
return (\@subSampleDiffArr, \@subSample1Arr, \@subSample2Arr);
}
#####
#
#####
sub bootstrap_pvalue {
my $subSampleDiffArr = shift;
my $realScore1 = shift;
my $realScore2 = shift;
my $realDiff = abs($realScore2 - $realScore1);
#get subsample difference mean
my $averageSubSampleDiff = 0;
for my $subSampleDiff (@$subSampleDiffArr) {
$averageSubSampleDiff += $subSampleDiff;
}
$averageSubSampleDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
#calculating p-value
my $count = 0;
my $realScoreDiff = abs($realScore2 - $realScore1);
for my $subSampleDiff (@$subSampleDiffArr) {
if ($subSampleDiff - $averageSubSampleDiff >= $realDiff) {
$count++;
}
}
return $count / $TIMES_TO_REPEAT_SUBSAMPLING;
}
#####
#
#####
sub bootstrap_interval {
my $subSampleArr = shift;
my @sorted = sort @$subSampleArr;
my $lowerIdx = int($TIMES_TO_REPEAT_SUBSAMPLING / 40);
my $higherIdx = $TIMES_TO_REPEAT_SUBSAMPLING - $lowerIdx - 1;
my $lower = $sorted[$lowerIdx];
my $higher = $sorted[$higherIdx];
my $diff = $higher - $lower;
return ($lower + 0.5 * $diff, 0.5 * $diff);
}
#####
# read 2 hyp and 1 to \infty ref data files
#####
sub readAllData {
my ($hypFile1, $hypFile2, @refFiles) = @_;
my %result;
#reading hypotheses and checking for matching sizes
$result{hyp1} = readData($hypFile1);
$result{size} = scalar @{$result{hyp1}};
$result{hyp2} = readData($hypFile2);
unless (scalar @{$result{hyp2}} == $result{size}) {
die ("ERROR: sizes of hypothesis sets 1 and 2 don't match");
}
#reading reference(s) and checking for matching sizes
$result{refs} = [];
$result{ngramCounts} = { };
my $i = 0;
for my $refFile (@refFiles) {
$i++;
my $refDataX = readData($refFile);
unless (scalar @$refDataX == $result{size}) {
die ("ERROR: ref set $i size doesn't match the size of hyp sets");
}
updateCounts($result{ngramCounts}, $refDataX);
push @{$result{refs}}, $refDataX;
}
return \%result;
}
#####
#
#####
sub updateCounts {
my ($countHash, $refData) = @_;
for my $snt(@$refData) {
my $size = scalar @{$snt->{words}};
$countHash->{""} += $size;
for my $order(1..$MAX_NGRAMS) {
my $ngram;
for my $i (0..($size-$order)) {
$ngram = join(" ", @{$snt->{words}}[$i..($i + $order - 1)]);
$countHash->{$ngram}++;
}
}
}
}
#####
#
#####
sub ngramInfo {
my ($data, $ngram) = @_;
my @nwords = split(/ /, $ngram);
pop @nwords;
my $smallGram = join(" ", @nwords);
return log($data->{ngramCounts}->{$smallGram} / $data->{ngramCounts}->{$ngram}) / log(2.0);
}
#####
# read sentences from file
#####
sub readData {
my $file = shift;
my @result;
open (FILE, $file) or die ("Failed to open `$file' for reading");
while (<FILE>) {
push @result, { words => [split(/\s+/, $_)] };
}
close (FILE);
return \@result;
}
#####
# calculate each sentence's contribution to the ngram precision and brevity penalty
#####
sub preEvalHypo {
my $data = shift;
my $hypId = shift;
for my $lineIdx (0..($data->{size} - 1)) {
preEvalHypoSnt($data, $hypId, $lineIdx);
}
}
#####
#
#####
sub preEvalHypoSnt {
my ($data, $hypId, $lineIdx) = @_;
my ($correctNgramCounts, $totalNgramCounts);
my ($refNgramCounts, $hypNgramCounts);
my ($coocNgramInfoSum, $totalNgramAmt);
my $hypSnt = $data->{$hypId}->[$lineIdx];
#update total hyp len
$hypSnt->{hyplen} = scalar @{$hypSnt->{words}};
#update total ref len with closest current ref len
$hypSnt->{reflen} = getClosestLength($data->{refs}, $lineIdx, $hypSnt->{hyplen});
$hypSnt->{avgreflen} = getAvgLength($data->{refs}, $lineIdx);
$hypSnt->{correctNgrams} = [];
$hypSnt->{totalNgrams} = [];
#update ngram precision for each n-gram order
for my $order (1..$MAX_NGRAMS) {
#hyp ngrams
$hypNgramCounts = groupNgrams($hypSnt, $order);
#ref ngrams
$refNgramCounts = groupNgramsMultiSrc($data->{refs}, $lineIdx, $order);
$correctNgramCounts = 0;
$totalNgramCounts = 0;
$coocNgramInfoSum = 0;
$totalNgramAmt = 0;
my $coocUpd;
#correct, total
for my $ngram (keys %$hypNgramCounts) {
$coocUpd = min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
$correctNgramCounts += $coocUpd;
$totalNgramCounts += $hypNgramCounts->{$ngram};
if ($coocUpd > 0) {
$coocNgramInfoSum += ngramInfo($data, $ngram);
}
$totalNgramAmt++;
}
$hypSnt->{correctNgrams}->[$order] = $correctNgramCounts;
$hypSnt->{totalNgrams}->[$order] = $totalNgramCounts;
$hypSnt->{ngramNistInfoSum}->[$order] = $coocNgramInfoSum;
$hypSnt->{ngramNistCount}->[$order] = $totalNgramAmt;
}
}
#####
# draw a subsample of size $subSize from set (0..$setSize) with replacement
#####
sub drawWithReplacement {
my ($setSize, $subSize) = @_;
my @result;
for (1..$subSize) {
push @result, int(rand($setSize));
}
return \@result;
}
#####
#
#####
sub getNist {
my ($refs, $hyp, $idxs) = @_;
#default value for $idxs
unless (defined($idxs)) {
$idxs = [0..((scalar @$hyp) - 1)];
}
#vars
my ($hypothesisLength, $referenceLength) = (0, 0);
my (@infosum, @totalamt);
#gather info from each line
for my $lineIdx (@$idxs) {
my $hypSnt = $hyp->[$lineIdx];
#update total hyp len
$hypothesisLength += $hypSnt->{hyplen};
#update total ref len with closest current ref len
$referenceLength += $hypSnt->{avgreflen};
#update ngram precision for each n-gram order
for my $order (1..$MAX_NGRAMS) {
$infosum[$order] += $hypSnt->{ngramNistInfoSum}->[$order];
$totalamt[$order] += $hypSnt->{ngramNistCount}->[$order];
}
}
my $toplog = log($hypothesisLength / $referenceLength);
my $btmlog = log(2.0/3.0);
#compose nist score
my $brevityPenalty = ($hypothesisLength > $referenceLength)? 1.0: exp(log(0.5) * $toplog * $toplog / ($btmlog * $btmlog));
my $sum = 0;
for my $order (1..$MAX_NGRAMS) {
$sum += $infosum[$order]/$totalamt[$order];
}
my $result = $sum * $brevityPenalty;
return $result;
}
#####
# refs: arrayref of different references, reference = array of lines, line = array of words, word = string
# hyp: arrayref of lines of hypothesis translation, line = array of words, word = string
# idxs: indices of lines to include; default value - full set (0..size_of_hyp-1)
#####
sub getBleu {
my ($refs, $hyp, $idxs) = @_;
#default value for $idxs
unless (defined($idxs)) {
$idxs = [0..((scalar @$hyp) - 1)];
}
#vars
my ($hypothesisLength, $referenceLength) = (0, 0);
my (@correctNgramCounts, @totalNgramCounts);
my ($refNgramCounts, $hypNgramCounts);
#gather info from each line
for my $lineIdx (@$idxs) {
my $hypSnt = $hyp->[$lineIdx];
#update total hyp len
$hypothesisLength += $hypSnt->{hyplen};
#update total ref len with closest current ref len
$referenceLength += $hypSnt->{reflen};
#update ngram precision for each n-gram order
for my $order (1..$MAX_NGRAMS) {
$correctNgramCounts[$order] += $hypSnt->{correctNgrams}->[$order];
$totalNgramCounts[$order] += $hypSnt->{totalNgrams}->[$order];
}
}
#compose bleu score
my $brevityPenalty = ($hypothesisLength < $referenceLength)? exp(1 - $referenceLength/$hypothesisLength): 1;
my $logsum = 0;
for my $order (1..$MAX_NGRAMS) {
$logsum += safeLog($correctNgramCounts[$order] / $totalNgramCounts[$order]);
}
return $brevityPenalty * exp($logsum / $MAX_NGRAMS);
}
#####
#
#####
sub getAvgLength {
my ($refs, $lineIdx) = @_;
my $result = 0;
my $count = 0;
for my $ref (@$refs) {
$result += scalar @{$ref->[$lineIdx]->{words}};
$count++;
}
return $result / $count;
}
#####
#
#####
sub getClosestLength {
my ($refs, $lineIdx, $hypothesisLength) = @_;
my $bestDiff = infty();
my $bestLen = infty();
my ($currLen, $currDiff);
for my $ref (@$refs) {
$currLen = scalar @{$ref->[$lineIdx]->{words}};
$currDiff = abs($currLen - $hypothesisLength);
if ($currDiff < $bestDiff or ($currDiff == $bestDiff and $currLen < $bestLen)) {
$bestDiff = $currDiff;
$bestLen = $currLen;
}
}
return $bestLen;
}
#####
#
#####
sub groupNgrams {
my ($snt, $order) = @_;
my %result;
my $size = scalar @{$snt->{words}};
my $ngram;
for my $i (0..($size-$order)) {
$ngram = join(" ", @{$snt->{words}}[$i..($i + $order - 1)]);
$result{$ngram}++;
}
return \%result;
}
#####
#
#####
sub groupNgramsMultiSrc {
my ($refs, $lineIdx, $order) = @_;
my %result;
for my $ref (@$refs) {
my $currNgramCounts = groupNgrams($ref->[$lineIdx], $order);
for my $currNgram (keys %$currNgramCounts) {
$result{$currNgram} = max($result{$currNgram}, $currNgramCounts->{$currNgram});
}
}
return \%result;
}
#####
#
#####
sub safeLog {
my $x = shift;
return ($x > 0)? log($x): -infty();
}
#####
#
#####
sub infty {
return 1e6000;
}
#####
#
#####
sub min {
my ($a, $b) = @_;
return ($a < $b)? $a: $b;
}
#####
#
#####
sub max {
my ($a, $b) = @_;
return ($a > $b)? $a: $b;
}
sub poww {
my ($a, $b) = @_;
return exp($b * log($a));
}