altered the bootstrap significance script algorithm according to (Riezler and Maxwell 2005 @ MTSE'05)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1914 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
mphi 2008-10-23 09:03:41 +00:00
parent a09242ad16
commit 88d3b775ce

View File

@ -2,21 +2,22 @@
############################################### ###############################################
# An implementation of paired bootstrap resampling for testing the statistical # An implementation of paired bootstrap resampling for testing the statistical
# significance of the difference between two systems from Koehn 2004 @ EMNLP # significance of the difference between two systems from (Koehn 2004 @ EMNLP)
# #
# Usage: ./compare-hypotheses-with-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ] # Usage: ./compare-hypotheses-with-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ]
# #
# Author: Mark Fishel, fishel@ut.ee # Author: Mark Fishel, fishel@ut.ee
#
# 22.10: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
############################################### ###############################################
use strict; use strict;
#constants #constants
my $TIMES_TO_REPEAT_SUBSAMPLING = 1000; my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
my $SUBSAMPLE_SIZE = 500; # if 0 then subsample size is equal to the whole set my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
my $TMP_PREFIX = "/tmp/signigicance_test_file_"; my $TMP_PREFIX = "/tmp/signigicance_test_file_";
my $MAX_NGRAMS_FOR_BLEU = 4; my $MAX_NGRAMS_FOR_BLEU = 4;
my $DEBUG = 0;
#checking cmdline argument consistency #checking cmdline argument consistency
if (@ARGV < 3) { if (@ARGV < 3) {
@ -29,14 +30,15 @@ if (@ARGV < 3) {
exit 1; exit 1;
} }
print "reading data; " . `date` if ($DEBUG); print "reading data; " . `date`;
#read all data #read all data
my $data = readAllData(@ARGV); my $data = readAllData(@ARGV);
my $secondWin = 0; #start comparing
print "comparing hypotheses; " . `date`;
print "comparing hypotheses; " . `date` if ($DEBUG); my @subSampleBleuDiffArr;
#applying sampling #applying sampling
for (1..$TIMES_TO_REPEAT_SUBSAMPLING) { for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
@ -45,35 +47,48 @@ for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
my $bleu1 = getBleu($data->{refs}, $data->{hyp1}, $subSampleIndices); my $bleu1 = getBleu($data->{refs}, $data->{hyp1}, $subSampleIndices);
my $bleu2 = getBleu($data->{refs}, $data->{hyp2}, $subSampleIndices); my $bleu2 = getBleu($data->{refs}, $data->{hyp2}, $subSampleIndices);
my $op; push @subSampleBleuDiffArr, abs($bleu2 - $bleu1);
if ($bleu1 < $bleu2) {
$secondWin++;
$op = "worse than";
}
elsif ($bleu1 == $bleu2) {
$op = "equally good as";
}
else {
$op = "better than";
}
print "$_: 1st ($bleu1) $op 2nd ($bleu2)\n" if ($DEBUG);
if ($_ % int($TIMES_TO_REPEAT_SUBSAMPLING / 100) == 0) { if ($_ % int($TIMES_TO_REPEAT_SUBSAMPLING / 100) == 0) {
print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING\n"; print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING " . `date`;
} }
} }
my $result = $secondWin / $TIMES_TO_REPEAT_SUBSAMPLING; #get subsample bleu difference mean
my $averageSubSampleBleuDiff = 0;
if ($result >= 0.5) { for my $subSampleDiff (@subSampleBleuDiffArr) {
print "The second system is better than the first one with " . (int(10000 * $result) / 100) . "% confidence; " . `date`; $averageSubSampleBleuDiff += $subSampleDiff;
} }
else {
print "The first system is better than the second one with " . (int(10000 * $result) / 100) . "% confidence; " . `date`; $averageSubSampleBleuDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
print "average subsample bleu: $averageSubSampleBleuDiff " . `date`;
#calculating p-value
my $count = 0;
my $realBleuDiff = abs(getBleu($data->{refs}, $data->{hyp2}) - getBleu($data->{refs}, $data->{hyp1}));
for my $subSampleDiff (@subSampleBleuDiffArr) {
# my $op;
if ($subSampleDiff - $averageSubSampleBleuDiff >= $realBleuDiff) {
$count++;
# $op = ">=";
}
else {
# $op = "< ";
}
# print "$subSampleDiff - $averageSubSampleBleuDiff $op $realBleuDiff\n";
} }
my $result = ($count + 1) / $TIMES_TO_REPEAT_SUBSAMPLING;
print "The null-hypothesis is that the second system is equivalent with the first one;\n";
print "The p-value for that is $result\n";
##### #####
# read 2 hyp and 1 to \infty ref data files # read 2 hyp and 1 to \infty ref data files
##### #####