Altered the algorithm of the bootstrap significance script according to (Riezler and Maxwell 2005 @ MTSE'05); it now computes a p-value

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1914 1f5c12ca-751b-0410-a591-d2e778427230
2024-09-20 15:48:05 +03:00 · 2008-10-23 09:03:41 +00:00 · 2008-10-23 09:03:41 +00:00 · 88d3b775ce
commit 88d3b775ce
parent a09242ad16
1 changed files with 41 additions and 26 deletions
--- a/scripts/generic/compare-hypotheses-with-significance.pl
+++ b/scripts/generic/compare-hypotheses-with-significance.pl
@ -2,21 +2,22 @@

 ###############################################
 # An implementation of paired bootstrap resampling for testing the statistical
-# significance of the difference between two systems from Koehn 2004 @ EMNLP
+# significance of the difference between two systems from (Koehn 2004 @ EMNLP)
 #
 # Usage: ./compare-hypotheses-with-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ]
 #
 # Author: Mark Fishel, fishel@ut.ee
+# 
+# 22.10: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
 ###############################################

 use strict;

 #constants
 my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
-my $SUBSAMPLE_SIZE = 500; # if 0 then subsample size is equal to the whole set
+my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
 my $TMP_PREFIX = "/tmp/signigicance_test_file_";
 my $MAX_NGRAMS_FOR_BLEU = 4;
-my $DEBUG = 0;

 #checking cmdline argument consistency
 if (@ARGV < 3) {
@ -29,14 +30,15 @@ if (@ARGV < 3) {
 	exit 1;
 }

-print "reading data; " . `date` if ($DEBUG);
+print "reading data; " . `date`;

 #read all data
 my $data = readAllData(@ARGV);

-my $secondWin = 0;
+#start comparing
+print "comparing hypotheses; " . `date`;

-print "comparing hypotheses; " . `date` if ($DEBUG);
+my @subSampleBleuDiffArr;

 #applying sampling
 for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
@ -45,35 +47,48 @@ for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
 	my $bleu1 = getBleu($data->{refs}, $data->{hyp1}, $subSampleIndices);
 	my $bleu2 = getBleu($data->{refs}, $data->{hyp2}, $subSampleIndices);
 	
-	my $op;
-	
-	if ($bleu1 < $bleu2) {
-		$secondWin++;
-		$op = "worse than";
-	}
-	elsif ($bleu1 == $bleu2) {
-		$op = "equally good as";
-	}
-	else {
-		$op = "better than";
-	}
-	
-	print "$_: 1st ($bleu1) $op 2nd ($bleu2)\n" if ($DEBUG);
+	push @subSampleBleuDiffArr, abs($bleu2 - $bleu1);
 	
 	if ($_ % int($TIMES_TO_REPEAT_SUBSAMPLING / 100) == 0) {
-		print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING\n";
+		print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING " . `date`;
 	}
 }

-my $result = $secondWin / $TIMES_TO_REPEAT_SUBSAMPLING;
+#get subsample bleu difference mean
+my $averageSubSampleBleuDiff = 0;

-if ($result >= 0.5) {
-	print "The second system is better than the first one with " . (int(10000 * $result) / 100) . "% confidence; " . `date`;
+for my $subSampleDiff (@subSampleBleuDiffArr) {
+	$averageSubSampleBleuDiff += $subSampleDiff;
 }
-else {
-	print "The first system is better than the second one with " . (int(10000 * $result) / 100) . "% confidence; " . `date`;
+
+$averageSubSampleBleuDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
+
+print "average subsample bleu: $averageSubSampleBleuDiff " . `date`;
+
+#calculating p-value
+my $count = 0;
+
+my $realBleuDiff = abs(getBleu($data->{refs}, $data->{hyp2}) - getBleu($data->{refs}, $data->{hyp1}));
+
+for my $subSampleDiff (@subSampleBleuDiffArr) {
+#	my $op;
+	
+	if ($subSampleDiff - $averageSubSampleBleuDiff >= $realBleuDiff) {
+		$count++;
+#		$op = ">=";
+	}
+	else {
+#		$op = "< ";
+	}
+	
+#	print "$subSampleDiff - $averageSubSampleBleuDiff $op $realBleuDiff\n";
 }

+my $result = ($count + 1) / $TIMES_TO_REPEAT_SUBSAMPLING;
+
+print "The null-hypothesis is that the second system is equivalent with the first one;\n";
+print "The p-value for that is $result\n";
+
 #####
 # read 2 hyp and 1 to \infty ref data files
 #####