Altered the algorithm of the bootstrap significance script to follow Riezler and Maxwell (2005, MTSE'05)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1914 1f5c12ca-751b-0410-a591-d2e778427230
2024-09-20 15:48:05 +03:00 · 2008-10-23 09:03:41 +00:00 · 2008-10-23 09:03:41 +00:00 · 88d3b775ce
commit 88d3b775ce
parent a09242ad16
1 changed files with 41 additions and 26 deletions
--- a/scripts/generic/compare-hypotheses-with-significance.pl
+++ b/scripts/generic/compare-hypotheses-with-significance.pl
@@ -2,21 +2,22 @@
 ###############################################
 # An implementation of paired bootstrap resampling for testing the statistical
-# significance of the difference between two systems from Koehn 2004 @ EMNLP
+# significance of the difference between two systems from (Koehn 2004 @ EMNLP)
 #
 # Usage: ./compare-hypotheses-with-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ]
 #
 # Author: Mark Fishel, fishel@ut.ee
 # 
 # 22.10: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
 ###############################################
 use strict;
 #constants
 my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
-my $SUBSAMPLE_SIZE = 500; # if 0 then subsample size is equal to the whole set
+my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
 my $TMP_PREFIX = "/tmp/signigicance_test_file_";
 my $MAX_NGRAMS_FOR_BLEU = 4;
 my $DEBUG = 0;
 #checking cmdline argument consistency
 if (@ARGV < 3) {
@@ -29,14 +30,15 @@ if (@ARGV < 3) {
 	exit 1;
 }
-print "reading data; " . `date` if ($DEBUG);
+print "reading data; " . `date`;
 #read all data
 my $data = readAllData(@ARGV);
-my $secondWin = 0;
+#start comparing
 print "comparing hypotheses; " . `date`;
-print "comparing hypotheses; " . `date` if ($DEBUG);
+my @subSampleBleuDiffArr;
 #applying sampling
 for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
@@ -45,35 +47,48 @@ for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
 	my $bleu1 = getBleu($data->{refs}, $data->{hyp1}, $subSampleIndices);
 	my $bleu2 = getBleu($data->{refs}, $data->{hyp2}, $subSampleIndices);
-	my $op;
+	push @subSampleBleuDiffArr, abs($bleu2 - $bleu1);
 	if ($bleu1 < $bleu2) {
 		$secondWin++;
 		$op = "worse than";
 	}
 	elsif ($bleu1 == $bleu2) {
 		$op = "equally good as";
 	}
 	else {
 		$op = "better than";
 	}
 	print "$_: 1st ($bleu1) $op 2nd ($bleu2)\n" if ($DEBUG);
 	if ($_ % int($TIMES_TO_REPEAT_SUBSAMPLING / 100) == 0) {
-		print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING\n";
+		print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING " . `date`;
 	}
 }
-my $result = $secondWin / $TIMES_TO_REPEAT_SUBSAMPLING;
+#get subsample bleu difference mean
 my $averageSubSampleBleuDiff = 0;
-if ($result >= 0.5) {
+for my $subSampleDiff (@subSampleBleuDiffArr) {
-	print "The second system is better than the first one with " . (int(10000 * $result) / 100) . "% confidence; " . `date`;
+	$averageSubSampleBleuDiff += $subSampleDiff;
 }
-else {
+
-	print "The first system is better than the second one with " . (int(10000 * $result) / 100) . "% confidence; " . `date`;
+$averageSubSampleBleuDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
 print "average subsample bleu: $averageSubSampleBleuDiff " . `date`;
 #calculating p-value
 my $count = 0;
 my $realBleuDiff = abs(getBleu($data->{refs}, $data->{hyp2}) - getBleu($data->{refs}, $data->{hyp1}));
 for my $subSampleDiff (@subSampleBleuDiffArr) {
 #	my $op;
 	if ($subSampleDiff - $averageSubSampleBleuDiff >= $realBleuDiff) {
 		$count++;
 #		$op = ">=";
 	}
 	else {
 #		$op = "< ";
 	}
 #	print "$subSampleDiff - $averageSubSampleBleuDiff $op $realBleuDiff\n";
 }
 my $result = ($count + 1) / $TIMES_TO_REPEAT_SUBSAMPLING;
 print "The null-hypothesis is that the second system is equivalent with the first one;\n";
 print "The p-value for that is $result\n";
 #####
 # read 2 hyp and 1 to \infty ref data files
 #####