the script now calculates the p-value and confidence intervals not only using BLEU, but also the NIST score;

improved confidence interval representation (avg+-stddev); fixed bugs git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3345 1f5c12ca-751b-0410-a591-d2e778427230
2024-12-30 23:42:30 +03:00 · 2010-06-22 20:17:42 +00:00 · 2010-06-22 20:17:42 +00:00 · 1f6e9b488b
commit 1f6e9b488b
parent a21c9bff68
1 changed files with 261 additions and 85 deletions
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@ -8,7 +8,9 @@
 #
 # Author: Mark Fishel, fishel@ut.ee
 # 
-# 22.10: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
+# 22.10.2008: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
+#
+# 23.01.2010: added NIST p-value and interval computation
 ###############################################

 use strict;
@ -16,7 +18,7 @@ use strict;
 #constants
 my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
 my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
-my $MAX_NGRAMS_FOR_BLEU = 4;
+my $MAX_NGRAMS = 4;

 #checking cmdline argument consistency
 if (@ARGV < 3) {
@ -34,6 +36,7 @@ print "reading data; " . `date`;
 #read all data
 my $data = readAllData(@ARGV);

+my $verbose = $ARGV[3];

 #calculate each sentence's contribution to BP and ngram precision
 print "performing preliminary calculations (hypothesis 1); " . `date`;
@ -43,74 +46,113 @@ print "performing preliminary calculations (hypothesis 2); " . `date`;
 preEvalHypo($data, "hyp2");

 #start comparing
-print "comparing hypotheses; " . `date`;
+print "comparing hypotheses -- this may take some time; " . `date`;

-my @subSampleBleuDiffArr;
-my @subSampleBleu1Arr;
-my @subSampleBleu2Arr;
+bootstrap_report("BLEU", \&getBleu);
+bootstrap_report("NIST", \&getNist);

-#applying sampling
-for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
-	my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
+#####
+#
+#####
+sub bootstrap_report {
+	my $title = shift;
+	my $proc = shift;
 	
-	my $bleu1 = getBleu($data->{refs}, $data->{hyp1}, $subSampleIndices);
-	my $bleu2 = getBleu($data->{refs}, $data->{hyp2}, $subSampleIndices);
-	
-	push @subSampleBleuDiffArr, abs($bleu2 - $bleu1);
-	push @subSampleBleu1Arr, $bleu1;
-	push @subSampleBleu2Arr, $bleu2;
-	
-	if ($_ % int($TIMES_TO_REPEAT_SUBSAMPLING / 100) == 0) {
-		print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING " . `date`;
-	}
+	my ($subSampleScoreDiffArr, $subSampleScore1Arr, $subSampleScore2Arr) = bootstrap_pass($proc);
+
+	my $realScore1 = &$proc($data->{refs}, $data->{hyp1});
+	my $realScore2 = &$proc($data->{refs}, $data->{hyp2});
+
+	my $scorePValue = bootstrap_pvalue($subSampleScoreDiffArr, $realScore1, $realScore2);
+
+	my ($scoreAvg1, $scoreVar1) = bootstrap_interval($subSampleScore1Arr);
+	my ($scoreAvg2, $scoreVar2) = bootstrap_interval($subSampleScore2Arr);
+
+	print "\n---=== $title score ===---\n";
+
+	print "actual score of hypothesis 1: $realScore1\n";
+	print "95% confidence interval for hypothesis 1 score: $scoreAvg1 +- $scoreVar1\n-----\n";
+	print "actual score of hypothesis 1: $realScore2\n";
+	print "95% confidence interval for hypothesis 2 score: $scoreAvg2 +- $scoreVar2\n-----\n";
+	print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n";
+	print "the probability of actually getting them (p-value) is: $scorePValue.\n";
 }

-#get subsample bleu difference mean
-my $averageSubSampleBleuDiff = 0;
-
-for my $subSampleDiff (@subSampleBleuDiffArr) {
-	$averageSubSampleBleuDiff += $subSampleDiff;
-}
-
-$averageSubSampleBleuDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
-
-print "average subsample bleu: $averageSubSampleBleuDiff " . `date`;
-
-#calculating p-value
-my $count = 0;
-
-my $realBleu1 = getBleu($data->{refs}, $data->{hyp1});
-my $realBleu2 = getBleu($data->{refs}, $data->{hyp2});
-
-print "actual BLEU of hypothesis 1: $realBleu1\n";
-print "actual BLEU of hypothesis 1: $realBleu2\n";
-
-my $realBleuDiff = abs($realBleu2 - $realBleu1);
-
-for my $subSampleDiff (@subSampleBleuDiffArr) {
-	my $op;
+#####
+#
+#####
+sub bootstrap_pass {
+	my $scoreFunc = shift;
 	
-	if ($subSampleDiff - $averageSubSampleBleuDiff >= $realBleuDiff) {
-		$count++;
-		$op = ">=";
-	}
-	else {
-		$op = "< ";
+	my @subSampleDiffArr;
+	my @subSample1Arr;
+	my @subSample2Arr;
+
+	#applying sampling
+	for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
+		my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
+		
+		my $score1 = &$scoreFunc($data->{refs}, $data->{hyp1}, $subSampleIndices);
+		my $score2 = &$scoreFunc($data->{refs}, $data->{hyp2}, $subSampleIndices);
+		
+		push @subSampleDiffArr, abs($score2 - $score1);
+		push @subSample1Arr, $score1;
+		push @subSample2Arr, $score2;
 	}
 	
-	#print "$subSampleDiff - $averageSubSampleBleuDiff $op $realBleuDiff\n";
+	return (\@subSampleDiffArr, \@subSample1Arr, \@subSample2Arr);
 }

-my $result = $count / $TIMES_TO_REPEAT_SUBSAMPLING;
+#####
+#
+#####
+sub bootstrap_pvalue {
+	my $subSampleDiffArr = shift;
+	my $realScore1 = shift;
+	my $realScore2 = shift;
+	
+	my $realDiff = abs($realScore2 - $realScore1);
+	
+	#get subsample difference mean
+	my $averageSubSampleDiff = 0;

-print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n";
-print "the probability of actually getting them (p-value) is: $result.\n";
+	for my $subSampleDiff (@$subSampleDiffArr) {
+		$averageSubSampleDiff += $subSampleDiff;
+	}

-my @sorted1 = sort @subSampleBleu1Arr;
-my @sorted2 = sort @subSampleBleu2Arr;
+	$averageSubSampleDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;

-print "95% confidence interval for hypothesis 1: " . $sorted1[25] . " -- " . $sorted1[924] . "\n";
-print "95% confidence interval for hypothesis 2: " . $sorted2[25] . " -- " . $sorted2[924] . "\n";
+	#calculating p-value
+	my $count = 0;
+
+	my $realScoreDiff = abs($realScore2 - $realScore1);
+
+	for my $subSampleDiff (@$subSampleDiffArr) {
+		if ($subSampleDiff - $averageSubSampleDiff >= $realDiff) {
+			$count++;
+		}
+	}
+
+	return $count / $TIMES_TO_REPEAT_SUBSAMPLING;
+}
+
+#####
+#
+#####
+sub bootstrap_interval {
+	my $subSampleArr = shift;
+	
+	my @sorted = sort @$subSampleArr;
+	
+	my $lowerIdx = int($TIMES_TO_REPEAT_SUBSAMPLING / 40);
+	my $higherIdx = $TIMES_TO_REPEAT_SUBSAMPLING - $lowerIdx - 1;
+	
+	my $lower = $sorted[$lowerIdx];
+	my $higher = $sorted[$higherIdx];
+	my $diff = $higher - $lower;
+	
+	return ($lower + 0.5 * $diff, 0.5 * $diff);
+}

 #####
 # read 2 hyp and 1 to \infty ref data files
@ -131,6 +173,7 @@ sub readAllData {

 	#reading reference(s) and checking for matching sizes
 	$result{refs} = [];
+	$result{ngramCounts} = { };
 	my $i = 0;

 	for my $refFile (@refFiles) {
@ -141,12 +184,49 @@ sub readAllData {
 			die ("ERROR: ref set $i size doesn't match the size of hyp sets");
 		}
 		
+		updateCounts($result{ngramCounts}, $refDataX);
+		
 		push @{$result{refs}}, $refDataX;
 	}
 	
 	return \%result;
 }

+#####
+#
+#####
+sub updateCounts {
+	my ($countHash, $refData) = @_;
+	
+	for my $snt(@$refData) {
+		my $size = scalar @{$snt->{words}};
+		$countHash->{""} += $size;
+		
+		for my $order(1..$MAX_NGRAMS) {
+			my $ngram;
+			
+			for my $i (0..($size-$order)) {
+				$ngram = join(" ", @{$snt->{words}}[$i..($i + $order - 1)]);
+				
+				$countHash->{$ngram}++;
+			}
+		}
+	}
+}
+
+#####
+#
+#####
+sub ngramInfo {
+	my ($data, $ngram) = @_;
+	
+	my @nwords = split(/ /, $ngram);
+	pop @nwords;
+	my $smallGram = join(" ", @nwords);
+	
+	return log($data->{ngramCounts}->{$smallGram} / $data->{ngramCounts}->{$ngram}) / log(2.0);
+}
+
 #####
 # read sentences from file
 #####
@ -172,41 +252,64 @@ sub preEvalHypo {
 	my $data = shift;
 	my $hypId = shift;
 	
+	for my $lineIdx (0..($data->{size} - 1)) {
+		preEvalHypoSnt($data, $hypId, $lineIdx);
+	}
+}
+
+#####
+#
+#####
+sub preEvalHypoSnt {
+	my ($data, $hypId, $lineIdx) = @_;
+	
 	my ($correctNgramCounts, $totalNgramCounts);
 	my ($refNgramCounts, $hypNgramCounts);
+	my ($coocNgramInfoSum, $totalNgramAmt);
 	
-	for my $lineIdx (0..($data->{size} - 1)) {
-		my $hypSnt = $data->{$hypId}->[$lineIdx];
+	my $hypSnt = $data->{$hypId}->[$lineIdx];
+	
+	#update total hyp len
+	$hypSnt->{hyplen} = scalar @{$hypSnt->{words}};
+	
+	#update total ref len with closest current ref len
+	$hypSnt->{reflen} = getClosestLength($data->{refs}, $lineIdx, $hypSnt->{hyplen});
+	$hypSnt->{avgreflen} = getAvgLength($data->{refs}, $lineIdx);
+	
+	$hypSnt->{correctNgrams} = [];
+	$hypSnt->{totalNgrams} = [];
+	
+	#update ngram precision for each n-gram order
+	for my $order (1..$MAX_NGRAMS) {
+		#hyp ngrams
+		$hypNgramCounts = groupNgrams($hypSnt, $order);
 		
-		#update total hyp len
-		$hypSnt->{hyplen} = scalar @{$hypSnt->{words}};
+		#ref ngrams
+		$refNgramCounts = groupNgramsMultiSrc($data->{refs}, $lineIdx, $order);
 		
-		#update total ref len with closest current ref len
-		$hypSnt->{reflen} = getClosestLength($data->{refs}, $lineIdx, $hypSnt->{hyplen});
+		$correctNgramCounts = 0;
+		$totalNgramCounts = 0;
+		$coocNgramInfoSum = 0;
+		$totalNgramAmt = 0;
+		my $coocUpd;
 		
-		$hypSnt->{correctNgrams} = [];
-		$hypSnt->{totalNgrams} = [];
-		
-		#update ngram precision for each n-gram order
-		for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
-			#hyp ngrams
-			$hypNgramCounts = groupNgrams($hypSnt, $order);
+		#correct, total
+		for my $ngram (keys %$hypNgramCounts) {
+			$coocUpd = min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
+			$correctNgramCounts += $coocUpd;
+			$totalNgramCounts += $hypNgramCounts->{$ngram};
 			
-			#ref ngrams
-			$refNgramCounts = groupNgramsMultiSrc($data->{refs}, $lineIdx, $order);
-			
-			$correctNgramCounts = 0;
-			$totalNgramCounts = 0;
-			
-			#correct, total
-			for my $ngram (keys %$hypNgramCounts) {
-				$correctNgramCounts += min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
-				$totalNgramCounts += $hypNgramCounts->{$ngram};
+			if ($coocUpd > 0) {
+				$coocNgramInfoSum += ngramInfo($data, $ngram);
 			}
 			
-			$hypSnt->{correctNgrams}->[$order] = $correctNgramCounts;
-			$hypSnt->{totalNgrams}->[$order] = $totalNgramCounts;
+			$totalNgramAmt++;
 		}
+		
+		$hypSnt->{correctNgrams}->[$order] = $correctNgramCounts;
+		$hypSnt->{totalNgrams}->[$order] = $totalNgramCounts;
+		$hypSnt->{ngramNistInfoSum}->[$order] = $coocNgramInfoSum;
+		$hypSnt->{ngramNistCount}->[$order] = $totalNgramAmt;
 	}
 }

@ -225,6 +328,56 @@ sub drawWithReplacement {
 	return \@result;
 }

+#####
+#
+#####
+sub getNist {
+	my ($refs, $hyp, $idxs) = @_;
+	
+	#default value for $idxs
+	unless (defined($idxs)) {
+		$idxs = [0..((scalar @$hyp) - 1)];
+	}
+	
+	#vars
+	my ($hypothesisLength, $referenceLength) = (0, 0);
+	my (@infosum, @totalamt);
+	
+	#gather info from each line
+	for my $lineIdx (@$idxs) {
+		
+		my $hypSnt = $hyp->[$lineIdx];
+		
+		#update total hyp len
+		$hypothesisLength += $hypSnt->{hyplen};
+		
+		#update total ref len with closest current ref len
+		$referenceLength += $hypSnt->{avgreflen};
+		
+		#update ngram precision for each n-gram order
+		for my $order (1..$MAX_NGRAMS) {
+			$infosum[$order] += $hypSnt->{ngramNistInfoSum}->[$order];
+			$totalamt[$order] += $hypSnt->{ngramNistCount}->[$order];
+		}
+	}
+	
+	my $toplog = log($hypothesisLength / $referenceLength);
+	my $btmlog = log(2.0/3.0);
+	
+	#compose nist score
+	my $brevityPenalty = ($hypothesisLength > $referenceLength)? 1.0: exp(log(0.5) * $toplog * $toplog / ($btmlog * $btmlog));
+	
+	my $sum = 0;
+	
+	for my $order (1..$MAX_NGRAMS) {
+		$sum += $infosum[$order]/$totalamt[$order];
+	}
+	
+	my $result = $sum * $brevityPenalty;
+	
+	return $result;
+}
+
 #####
 # refs: arrayref of different references, reference = array of lines, line = array of words, word = string
 # hyp: arrayref of lines of hypothesis translation, line = array of words, word = string
@ -254,7 +407,7 @@ sub getBleu {
 		$referenceLength += $hypSnt->{reflen};
 		
 		#update ngram precision for each n-gram order
-		for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
+		for my $order (1..$MAX_NGRAMS) {
 			$correctNgramCounts[$order] += $hypSnt->{correctNgrams}->[$order];
 			$totalNgramCounts[$order] += $hypSnt->{totalNgrams}->[$order];
 		}
@ -265,11 +418,28 @@ sub getBleu {
 	
 	my $logsum = 0;
 	
-	for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
+	for my $order (1..$MAX_NGRAMS) {
 		$logsum += safeLog($correctNgramCounts[$order] / $totalNgramCounts[$order]);
 	}
 	
-	return $brevityPenalty * exp($logsum / $MAX_NGRAMS_FOR_BLEU);
+	return $brevityPenalty * exp($logsum / $MAX_NGRAMS);
+}
+
+#####
+#
+#####
+sub getAvgLength {
+	my ($refs, $lineIdx) = @_;
+	
+	my $result = 0;
+	my $count = 0;
+	
+	for my $ref (@$refs) {
+		$result += scalar @{$ref->[$lineIdx]->{words}};
+		$count++;
+	}
+	
+	return $result / $count;
 }

 #####
@ -366,3 +536,9 @@ sub max {
 	
 	return ($a > $b)? $a: $b;
 }
+
+sub poww {
+	my ($a, $b) = @_;
+	
+	return exp($b * log($a));
+}