mosesdecoder/scripts/analysis/sentence-by-sentence.pl

#!/usr/bin/perl -w

#sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors
#usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html

use strict;
use Getopt::Long;

my $sourcefile = undef;
GetOptions(
	"source=s" => \$sourcefile,
) or exit(1);

my ($sysoutfile, @truthfiles) = @ARGV;

if (!defined $sysoutfile || scalar(@truthfiles) == 0) {
	print STDERR "
usage: $0 system_output reference(s...) > sentence-by-sentence.html
Options:
  --source STRING  ... foreign original

N-grams are colored by the number of supporting references:
 red for fewest, green for most, mediate shades otherwise.
";
  exit(1);
}

my @TRUTHS = () x scalar(@truthfiles);
for(my $i = 0; $i < scalar(@truthfiles); $i++)
{
	open($TRUTHS[$i], "<$truthfiles[$i]") or die "couldn't open '$truthfiles[$i]' for read: $!\n";
	binmode($TRUTHS[$i], ":utf8");
}
open(SYSOUT, "<$sysoutfile") or die "couldn't open '$sysoutfile' for read: $!\n";
binmode(SYSOUT, ":utf8");
binmode(STDOUT, ":utf8");
if (defined $sourcefile)
{
	open(SOURCE, "<$sourcefile") or die "couldn't open '$sourcefile' for read: $!\n";
	binmode(SOURCE, ":utf8");
}
my @bleuScores;
my @htmlSentences;
my @htmlColors = ('#99ff99', '#aaaaff', '#ffff99', '#ff9933', '#ff9999'); #color sentences by rank (split in n tiers)
my $ngramSingleRefColor = '#aaffaa';
my @ngramMultirefColors = ('ff9999', 'ff9933', 'ffff99', 'a0a0ff', '99ff99'); #arbitrary-length list; first entry is used for worst n-grams
my $i = 0;
while(my $sLine = <SYSOUT>)
{
	my @sFactors = @{extractFactorArrays($sLine)};
	my @eLines = () x scalar(@truthfiles);
	my @eFactors;
	for(my $j = 0; $j < scalar(@truthfiles); $j++)
	{
		my $fh = $TRUTHS[$j];
		$eLines[$j] = <$fh>;
		push @eFactors, extractFactorArrays($eLines[$j], "$truthfiles[$j] shorter than $sysoutfile");
	}
	my $sourceFactors;
	if (defined $sourcefile)
	{
		my $sourceLine = <SOURCE>;
		$sourceFactors = extractFactorArrays($sourceLine, "$sourcefile shorter than $sysoutfile");
	}

	my $bleuData = getBLEUSentenceDetails(\@sFactors, \@eFactors, 0);
	push @bleuScores, [$i, $bleuData->[0], 0]; #the last number will be the rank
	my $pwerData = getPWERSentenceDetails(\@sFactors, \@eFactors, 0);
	my $html = "<div class=\"sentence\" style=\"background-color: %%%%\" id=\"sentence$i\">"; #the %%%% and other tokens like it are flags to be replaced
	$html .= "<div class=\"bleu_report\"><b>Sentence $i)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;BLEU:</b> " . sprintf("%.4lg", $bleuData->[0]->[0]) . " (" . join('/', map {sprintf("%.4lg", $_)} @{$bleuData->[0]}[1 .. 4]) . ")</div><table>\n";
	if(defined $sourcefile)
	{
		$html .= "<tr><td class=\"sent_title\">Source</td><td class=\"source_sentence\" id=\"source$i\">" . getFactoredSentenceHTML($sourceFactors) . "</td></tr>\n";
	}
	for(my $j = 0; $j < scalar(@truthfiles); $j++)
	{
		$html .= "<tr><td class=\"sent_title\">Ref $j</td><td class=\"truth_sentence\" id=\"truth${i}_$j\">" . getFactoredSentenceHTML($eFactors[$j]) . "</td></tr>\n";
	}
	my $j = 0;
	$html .= "<tr><td class=\"sent_title\">Output</td><td class=\"sysout_sentence\" id=\"sysout$i\">" . getFactoredSentenceHTML(\@sFactors, $pwerData) . "</td></tr>\n";
	$j = 0;
	$html .= "<tr><td class=\"sent_title\">N-grams</td><td class=\"sysout_ngrams\" id=\"ngrams$i\">" . getAllNgramsHTML(\@sFactors, $bleuData->[1], scalar(@truthfiles)) . "</td></tr>\n";
	$html .= "</table></div>\n";
	push @htmlSentences, $html;
	$i++;
}
close(SYSOUT);
foreach my $truthfh (@TRUTHS) {close($truthfh);}

rankSentencesByBLEU(\@bleuScores);
my $stylesheet = <<EOHTML;
<style type="text/css">
.legend {background: #fff; border: 1px solid #000; padding: 2px; margin-bottom: 10px; margin-right: 15px}
.legend_title {font-weight: bold; font-size: medium; text-decoration: underline}
div.sentence {background: #ffffee; border: 1px solid #000088; padding: 0px 8px 0px 8px} //entire composition for a given sentence
div.sentence td {margin: 8px 0px 8px 0px}
div.bleu_report {margin-bottom: 5px}
td.sent_title {font-weight: bold; font-size: medium; margin-bottom: 12px}
.source_sentence {background: #ffcccc; border: 1px solid #bbb}
.truth_sentence {background: #ccffcc; border: 1px solid #bbb}
.sysout_sentence {background: #ccccff; border: 1px solid #bbb}
table.sentence_table {border: none}
.sysout_ngrams {background: #fff; border: 1px solid #bbb}
table.ngram_table {}
td.ngram_cell {padding: 1px}
</style>
EOHTML
print "<html><head>\n";
print "<meta http-equiv=\"Content-type: text/html; charset=utf-8\">\n";
print "<title>$sysoutfile vs. [" . join(' ', @truthfiles) . "]: Sentence-by-sentence Comparison</title>$stylesheet</head><body>\n";

#javascript to sort by BLEU, by order in corpus, ...
my %rank2index = map {$bleuScores[$_]->[2] => $_} (0 .. scalar(@htmlSentences) - 1);
print "<script type=\"text/javascript\">
function sortByBLEU()
{
	var body = document.getElementById('all_sentences'); var row;\n";
foreach my $rank (sort {$a <=> $b} keys %rank2index)
{
	print "\trow = document.getElementById('everything" . $rank2index{$rank} . "');\n";
	print "\tbody.removeChild(row); body.appendChild(row);\n";
}
print "}
function sortByCorpusOrder()
{
	var body = document.getElementById('all_sentences'); var row;\n";
for(my $j = 0; $j < scalar(@htmlSentences); $j++)
{
	print "\trow = document.getElementById('everything$j');\n";
	print "\tbody.removeChild(row); body.appendChild(row);\n";
}
print "}
</script>\n";

#legend for background colors
my @minBLEU = (1e9) x scalar(@htmlColors);
my @maxBLEU = (-1e9) x scalar(@htmlColors);
for(my $k = 0; $k < scalar(@htmlSentences); $k++)
{
	my $tier = int($bleuScores[$k]->[2] / (scalar(@htmlSentences) / scalar(@htmlColors)));
	if($bleuScores[$k]->[1]->[0] < $minBLEU[$tier]) {$minBLEU[$tier] = $bleuScores[$k]->[1]->[0];}
	elsif($bleuScores[$k]->[1]->[0] > $maxBLEU[$tier]) {$maxBLEU[$tier] = $bleuScores[$k]->[1]->[0];}
}
print "<table border=0><tr><td><div class=\"legend\"><span class=\"legend_title\">Sentence Background Colors => BLEU Ranges</span><table border=0>";
for(my $k = 0; $k < scalar(@htmlColors); $k++)
{
	print "<tr><td style=\"width: 15px; height: 15px; background: " . $htmlColors[$k] . "\"></td><td align=left style=\"padding-left: 12px\">"
							. sprintf("%.4lg", $minBLEU[$k]) . " - " . sprintf("%.4lg", $maxBLEU[$k]) . "</td>";
}
print "</table></div></td>\n";
print "<td><div class=\"legend\"><span class=\"legend_title\">N-gram Colors => Number of Matching Reference Translations</span><table border=0>";
for(my $k = 1; $k <= scalar(@truthfiles); $k++)
{
	print "<tr><td style=\"width: 15px; height: 15px; background: " . getNgramColorHTML($k, scalar(@truthfiles)) . "\"></td><td align=left style=\"padding-left: 12px\">$k</td>";
}
print "</table></div></td></tr></table><div style=\"font-weight: bold; margin-bottom: 15px\">
PWER errors are marked in red on output sentence displays.</div>
<div style=\"margin-bottom: 8px\">Sort by <a href=\"javascript:sortByBLEU();\">BLEU score</a> | <a href=\"javascript:sortByCorpusOrder();\">corpus order</a> (default)</div>\n";

#sentence boxes
print "<div id=\"all_sentences\">";
my $j = 0;
foreach my $sentenceHTML (@htmlSentences)
{
	print "<div id=\"everything$j\" style=\"margin: 0px; padding: 0px\">";
	print "<hr width=98%>";
	my $bgcolor = getSentenceBGColorHTML($bleuScores[$j], $i); #i is now # of sentences
	$sentenceHTML =~ s/%%%%/$bgcolor/;
	print "$sentenceHTML</div>\n";
	$j++;
}
print "</div></body></html>";

##################### utils #####################

#arguments: a, b (scalars)
sub min
{
	my ($a, $b) = @_;
	return ($a < $b) ? $a : $b;
}
#arguments: a, b (scalars)
sub max
{
	my ($a, $b) = @_;
	return ($a > $b) ? $a : $b;
}
#arguments: a list of elements
#return undef for an empty list, the max element otherwise
sub maxN
{
	if(scalar @_ == 0) {return undef;}
	my $val = shift @_;
	foreach my $e (@_) {if($e > $val) {$val = $e;}}
	return $val;
}
#arguments: x
sub my_log
{
  return -9999999999 unless $_[0];
  return log($_[0]);
}
#arguments: x
sub round
{
	my $x = shift;
	return ($x - int($x) < .5) ? int($x) : int($x) + 1;
}

###############################################################################################################################################################

#arguments: line read from corpus file, (optionally) string to die with if line isn't defined (default die-msg is empty)
#return: sentence (arrayref of arrayrefs of factor strings) taken from line
sub extractFactorArrays
{
	my ($line, $msg) = (shift, '');
	$msg = shift if scalar(@_);
	die $msg if !defined $line;
	chomp $line;
	$line =~ s/^\s*|\s*$//g; #added by Ondrej to handle moses-mert-parallel output
	my @words = split(/\s+/, $line);
	my @factors = map {my @f = split(/\|/, $_); \@f;} @words;
	return \@factors;
}

#can handle multiple reference translations; assume at least one
#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use
#return: arrayref of [arrayref of [overall BLEU score, n-gram precisions], arrayref of matching n-gram [start index, length, arrayref of indices of matching references]]
sub getBLEUSentenceDetails
{
	my $maxNgramOrder = 4;
	my ($refSysOutput, $refTruths, $factorIndex) = @_;
	my $length_translation = scalar(@$refSysOutput); #length of sysout sentence
	my @length_references = map {scalar(@$_)} @$refTruths;
	my $closestTruthLength = (sort(map {abs($_ - $length_translation)} @length_references))[0];
	my @correct = (0) x $maxNgramOrder; #n-gram counts
	my @total = (0) x $maxNgramOrder; #n-gram counts
	my $returnData = [[], []];
	my %REF_GRAM; #hash from n-gram to arrayref with # of times found in each reference
	my $ngramMatches = []; #arrayref of n-gram [start index, length]
	for(my $j = 0; $j < scalar(@$refTruths); $j++)
	{
		for(my $i = 0; $i < $length_references[$j]; $i++)
		{
			my $gram = '';
			for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders
			{
				$gram = $refTruths->[$j]->[$i - $k]->[$factorIndex] . " " . $gram;
				#increment the count for the given n-gram and given reference number
				if(!exists $REF_GRAM{$gram})
				{
					my @tmp = (0) x scalar @$refTruths;
					$tmp[$j] = 1;
					$REF_GRAM{$gram} = \@tmp;
				}
				else
				{
					$REF_GRAM{$gram}->[$j]++;
				}
			}
		}
	}
	for(my $i = 0; $i < $length_translation; $i++)
	{
		my $gram = '';
		for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders
		{
			$gram = $refSysOutput->[$i - $k]->[$factorIndex] . " " . $gram;
			if(exists $REF_GRAM{$gram}) #this n-gram was found in at least one reference
			{
				$correct[$k]++;
				my @indices = ();
				for(my $m = 0; $m < scalar(@{$REF_GRAM{$gram}}); $m++)
				{
					if($REF_GRAM{$gram}->[$m] > 0)
					{
						push @indices, $m;
						$REF_GRAM{$gram}->[$m]--;
					}
				}
				push @$ngramMatches, [$i - $k, $k + 1, \@indices];
			}
		}
	}
	my $brevity = ($length_translation > $closestTruthLength || $length_translation == 0) ? 1 : exp(1 - $closestTruthLength / $length_translation);
	my @pct;
	my ($logsum, $logcount) = (0, 0);
	for(my $i = 0; $i < $maxNgramOrder; $i++)
	{
		$total[$i] = max(1, $length_translation - $i);
		push @pct, ($total[$i] == 0) ? -1 : $correct[$i] / $total[$i];
		if($total[$i] > 0)
		{
			$logsum += my_log($pct[$i]);
			$logcount++;
		}
	}
	my $bleu = $brevity * exp($logsum / $logcount);
	$returnData->[0] = [$bleu, @pct];
	$returnData->[1] = $ngramMatches;
	return $returnData;
}

#can handle multiple sentence translations; assume at least one
#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use
#return: hashref of sysout word index => whether word matches
sub getPWERSentenceDetails
{
	my ($refSysOutput, $refTruths, $factorIndex) = @_;
	my $matches = {};
	my %truthWords; #hash from word to arrayref with number of times seen in each reference (but later holds only the max)
	for(my $i = 0; $i < scalar(@$refTruths); $i++)
	{
		foreach my $eWord (@{$refTruths->[$i]})
		{
			my $factor = $eWord->[$factorIndex];
			if(exists $truthWords{$factor}) {$truthWords{$factor}->[$i]++;}
			else {my @tmp = (0) x scalar(@$refTruths); $tmp[$i] = 1; $truthWords{$factor} = \@tmp;}
		}
	}
	%truthWords = map {$_ => maxN(@{$truthWords{$_}})} (keys %truthWords); #save only the max times each word is seen in a reference
	for(my $j = 0; $j < scalar(@$refSysOutput); $j++)
	{
		if(exists $truthWords{$refSysOutput->[$j]->[$factorIndex]} && $truthWords{$refSysOutput->[$j]->[$factorIndex]} > 0)
		{
			$truthWords{$refSysOutput->[$j]->[$factorIndex]}--;
			$matches->{$j} = 1;
		}
		else
		{
			$matches->{$j} = 0;
		}
	}
	return $matches;
}

#assign ranks to sentences by BLEU score
#arguments: arrayref of arrayrefs of [sentence index, arrayref of [bleu score, n-gram precisions], rank to be assigned]
#return: none
sub rankSentencesByBLEU
{
	my $bleuData = shift;
	my $i = 0;
	#sort first on score, then on 1-gram accuracy, then on sentence index
	foreach my $sentenceData (reverse sort {my $c = $a->[1]->[0] <=> $b->[1]->[0]; if($c == 0) {my $d = $a->[1]->[1] <=> $b->[1]->[1]; if($d == 0) {$a->[0] cmp $b->[0];} else {$d;}} else {$c;}} @$bleuData) {$sentenceData->[2] = $i++;}
}

###############################################################################################################################################################

#write HTML for a sentence containing factors (display words in a row)
#arguments: sentence  (arrayref of arrayrefs of factor strings), PWER results (hashref from word indices to 0/1 whether matched a truth word)
#return: HTML string
sub getFactoredSentenceHTML
{
	my $sentence = shift;
	my $pwer = 0; if(scalar(@_) > 0) {$pwer = shift;}
	my $html = "<table class=\"sentence_table\"><tr>";
	for(my $i = 0; $i < scalar(@$sentence); $i++) #loop over words
	{
		my $style = ''; #default
		if($pwer ne '0' && $pwer->{$i} == 0) {$style = 'color: #cc0000; font-weight: bold';}
		$html .= "<td align=center style=\"$style\">" . join("<br>", @{$sentence->[$i]}) . "</td>";
	}
	return $html . "</tr></table>";
}

#arguments: arrayref of [sentence index, arrayref of [bleu score, n-gram precisions], rank], number of sentences
#return: HTML color string
sub getSentenceBGColorHTML
{
	my ($scoreData, $numSentences) = @_;
	my $tier = int($scoreData->[2] / ($numSentences / scalar(@htmlColors))); #0..n-1
	return $htmlColors[$tier];
}

#display all matching n-grams in the given sentence, with all 1-grams on one line, then arranged by picking, for each, the first line on which it fits,
# where a given word position can only be filled by one n-gram per line, so that all n-grams can be shown
#arguments: sentence (arrayref of arrayrefs of factor strings), arrayref of arrayrefs of matching n-gram [start, length, arrayref of matching reference indices],
# number of reference translations
#return: HTML string
sub getAllNgramsHTML
{
	my ($sentence, $ngrams, $numTruths) = @_;
	my $factorIndex = 0;
	my @table = (); #array or arrayrefs each of which represents a line; each position has the index of the occupying n-gram, or -1 if none
	my $n = 0; #n-gram index
	foreach my $ngram (sort {$a->[0] <=> $b->[0]} @$ngrams)
	{
		#check for an open slot in an existing row
		my $foundRow = 0;
		my $r = 0;
		foreach my $row (@table)
		{
			if(rowIsClear($row, $ngram) == 1)
			{
				@{$row}[$ngram->[0] .. ($ngram->[0] + $ngram->[1] - 1)] = ($n) x $ngram->[1];
				push @$ngram, $r; #add row index
				$foundRow = 1;
				last;
			}
			$r++;
		}
		#add row if necessary
		if($foundRow == 0)
		{
			my @row = (-1) x scalar(@$sentence);
			@row[$ngram->[0] .. ($ngram->[0] + $ngram->[1] - 1)] = ($n) x $ngram->[1];
			push @$ngram, scalar(@table); #add row index
			push @table, \@row;
		}
		$n++;
	}

	my $html = "<table class=\"ngram_table\"><tr><td align=center>" . join("</td><td align=center>", map {$_->[$factorIndex]} @$sentence) . "</td></tr>";

	my $numWords = scalar(@$sentence);
	my ($curRow, $curCol) = (0, 0); #address in table
	$html .= "<tr>";
	foreach my $ngram (sort {my $c = $a->[3] <=> $b->[3]; if($c == 0) {$a->[0] <=> $b->[0]} else {$c}} @$ngrams) #sort by row, then word num
	{
		while($ngram->[0] > $curCol || $ngram->[3] > $curRow) {$html .= "<td></td>"; $curCol = ($curCol + 1) % $numWords; if($curCol == 0) {$html .= "</tr><tr>"; $curRow++;}}
		$html .= "<td colspan=" . $ngram->[1] . " align=center class=\"ngram_cell\" style=\"background: " . getNgramColorHTML(scalar(@{$ngram->[2]}), $numTruths) . "\">" . join(' ', map {$_->[$factorIndex]} @{$sentence}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) . "</td>";
		$curCol = ($curCol + $ngram->[1]) % $numWords; if($curCol == 0) {$html .= "</tr><tr>"; $curRow++;}
	}
	$html .= "</tr>";

	return $html . "</table>\n";
}

#auxiliary to getAllNgramsHTML(): check a table row for an empty piece at the right place for the given n-gram
#arguments: row (arrayref of ints), n-gram (arrayref of [start index, length])
#return: whether (0/1) row is clear
sub rowIsClear
{
	my ($row, $ngram) = @_;
	return (maxN(@{$row}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) == -1) ? 1 : 0;
}

#auxiliary to getAllNgramsHTML()
#arguments: number of reference translations matching the n-gram, total number of references
#return: HTML color string
sub getNgramColorHTML
{
	my ($matches, $total) = @_;
	if($total == 1) {return $ngramSingleRefColor;}
	return $ngramMultirefColors[round($matches / $total * (scalar(@ngramMultirefColors) - 1))];
}