From 7e9108dd29ff3f03820a968bf81cd022fe35c43f Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Fri, 20 Oct 2017 10:08:22 +0100 Subject: [PATCH] multi-bleu-detok.perl - a plain text alternative to mteval-v13a.perl --- scripts/generic/multi-bleu-detok.perl | 211 ++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100755 scripts/generic/multi-bleu-detok.perl diff --git a/scripts/generic/multi-bleu-detok.perl b/scripts/generic/multi-bleu-detok.perl new file mode 100755 index 000000000..d2ef60c90 --- /dev/null +++ b/scripts/generic/multi-bleu-detok.perl @@ -0,0 +1,211 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +# This file uses the internal tokenization of mteval-v13a.pl, +# giving the exact same (case-sensitive) results on untokenized text. +# Using this script with detokenized output and untokenized references is +# preferrable over multi-bleu.perl, since scores aren't affected by tokenization differences. +# +# like multi-bleu.perl , it supports plain text input and multiple references. + +# $Id$ +use warnings; +use strict; + +my $lowercase = 0; +if ($ARGV[0] eq "-lc") { + $lowercase = 1; + shift; +} + +my $stem = $ARGV[0]; +if (!defined $stem) { + print STDERR "usage: multi-bleu-detok.pl [-lc] reference < hypothesis\n"; + print STDERR "Reads the references from reference or reference0, reference1, ...\n"; + exit(1); +} + +$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; + +my @REF; +my $ref=0; +while(-e "$stem$ref") { + &add_to_ref("$stem$ref",\@REF); + $ref++; +} +&add_to_ref($stem,\@REF) if -e $stem; +die("ERROR: could not find reference file $stem") unless scalar @REF; + +# add additional references explicitly specified on the command line +shift; +foreach my $stem (@ARGV) { + &add_to_ref($stem,\@REF) if -e $stem; +} + + + +sub add_to_ref { + my ($file,$REF) = @_; + my $s=0; + if ($file =~ /.gz$/) { + open(REF,"gzip -dc $file|") or die "Can't read $file"; + } else { + open(REF,$file) or die "Can't read $file"; + } + while() { + chop; + $_ = tokenization($_); + push @{$$REF[$s++]}, $_; + } + close(REF); +} + +my(@CORRECT,@TOTAL,$length_translation,$length_reference); +my $s=0; +while() { + chop; + $_ = lc if $lowercase; + $_ = tokenization($_); + my @WORD = split; + my %REF_NGRAM = (); + my $length_translation_this_sentence = scalar(@WORD); + my ($closest_diff,$closest_length) = (9999,9999); + foreach my $reference (@{$REF[$s]}) { +# print "$s $_ <=> $reference\n"; + $reference = lc($reference) if $lowercase; + my @WORD = split(' ',$reference); + my $length = scalar(@WORD); + my $diff = abs($length_translation_this_sentence-$length); + if ($diff < $closest_diff) { + $closest_diff = $diff; + $closest_length = $length; + # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; + } elsif ($diff == $closest_diff) { + $closest_length = $length if $length < $closest_length; + # from two references with the same closeness to me + # take the *shorter* into account, not the "first" one. + } + for(my $n=1;$n<=4;$n++) { + my %REF_NGRAM_N = (); + for(my $start=0;$start<=$#WORD-($n-1);$start++) { + my $ngram = "$n"; + for(my $w=0;$w<$n;$w++) { + $ngram .= " ".$WORD[$start+$w]; + } + $REF_NGRAM_N{$ngram}++; + } + foreach my $ngram (keys %REF_NGRAM_N) { + if (!defined($REF_NGRAM{$ngram}) || + $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { + $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; +# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; + } + } + } + } + $length_translation += $length_translation_this_sentence; + $length_reference += $closest_length; + for(my $n=1;$n<=4;$n++) { + my %T_NGRAM = (); + for(my $start=0;$start<=$#WORD-($n-1);$start++) { + my $ngram = "$n"; + for(my $w=0;$w<$n;$w++) { + $ngram .= " ".$WORD[$start+$w]; + } + $T_NGRAM{$ngram}++; + } + foreach my $ngram (keys %T_NGRAM) { + $ngram =~ /^(\d+) /; + my $n = $1; + # my $corr = 0; +# print "$i e $ngram $T_NGRAM{$ngram}
\n"; + $TOTAL[$n] += $T_NGRAM{$ngram}; + if (defined($REF_NGRAM{$ngram})) { + if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { + $CORRECT[$n] += $T_NGRAM{$ngram}; + # $corr = $T_NGRAM{$ngram}; +# print "$i e correct1 $T_NGRAM{$ngram}
\n"; + } + else { + $CORRECT[$n] += $REF_NGRAM{$ngram}; + # $corr = $REF_NGRAM{$ngram}; +# print "$i e correct2 $REF_NGRAM{$ngram}
\n"; + } + } + # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; + # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" + } + } + $s++; +} +my $brevity_penalty = 1; +my $bleu = 0; + +my @bleu=(); + +for(my $n=1;$n<=4;$n++) { + if (defined ($TOTAL[$n])){ + $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; + # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; + }else{ + $bleu[$n]=0; + } +} + +if ($length_reference==0){ + printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; + exit(1); +} + +if ($length_translation<$length_reference) { + $brevity_penalty = exp(1-$length_reference/$length_translation); +} +$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + + my_log( $bleu[2] ) + + my_log( $bleu[3] ) + + my_log( $bleu[4] ) ) / 4) ; +printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", + 100*$bleu, + 100*$bleu[1], + 100*$bleu[2], + 100*$bleu[3], + 100*$bleu[4], + $brevity_penalty, + $length_translation / $length_reference, + $length_translation, + $length_reference; + +sub my_log { + return -9999999999 unless $_[0]; + return log($_[0]); +} + + + +sub tokenization +{ + my ($norm_text) = @_; + +# language-independent part: + $norm_text =~ s///g; # strip "skipped" tags + $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines + $norm_text =~ s/\n/ /g; # join lines + $norm_text =~ s/"/"/g; # convert SGML tag for quote to " + $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & + $norm_text =~ s/</ + $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to < + +# language-dependent part (assuming Western languages): + $norm_text = " $norm_text "; + $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation + $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit + $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit + $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit + $norm_text =~ s/\s+/ /g; # one space only between words + $norm_text =~ s/^\s+//; # no leading space + $norm_text =~ s/\s+$//; # no trailing space + + return $norm_text; +}