Fixed a bug in n-best list generation when a sentence is not translated.

Now a single fictitious "empty translation" with score 0 is added for each untranslated sentence.
Previously, problems occurred with MERT due to a misalignment.


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@972 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
nicolabertoldi 2006-11-10 10:36:23 +00:00
parent 5736284365
commit 5c17fe6505

View File

@ -58,7 +58,7 @@ my $old_sge = 0; # assume old Sun Grid Engine (<6.0) where qsub does not
####################### #######################
# Command line options processing # Command line options processing
sub init(){ sub init(){
use Getopt::Long qw(:config pass_through); use Getopt::Long qw(:config pass_through no_ignore_case);
GetOptions('version'=>\$version, GetOptions('version'=>\$version,
'help'=>\$help, 'help'=>\$help,
'debug'=>\$dbg, 'debug'=>\$dbg,
@ -372,17 +372,57 @@ sub preparing_script(){
sub concatenate_nbest(){ sub concatenate_nbest(){
my $oldcode=""; my $oldcode="";
my $newcode=-1; my $newcode=-1;
my %inplength = ();
my $offset = 0;
# get the list of feature and set a fictitious string with zero scores
open (IN, "${nbestfile}.${splitpfx}$idxlist[0]");
my $str = <IN>;
chomp($str);
close(IN);
my ($code,$trans,$featurescores,$globalscore)=split(/\|\|\|/,$str);
my $emptytrans = " ";
my $emptyglobalscore = " 0.0";
my $emptyfeaturescores = $featurescores;
$emptyfeaturescores =~ s/[-0-9\.]+/0/g;
open (OUT, "> ${orinbestfile}"); open (OUT, "> ${orinbestfile}");
foreach my $idx (@idxlist){ foreach my $idx (@idxlist){
#computing the length of each input file
my @in=();
open (IN, "${testfile}.${splitpfx}${idx}.trans");
@in=<IN>;
close(IN);
$inplength{$idx} = scalar(@in);
open (IN, "${nbestfile}.${splitpfx}${idx}"); open (IN, "${nbestfile}.${splitpfx}${idx}");
while (<IN>){ while (<IN>){
my ($code,@extra)=split(/\|\|\|/,$_); my ($code,@extra)=split(/\|\|\|/,$_);
$newcode++ if $code ne $oldcode; $code += $offset;
if ($code ne $oldcode){
# if there is a jump between two consecutive codes
# it means that an input sentence is not translated
# fill this hole with a "fictitious" list of translation
# comprising just one "emtpy translation" with zero scores
while ($code - $oldcode > 1){
$oldcode++;
print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
}
}
$oldcode=$code; $oldcode=$code;
print OUT join("\|\|\|",($newcode,@extra)); print OUT join("\|\|\|",($oldcode,@extra));
} }
close(IN); close(IN);
$oldcode=""; $offset += $inplength{$idx};
while ($offset - $oldcode > 1){
$oldcode++;
print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
}
} }
close(OUT); close(OUT);
} }