Fixed bug about nbest generation when a sentence is not translated.

Now, a single fictitious "empty translation" with score 0 is added for each untranslated sentence. Previously, the resulting misalignment caused problems with MERT. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@972 1f5c12ca-751b-0410-a591-d2e778427230
2024-12-29 06:52:34 +03:00 · 2006-11-10 10:36:23 +00:00 · 2006-11-10 10:36:23 +00:00 · 5c17fe6505
commit 5c17fe6505
parent 5736284365
1 changed files with 44 additions and 4 deletions
--- a/scripts/generic/moses-parallel.pl
+++ b/scripts/generic/moses-parallel.pl
@ -58,7 +58,7 @@ my $old_sge = 0; # assume old Sun Grid Engine (<6.0) where qsub does not
 #######################
 # Command line options processing
 sub init(){
-  use Getopt::Long qw(:config pass_through);
+  use Getopt::Long qw(:config pass_through no_ignore_case);
  GetOptions('version'=>\$version,
 	     'help'=>\$help,
 	     'debug'=>\$dbg,
@ -372,17 +372,57 @@ sub preparing_script(){
 sub concatenate_nbest(){
  my $oldcode="";
  my $newcode=-1;
+  my %inplength = ();
+  my $offset = 0;
+ 
+
+# get the list of features and set a fictitious string with zero scores
+  open (IN, "${nbestfile}.${splitpfx}$idxlist[0]");
+  my $str = <IN>;
+  chomp($str);
+  close(IN);
+  my ($code,$trans,$featurescores,$globalscore)=split(/\|\|\|/,$str);
+  
+  my $emptytrans = "  ";
+  my $emptyglobalscore = " 0.0";
+  my $emptyfeaturescores = $featurescores;
+  $emptyfeaturescores =~ s/[-0-9\.]+/0/g;
+
  open (OUT, "> ${orinbestfile}");
  foreach my $idx (@idxlist){
+
+#computing the length of each input file
+    my @in=();
+    open (IN, "${testfile}.${splitpfx}${idx}.trans");
+    @in=<IN>;
+    close(IN);
+    $inplength{$idx} = scalar(@in);
+
    open (IN, "${nbestfile}.${splitpfx}${idx}");
    while (<IN>){
      my ($code,@extra)=split(/\|\|\|/,$_);
-      $newcode++ if $code ne $oldcode;
+      $code += $offset;
+      if ($code ne $oldcode){
+
+# if there is a jump between two consecutive codes,
+# it means that an input sentence was not translated;
+# fill this hole with a "fictitious" list of translations
+# comprising just one "empty translation" with zero scores
+        while ($code - $oldcode > 1){
+           $oldcode++;
+           print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
+        }
+      }
      $oldcode=$code;
-      print OUT join("\|\|\|",($newcode,@extra));
+      print OUT join("\|\|\|",($oldcode,@extra));
    }
    close(IN);
-    $oldcode="";
+    $offset += $inplength{$idx};
+
+    while ($offset - $oldcode > 1){
+      $oldcode++;
+      print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
+    }
  }
  close(OUT);
 }