allows arbitrary mixing of 'kept' and 'added' factors in output

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@627 1f5c12ca-751b-0410-a591-d2e778427230
2024-09-20 23:58:15 +03:00 · 2006-08-10 22:00:02 +00:00 · 2006-08-10 22:00:02 +00:00 · 68ef1413cd
commit 68ef1413cd
parent 5a0787310b
1 changed files with 41 additions and 34 deletions
--- a/scripts/training/reduce_combine.pl
+++ b/scripts/training/reduce_combine.pl
@ -4,6 +4,7 @@
 # produces new corpus on stdout

 use strict;
+use warnings;
 use Getopt::Long;
 use IO::File;
 use File::Basename;
@ -18,29 +19,24 @@ GetOptions(
 );
 my $corppathname = shift;

-my @args = split /\+/, join("+", @ARGV);
-my $keepfactors = shift @args;
-my @addfactors = @args;
-die "usage: reduce_combine.pl corpusfile 0,1,2 add_factor_label1 add_factor_label2 ..."
-  if !defined $corppathname || !defined $keepfactors;
+my @requested_factors = split /[\+,]/, join("+", @ARGV);
+die "usage: reduce_combine.pl corpusfile 0 add_factor_label1 2 add_factor_label2 ..."
+  if !defined $corppathname || 0 == scalar @requested_factors;
+
+my @addfactors = grep { ! /^[0-9]+$/ } @requested_factors;
+# these are the labelled (non-numeric) factors whose factor files we need to load

-my @keepfactors = ();
-if ($keepfactors =~ /^[0-9,]+$/) {
-  # assume these are really factors to keep
-  @keepfactors = split /,/, $keepfactors;
-} else {
-  # assume there are no factors to keep, just to output some added
-  unshift @addfactors, $keepfactors;
-}

 open CORP, $corppathname or die "Can't read $corppathname";
 binmode(CORP, ":utf8");

 my $corpdn = dirname($corppathname);
 my $corpbn = basename($corppathname);
-my @streams = map {
+my %streams = map {
  my $fn = "$corpdn/$factordir/$corpbn.$_";
-  IO::File->new($fn, "<:utf8") or die "Can't read '$fn'"
+  my $stream = IO::File->new($fn, "<:utf8");
+  die "Can't read '$fn'" if !defined $stream;
+  ( $_, $stream ); # define a mapping factorlabel->stream
 } @addfactors;

 my $nr=0;
@ -50,34 +46,45 @@ while (<CORP>) {
  print STDERR "($nr)" if $nr % 100000 == 0;
  chomp;
  my @intokens = split / /;
-  my $extratokens = undef;
-  for(my $i=0; $i<=$#streams; $i++) {
-    my $line = readline($streams[$i]);
-    die "Additional factor file $addfactors[$i] contains too few sentences!"
+  # load lines of corresponding streams and ensure equal number of words
+  my %lines_of_extratoks;
+  foreach my $factor (keys %streams) {
+    my $line = readline($streams{$factor});
+    die "Additional factor file $factor contains too few sentences!"
      if !defined $line;
    chomp($line);
-    my @extrafactors = split / /, $line;
-    die "Incompatible number of words in factor $addfactors[$i] on line $nr."
-      if $#extrafactors != $#intokens;
-    for(my $j=0; $j<=$#extrafactors; $j++) {
-      $extratokens->[$j]->[$i] = $extrafactors[$j];
-    }
+    my @toks = split / /, $line;
+    die "Incompatible number of words in factor $factor on line $nr."
+      if $#toks != $#intokens;
+    $lines_of_extratoks{$factor} = \@toks;
  }
-  my @outline = ();
+  
+  # for every token, print the factors in the order the user requested
  for(my $i=0; $i<=$#intokens; $i++) {
    my $token = $intokens[$i];
    my @outtoken = ();
    my @factors = split /\|/, $token;
-    foreach my $fid (@keepfactors) {
-      my $f = @factors[$fid];
-      die "Missed factor $fid in $token on line $nr"
-        if !defined $f;
-      push @outtoken, $factors[$fid];
+    # print STDERR "Token: $token\n";
+    foreach my $name (@requested_factors) {
+      my $f = undef;
+      if ($name =~ /^[0-9]+$/o) {
+        # numeric factors should be copied from original corpus
+        $f = $factors[$name];
+        die "Missed factor $name in $token on line $nr"
+          if !defined $f || $f eq "";
+      } else {
+        # named factors should be obtained from the streams
+	$f = $lines_of_extratoks{$name}->[$i];
+        die "Missed factor $name on line $nr"
+          if !defined $f || $f eq "";
+      }
+      # print STDERR "  Factor $name: $f\n";
+      push @outtoken, $f;
    }
-    push @outtoken, @{$extratokens->[$i]} if 0 < scalar @addfactors;
-    push @outline, join("|", @outtoken);
+    print " " if $i != 0;
+    print join("|", @outtoken);
  }
-  print join(" ", @outline)."\n";
+  print "\n";
 }
 close CORP;
 print STDERR "Done.\n";