Added combine_factors.pl, yet another combiner for factored corpora

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1026 1f5c12ca-751b-0410-a591-d2e778427230
2024-09-20 23:58:15 +03:00 · 2006-11-30 06:17:45 +00:00 · 2006-11-30 06:17:45 +00:00 · 72ff1f8450
commit 72ff1f8450
parent 412f04737c
2 changed files with 73 additions and 0 deletions
--- a/scripts/released-files
+++ b/scripts/released-files
@ -32,6 +32,7 @@ training/phrase-extract/extract
 training/phrase-extract/score
 training/postprocess-lopar.perl
 training/reduce_combine.pl
+training/combine_factors.pl
 training/train-factored-phrase-model.perl
 training/symal/symal
 training/symal/giza2bal.pl
--- a/scripts/training/combine_factors.pl
+++ b/scripts/training/combine_factors.pl
@ -0,0 +1,72 @@
+#!/usr/bin/perl
+# Given a list of corpus files, combines them token by token into a single factored corpus ("|"-separated factors), sent to stdout.
+
+use strict;
+use warnings;
+use Getopt::Long;
+use IO::File;
+use File::Basename;
+
+# Switch all three standard handles to Perl's :utf8 layer.
+binmode($_, ":utf8") for (\*STDIN, \*STDOUT, \*STDERR);
+
+# Every command-line argument names one corpus file; at least one is required.
+my @addfactors = @ARGV;
+die "usage: combine_factors.pl corpusfile1 corpusfile2 ..."
+  unless @addfactors;
+
+# Open every corpus file for reading as UTF-8 ("-" = STDIN, *.gz via zcat).
+# 3-arg/list-form open keeps the names away from the shell and mode parsing.
+my @streams = map {
+  my ($mode, @src) = $_ eq '-' ? ('<&', \*STDIN)
+                   : /\.gz$/   ? ('-|', 'zcat', $_) : ('<', $_);
+  open(my $stream, $mode, @src) or die "Can't open '$_': $!";
+  binmode($stream, ":utf8");
+  $stream;
+} @addfactors;
+# Merge line by line: file 1 supplies tokens, the others one factor per token.
+my $nr=0;
+my $firststream = shift @streams;
+shift @addfactors; # just to keep the lengths sync'ed
+while (defined(my $sentence = readline($firststream))) {
+  $nr++;
+  print STDERR "." if $nr % 10000 == 0;      # progress: a dot per 10k lines
+  print STDERR "($nr)" if $nr % 100000 == 0; # plus the count per 100k lines
+  chomp $sentence;
+  my @intokens = split / /, $sentence;
+  # load lines of corresponding streams and ensure equal number of words
+  my @lines_of_extratoks;
+  foreach my $factor (0..$#streams) {
+    my $line = readline($streams[$factor]);
+    die "Additional factor file $addfactors[$factor] contains too few sentences!"
+      if !defined $line;
+    chomp($line);
+    my @toks = split / /, $line;
+    die "Incompatible number of words in factor $factor on line $nr."
+      if $#toks != $#intokens;
+    $lines_of_extratoks[$factor] = \@toks;
+  }
+
+  # for every token, print the factors in the order as user wished
+  my @outtokens;
+  foreach my $i (0..$#intokens) {
+    my @outtoken = ($intokens[$i]); # the first factor comes from the first file
+    foreach my $factor (0..$#streams) {
+      my $f = $lines_of_extratoks[$factor]->[$i];
+      # "$i+1" inside the string would print e.g. "0+1"; compute the 1-based index
+      die "Missed factor value for word ".($i+1)." on line $nr in $addfactors[$factor]"
+        if !defined $f || $f eq "";
+      push @outtoken, $f;
+    }
+    push @outtokens, join("|", @outtoken);
+  }
+  print join(" ", @outtokens), "\n";
+}
+close $firststream;
+# Leftover sentences in a later file mean the corpora are misaligned; say so.
+foreach my $factor (0..$#streams) {
+  warn "Additional factor file $addfactors[$factor] contains too many sentences!\n"
+    if defined readline($streams[$factor]);
+  close $streams[$factor];
+}
+print STDERR "Done.\n";