mosesdecoder/scripts/training/combine_factors_syntax.pl

#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id$
# given a list of files, combines them to a single corpus (sent to stdout)

use strict;
use warnings;
use Getopt::Long;
use IO::File;
use File::Basename;

# All console streams carry UTF-8 text.
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Every command-line argument names one corpus file.
my @addfactors = @ARGV;
die "usage: " . basename($0) . " corpusfile1 corpusfile2 ...\n"
  if 0 == scalar @addfactors;

# Open one UTF-8 read handle per file, in command-line order.
# Files ending in .gz are decompressed on the fly through zcat.
my @streams = map {
  my $fn = $_;
  my $stream;
  if ($fn =~ /\.gz$/) {
    # list-form pipe open: the name reaches zcat as one argument and is never
    # parsed by a shell, so spaces and metacharacters in it are harmless
    open($stream, '-|', 'zcat', $fn) or die "Can't run zcat on '$fn': $!";
  }
  elsif ($fn eq '-') {
    # "-" keeps meaning standard input, as it did with the two-argument open
    open($stream, '<&', \*STDIN) or die "Can't read standard input: $!";
  }
  else {
    # explicit read mode: the name is taken literally (no mode characters,
    # no whitespace stripping, no trailing-pipe commands)
    open($stream, '<', $fn) or die "Can't open '$fn': $!";
  }
  binmode($stream, ":utf8");
  $stream;
} @addfactors;

# Walk the first corpus file line by line.  For every sentence, read the
# matching line from each additional factor file and print the tokens as
# word|factor1|factor2..., re-inserting the XML markup found in the first file.
my $nr=0;
my $firststream = shift @streams;
shift @addfactors; # just to keep the lengths sync'ed
while (defined(my $sentence = readline($firststream))) {
  $nr++;
  # progress indicator on STDERR
  print STDERR "." if $nr % 10000 == 0;
  print STDERR "($nr)" if $nr % 100000 == 0;
  my ($intokens,$MARKUP) = split_xml($sentence);
  my $ntokens = scalar @$intokens;

  # load lines of corresponding streams and ensure equal number of words
  my @lines_of_extratoks;
  foreach my $factor (0..$#streams) {
    my $line = readline($streams[$factor]);
    die "Additional factor file $addfactors[$factor] contains too few sentences!"
      if !defined $line;
    chomp($line);
    # normalise whitespace so that splitting on a single space is exact
    $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//;
    my @toks = split / /, $line;
    # report real word counts (not last indices) together with the file name
    die "Incompatible number of words in factor $factor ($addfactors[$factor]) on line $nr. ("
        . scalar(@toks) . " != $ntokens)"
      if scalar(@toks) != $ntokens;
    $lines_of_extratoks[$factor] = \@toks;
  }

  # for every token, print the factors in the order as user wished
  for my $i (0..$#$intokens) {
    # exactly one separator between items: a markup string already ends in a
    # space of its own, so a space is added only when no markup precedes the token
    print " " if $i && $$MARKUP[$i] eq '';
    print $$MARKUP[$i];

    my @outtoken = ($$intokens[$i]); # the surface token comes first
    foreach my $factor (0..$#streams) {
      my $f = $lines_of_extratoks[$factor]->[$i];
      die "Missed factor value for word " . ($i+1) . " on line $nr in $addfactors[$factor]"
        if !defined $f || $f eq "";
      push @outtoken, $f;
    }
    print join("|", @outtoken);
  }
  # markup that follows the last token of the sentence
  print $$MARKUP[$#$MARKUP];
  print "\n";
}
close $firststream;

# A factor file that still has lines left does not line up with the first file;
# say so instead of silently dropping the surplus, then release the handle.
foreach my $factor (0..$#streams) {
  warn "Additional factor file $addfactors[$factor] contains more sentences than the first file; extra lines ignored.\n"
    if defined readline($streams[$factor]);
  close $streams[$factor];
}
print STDERR "Done.\n";

# Split one corpus line into its word tokens and the XML markup around them.
#
# Returns two array references (\@words, \@markup).  @markup always holds one
# element more than @words: $markup[$k] collects the tags seen before word $k,
# each followed by one space, and the final element holds the tags seen after
# the last word, with its last character (the trailing space) removed.
# A tag glued directly onto a word ending in '|' is taken to be a factor value
# and stays part of that word instead of becoming markup.
sub split_xml {
  my ($line) = @_;
  my @words;
  my @markup = ("");

  while ($line =~ /\S/) {
    # next item looks like an XML tag
    if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
      my ($tag, $rest) = ($1, $2);
      my $is_factor = $line =~ /^\S/ && @words > 0 && $words[-1] =~ /\|$/;
      if ($is_factor) {
        # keep the tag, and any '|' separators right after it, inside the word
        $words[-1] .= $tag;
        if ($rest =~ /^(\|+)(.*)$/) {
          $words[-1] .= $1;
          $rest = $2;
        }
      }
      else {
        $markup[-1] .= "$tag ";
      }
      $line = $rest;
      next;
    }

    # next item is a word: first try text free of '<' and '>', then fall back
    # to any non-blank run ('<' or '>' that does not form a tag)
    if ($line =~ /^\s*([^\s<>]+)(.*)$/ || $line =~ /^\s*(\S+)(.*)$/) {
      push @words, $1;
      push @markup, "";
      $line = $2;
      next;
    }

    die("ERROR: huh? $line\n");
  }

  chop($markup[-1]);
  return (\@words, \@markup);
}