Separate the XML-aware version of combine_factors.pl into its own script

The XML-aware version causes a slowdown for users who use factors but not XML (which is most users).
2024-12-27 22:14:57 +03:00 · 2015-07-28 23:27:59 +01:00 · 2015-07-28 23:27:59 +01:00 · 0173512ddc
commit 0173512ddc
parent 2cda286a06
2 changed files with 132 additions and 53 deletions
--- a/scripts/training/combine_factors.pl
+++ b/scripts/training/combine_factors.pl
@ -37,7 +37,9 @@ while (defined $_) {
  $nr++;
  print STDERR "." if $nr % 10000 == 0;
  print STDERR "($nr)" if $nr % 100000 == 0;
-  my ($intokens,$MARKUP) = split_xml($_);
+  chomp;
+  s/\s+/ /g; s/^ //; s/ $//;
+  my @intokens = split / /;
  # load lines of corresponding streams and ensure equal number of words
  my @lines_of_extratoks;
  foreach my $factor (0..$#streams) {
@ -47,17 +49,14 @@ while (defined $_) {
    chomp($line);
    $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//;
    my @toks = split / /, $line;
-    die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#$intokens)"
-      if $#toks != $#$intokens;
+    die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#intokens)"
+      if $#toks != $#intokens;
    $lines_of_extratoks[$factor] = \@toks;
  }

  # for every token, print the factors in the order as user wished
-  for(my $i=0; $i<=$#$intokens; $i++) {
-    print " " if $i && $$MARKUP[$i] eq '';
-    print $$MARKUP[$i];
-
-    my $token = $$intokens[$i];
+  for(my $i=0; $i<=$#intokens; $i++) {
+    my $token = $intokens[$i];
    my @outtoken = ();
    push @outtoken, $token; # add the first one
    # print STDERR "Token: $token\n";
@ -70,56 +69,11 @@ while (defined $_) {
    print " " if $i != 0;
    print join("|", @outtoken);
  }
-  print $$MARKUP[$#$MARKUP];
  print "\n";
  $_ = readline($firststream);
 }
 close $firststream;
 print STDERR "Done.\n";

-# store away xml markup
-sub split_xml {
-  my ($line) = @_;
-  my (@WORD,@MARKUP);
-  my $i = 0;
-  $MARKUP[0] = "";
-  while($line =~ /\S/) {
-    # XML tag
-    if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
-      my $potential_xml = $1;
-      my $line_next = $2;
-      # exception for factor that is an XML tag
-      if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
-	$WORD[$i-1] .= $potential_xml;
-	if ($line_next =~ /^(\|+)(.*)$/) {
-	  $WORD[$i-1] .= $1;
-	  $line_next = $2;
-	}
-      }
-      else {
-        $MARKUP[$i] .= $potential_xml." ";
-      }
-      $line = $line_next;
-    }
-    # non-XML text
-    elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
-      $WORD[$i++] = $1;
-      $MARKUP[$i] = "";
-      $line = $2;
-    }
-    # '<' or '>' occurs in word, but it's not an XML tag
-    elsif ($line =~ /^\s*(\S+)(.*)$/) {
-      $WORD[$i++] = $1;
-      $MARKUP[$i] = "";
-      $line = $2;
-      }
-    else {
-      die("ERROR: huh? $line\n");
-    }
-  }
-  chop($MARKUP[$#MARKUP]);
-  return (\@WORD,\@MARKUP);
-}
-


--- a/scripts/training/combine_factors_syntax.pl
+++ b/scripts/training/combine_factors_syntax.pl
@ -0,0 +1,125 @@
+#!/usr/bin/env perl
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+# $Id$
+# given a list of files, combines them to a single corpus (sent to stdout)
+
+use strict;
+use warnings;
+use Getopt::Long;
+use IO::File;
+use File::Basename;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+my @addfactors = @ARGV; # every command-line argument is a corpus file; the first one supplies the base words
+die "usage: combine_factors.pl corpusfile1 corpusfile2 ..."
+  if 0 == scalar @addfactors;
+
+my @streams = map {
+  my $fn = $_;
+  my $opn = ($fn =~ /\.gz$/ ? "zcat $fn |" : "$fn"); # names ending in .gz are read through a "zcat" pipe
+  my $stream = new IO::File;
+  $stream->open($opn) or die "Can't open '$opn'"; # NOTE(review): single-argument open; presumably the name is parsed like Perl's two-argument open - confirm before passing untrusted names
+  binmode($stream, ":utf8");
+  $stream;
+} @addfactors; # one UTF-8 read handle per input file, in argument order
+
+my $nr=0; # number of lines read from the first file so far
+my $firststream = shift @streams;
+shift @addfactors; # drop the first file name too, so $addfactors[$k] still names $streams[$k]
+$_ = readline($firststream);
+while (defined $_) {
+  $nr++;
+  print STDERR "." if $nr % 10000 == 0; # progress dot every 10000 lines
+  print STDERR "($nr)" if $nr % 100000 == 0; # running line count every 100000 lines
+  my ($intokens,$MARKUP) = split_xml($_); # two array refs: the words of the line, and the XML markup around them
+  # read the matching line from every additional factor file and require the same number of tokens
+  my @lines_of_extratoks;
+  foreach my $factor (0..$#streams) {
+    my $line = readline($streams[$factor]);
+    die "Additional factor file $addfactors[$factor] contains too few sentences!"
+      if !defined $line;
+    chomp($line);
+    $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//; # collapse whitespace runs, then trim both ends
+    my @toks = split / /, $line;
+    die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#$intokens)"
+      if $#toks != $#$intokens;
+    $lines_of_extratoks[$factor] = \@toks;
+  }
+
+  # for each token: emit the markup stored for its position, then the word and its factors joined with "|"
+  for(my $i=0; $i<=$#$intokens; $i++) {
+    print "" if $i && $$MARKUP[$i] eq ''; # NOTE(review): prints the empty string, so this statement emits nothing
+    print $$MARKUP[$i];
+
+    my $token = $$intokens[$i];
+    my @outtoken = ();
+    push @outtoken, $token; # the word from the first file always comes first
+    # print STDERR "Token: $token\n";
+    foreach my $factor (0..$#streams) {
+      my $f = $lines_of_extratoks[$factor]->[$i];
+      die "Missed factor value for word $i+1 on line $nr in $addfactors[$factor]" # NOTE(review): "$i+1" is interpolated, not evaluated (message shows e.g. "0+1")
+        if !defined $f || $f eq "";
+      push @outtoken, $f;
+    }
+    print " " if $i != 0;
+    print join("|", @outtoken);
+  }
+  print $$MARKUP[$#$MARKUP]; # markup collected after the last word
+  print "\n";
+  $_ = readline($firststream);
+}
+close $firststream;
+print STDERR "Done.\n";
+
+# split_xml($line): store away xml markup. Returns (\@WORD, \@MARKUP); $MARKUP[$i] holds the tags found before word $i, and the final entry holds the tags after the last word.
+sub split_xml {
+  my ($line) = @_;
+  my (@WORD,@MARKUP);
+  my $i = 0; # number of words stored so far, i.e. the index of the next word
+  $MARKUP[0] = "";
+  while($line =~ /\S/) { # consume $line from the left until only whitespace remains
+    # XML tag: "<", a non-space character, then everything up to the first ">"
+    if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
+      my $potential_xml = $1;
+      my $line_next = $2;
+      # exception for factor that is an XML tag: the tag directly follows (no whitespace) a word ending in "|"
+      if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
+	$WORD[$i-1] .= $potential_xml;
+	if ($line_next =~ /^(\|+)(.*)$/) {
+	  $WORD[$i-1] .= $1;
+	  $line_next = $2;
+	}
+      }
+      else {
+        $MARKUP[$i] .= $potential_xml." "; # real markup: stored with a trailing space
+      }
+      $line = $line_next;
+    }
+    # non-XML text: a run of characters containing no whitespace, "<" or ">"
+    elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
+      $WORD[$i++] = $1;
+      $MARKUP[$i] = "";
+      $line = $2;
+    }
+    # '<' or '>' occurs in word, but it's not an XML tag: take the whole whitespace-delimited run as one word
+    elsif ($line =~ /^\s*(\S+)(.*)$/) {
+      $WORD[$i++] = $1;
+      $MARKUP[$i] = "";
+      $line = $2;
+      }
+    else {
+      die("ERROR: huh? $line\n");
+    }
+  }
+  chop($MARKUP[$#MARKUP]); # drop the last character of the final entry (the trailing space, if a tag was stored there)
+  return (\@WORD,\@MARKUP);
+}
+
+
+