mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
Split the XML-aware version of combine_factors.pl into a separate script
The XML-aware version slows things down for users who use factors but not XML (which is most users)
This commit is contained in:
parent
2cda286a06
commit
0173512ddc
@ -37,7 +37,9 @@ while (defined $_) {
|
||||
$nr++;
|
||||
print STDERR "." if $nr % 10000 == 0;
|
||||
print STDERR "($nr)" if $nr % 100000 == 0;
|
||||
my ($intokens,$MARKUP) = split_xml($_);
|
||||
chomp;
|
||||
s/\s+/ /g; s/^ //; s/ $//;
|
||||
my @intokens = split / /;
|
||||
# load lines of corresponding streams and ensure equal number of words
|
||||
my @lines_of_extratoks;
|
||||
foreach my $factor (0..$#streams) {
|
||||
@ -47,17 +49,14 @@ while (defined $_) {
|
||||
chomp($line);
|
||||
$line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//;
|
||||
my @toks = split / /, $line;
|
||||
die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#$intokens)"
|
||||
if $#toks != $#$intokens;
|
||||
die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#intokens)"
|
||||
if $#toks != $#intokens;
|
||||
$lines_of_extratoks[$factor] = \@toks;
|
||||
}
|
||||
|
||||
# for every token, print the factors in the order as user wished
|
||||
for(my $i=0; $i<=$#$intokens; $i++) {
|
||||
print " " if $i && $$MARKUP[$i] eq '';
|
||||
print $$MARKUP[$i];
|
||||
|
||||
my $token = $$intokens[$i];
|
||||
for(my $i=0; $i<=$#intokens; $i++) {
|
||||
my $token = $intokens[$i];
|
||||
my @outtoken = ();
|
||||
push @outtoken, $token; # add the first one
|
||||
# print STDERR "Token: $token\n";
|
||||
@ -70,56 +69,11 @@ while (defined $_) {
|
||||
print " " if $i != 0;
|
||||
print join("|", @outtoken);
|
||||
}
|
||||
print $$MARKUP[$#$MARKUP];
|
||||
print "\n";
|
||||
$_ = readline($firststream);
|
||||
}
|
||||
close $firststream;
|
||||
print STDERR "Done.\n";
|
||||
|
||||
# store away xml markup
|
||||
# Split one corpus line into its words and the XML markup around them.
#
# Argument: the raw line (a trailing newline is tolerated).
# Returns:  two array references, (\@WORD, \@MARKUP).
#   @MARKUP always has exactly one element more than @WORD.
#   $MARKUP[$k] holds the tags found in front of $WORD[$k], every tag being
#   followed by one blank; the last element holds the tags behind the final
#   word, with its trailing blank removed. Elements without tags are "".
# Dies if the remaining text cannot be matched by any of the patterns.
sub split_xml {
  my ($line) = @_;
  my (@WORD,@MARKUP);
  my $i = 0;
  # true while the piece consumed last belongs to $WORD[$i-1]
  my $in_word = 0;
  $MARKUP[0] = "";
  while($line =~ /\S/) {
    # the upcoming text is glued (no whitespace) to the word before it
    my $glued = $in_word && $line =~ /^\S/;

    # XML tag
    if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
      my $potential_xml = $1;
      my $line_next = $2;
      # exception for factor that is an XML tag
      if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
        $WORD[$i-1] .= $potential_xml;
        # factor separators right behind the tag belong to the word, too
        if ($line_next =~ /^(\|+)(.*)$/) {
          $WORD[$i-1] .= $1;
          $line_next = $2;
        }
        $in_word = 1;
      }
      else {
        $MARKUP[$i] .= $potential_xml." ";
        $in_word = 0;
      }
      $line = $line_next;
    }
    # non-XML text, or a '<' / '>' that occurs in a word but is not an XML tag
    elsif ($line =~ /^\s*([^\s<>]+)(.*)$/ || $line =~ /^\s*(\S+)(.*)$/) {
      my ($text, $line_next) = ($1, $2);
      if ($glued) {
        # still the same whitespace-delimited token: keep it in one piece so
        # the word count agrees with the whitespace-split factor files
        $WORD[$i-1] .= $text;
      }
      else {
        $WORD[$i++] = $text;
        $MARKUP[$i] = "";
      }
      $in_word = 1;
      $line = $line_next;
    }
    else {
      die("ERROR: huh? $line\n");
    }
  }
  # every stored tag carries a trailing blank; drop it from the final element
  chop($MARKUP[$#MARKUP]);
  return (\@WORD,\@MARKUP);
}
|
||||
|
||||
|
||||
|
||||
|
125
scripts/training/combine_factors_syntax.pl
Executable file
125
scripts/training/combine_factors_syntax.pl
Executable file
@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
||||
# Public License version 2.1 or, at your option, any later version.
|
||||
|
||||
# $Id$
|
||||
# given a list of files, combines them to a single corpus (sent to stdout)
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Long;
|
||||
use IO::File;
|
||||
use File::Basename;
|
||||
|
||||
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Every command-line argument is a corpus file. The first file supplies the
# words (it may contain XML markup); each further file supplies one
# additional factor per word.
my @addfactors = @ARGV;
die "usage: " . basename($0) . " corpusfile1 corpusfile2 ...\n"
  if 0 == scalar @addfactors;

# Open one input stream per corpus file; *.gz files are read through zcat.
my @streams = map {
  my $fn = $_;
  my $stream;
  if ($fn =~ /\.gz$/) {
    # list-form pipe open: the file name is handed to zcat directly and is
    # never interpreted by a shell
    open($stream, '-|', 'zcat', $fn) or die "Can't open 'zcat $fn |': $!";
  }
  else {
    # single-argument IO::File open is kept as it was, so special names
    # understood by Perl's two-argument open keep working
    $stream = IO::File->new;
    $stream->open($fn) or die "Can't open '$fn': $!";
  }
  binmode($stream, ":utf8");
  $stream;
} @addfactors;

my $nr=0;
my $firststream = shift @streams;
shift @addfactors; # just to keep the lengths sync'ed
while (defined(my $inline = readline($firststream))) {
  $nr++;
  # progress report on STDERR
  print STDERR "." if $nr % 10000 == 0;
  print STDERR "($nr)" if $nr % 100000 == 0;
  my ($intokens,$MARKUP) = split_xml($inline);

  # load lines of corresponding streams and ensure equal number of words
  my @lines_of_extratoks;
  foreach my $factor (0..$#streams) {
    my $line = readline($streams[$factor]);
    die "Additional factor file $addfactors[$factor] contains too few sentences!"
      if !defined $line;
    chomp($line);
    $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//;
    my @toks = split / /, $line;
    die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#$intokens)"
      if $#toks != $#$intokens;
    $lines_of_extratoks[$factor] = \@toks;
  }

  # for every token, collect the factors in the order as user wished;
  # markup and tokens are gathered first and joined with single blanks, so a
  # tag is never glued to the preceding token nor followed by two blanks
  my @out;
  for my $i (0..$#$intokens) {
    # markup in front of this word is stored with a trailing blank
    (my $markup = $$MARKUP[$i]) =~ s/ $//;
    push @out, $markup if $markup ne '';

    my @outtoken = ($$intokens[$i]); # add the first one
    foreach my $factor (0..$#streams) {
      my $f = $lines_of_extratoks[$factor]->[$i];
      die "Missed factor value for word ".($i+1)." on line $nr in $addfactors[$factor]"
        if !defined $f || $f eq "";
      push @outtoken, $f;
    }
    push @out, join("|", @outtoken);
  }
  # markup behind the last word (or the only markup of a line without words)
  push @out, $$MARKUP[$#$MARKUP] if $$MARKUP[$#$MARKUP] ne '';
  print join(" ", @out), "\n";
}
close $firststream;
print STDERR "Done.\n";
|
||||
|
||||
# store away xml markup
|
||||
# Split one corpus line into its words and the XML markup around them.
#
# Argument: the raw line (a trailing newline is tolerated).
# Returns:  two array references, (\@WORD, \@MARKUP).
#   @MARKUP always has exactly one element more than @WORD.
#   $MARKUP[$k] holds the tags found in front of $WORD[$k], every tag being
#   followed by one blank; the last element holds the tags behind the final
#   word, with its trailing blank removed. Elements without tags are "".
# Dies if the remaining text cannot be matched by any of the patterns.
sub split_xml {
  my ($line) = @_;
  my (@WORD,@MARKUP);
  my $i = 0;
  # true while the piece consumed last belongs to $WORD[$i-1]
  my $in_word = 0;
  $MARKUP[0] = "";
  while($line =~ /\S/) {
    # the upcoming text is glued (no whitespace) to the word before it
    my $glued = $in_word && $line =~ /^\S/;

    # XML tag
    if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
      my $potential_xml = $1;
      my $line_next = $2;
      # exception for factor that is an XML tag
      if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
        $WORD[$i-1] .= $potential_xml;
        # factor separators right behind the tag belong to the word, too
        if ($line_next =~ /^(\|+)(.*)$/) {
          $WORD[$i-1] .= $1;
          $line_next = $2;
        }
        $in_word = 1;
      }
      else {
        $MARKUP[$i] .= $potential_xml." ";
        $in_word = 0;
      }
      $line = $line_next;
    }
    # non-XML text, or a '<' / '>' that occurs in a word but is not an XML tag
    elsif ($line =~ /^\s*([^\s<>]+)(.*)$/ || $line =~ /^\s*(\S+)(.*)$/) {
      my ($text, $line_next) = ($1, $2);
      if ($glued) {
        # still the same whitespace-delimited token: keep it in one piece so
        # the word count agrees with the whitespace-split factor files
        $WORD[$i-1] .= $text;
      }
      else {
        $WORD[$i++] = $text;
        $MARKUP[$i] = "";
      }
      $in_word = 1;
      $line = $line_next;
    }
    else {
      die("ERROR: huh? $line\n");
    }
  }
  # every stored tag carries a trailing blank; drop it from the final element
  chop($MARKUP[$#MARKUP]);
  return (\@WORD,\@MARKUP);
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user