mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 23:58:15 +03:00
allows arbitrary mixing of 'kept' and 'added' factors in output
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@627 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
5a0787310b
commit
68ef1413cd
@ -4,6 +4,7 @@
|
||||
# produces new corpus on stdout
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Long;
|
||||
use IO::File;
|
||||
use File::Basename;
|
||||
@ -18,29 +19,24 @@ GetOptions(
|
||||
);
|
||||
my $corppathname = shift;
|
||||
|
||||
my @args = split /\+/, join("+", @ARGV);
|
||||
my $keepfactors = shift @args;
|
||||
my @addfactors = @args;
|
||||
die "usage: reduce_combine.pl corpusfile 0,1,2 add_factor_label1 add_factor_label2 ..."
|
||||
if !defined $corppathname || !defined $keepfactors;
|
||||
my @requested_factors = split /[\+,]/, join("+", @ARGV);
|
||||
die "usage: reduce_combine.pl corpusfile 0 add_factor_label1 2 add_factor_label2 ..."
|
||||
if !defined $corppathname || 0 == scalar @requested_factors;
|
||||
|
||||
my @addfactors = grep { ! /^[0-9]+$/ } @requested_factors;
|
||||
# the non-numeric entries are the labelled factors we need to load
|
||||
|
||||
my @keepfactors = ();
|
||||
if ($keepfactors =~ /^[0-9,]+$/) {
|
||||
# assume these are really factors to keep
|
||||
@keepfactors = split /,/, $keepfactors;
|
||||
} else {
|
||||
# assume there are no factors to keep, only added factors to output
|
||||
unshift @addfactors, $keepfactors;
|
||||
}
|
||||
|
||||
open CORP, $corppathname or die "Can't read $corppathname";
|
||||
binmode(CORP, ":utf8");
|
||||
|
||||
my $corpdn = dirname($corppathname);
|
||||
my $corpbn = basename($corppathname);
|
||||
my @streams = map {
|
||||
my %streams = map {
|
||||
my $fn = "$corpdn/$factordir/$corpbn.$_";
|
||||
IO::File->new($fn, "<:utf8") or die "Can't read '$fn'"
|
||||
my $stream = IO::File->new($fn, "<:utf8");
|
||||
die "Can't read '$fn'" if !defined $stream;
|
||||
( $_, $stream ); # define a mapping factorlabel->stream
|
||||
} @addfactors;
|
||||
|
||||
my $nr=0;
|
||||
@ -50,34 +46,45 @@ while (<CORP>) {
|
||||
print STDERR "($nr)" if $nr % 100000 == 0;
|
||||
chomp;
|
||||
my @intokens = split / /;
|
||||
my $extratokens = undef;
|
||||
for(my $i=0; $i<=$#streams; $i++) {
|
||||
my $line = readline($streams[$i]);
|
||||
die "Additional factor file $addfactors[$i] contains too few sentences!"
|
||||
# load lines of corresponding streams and ensure equal number of words
|
||||
my %lines_of_extratoks;
|
||||
foreach my $factor (keys %streams) {
|
||||
my $line = readline($streams{$factor});
|
||||
die "Additional factor file $factor contains too few sentences!"
|
||||
if !defined $line;
|
||||
chomp($line);
|
||||
my @extrafactors = split / /, $line;
|
||||
die "Incompatible number of words in factor $addfactors[$i] on line $nr."
|
||||
if $#extrafactors != $#intokens;
|
||||
for(my $j=0; $j<=$#extrafactors; $j++) {
|
||||
$extratokens->[$j]->[$i] = $extrafactors[$j];
|
||||
}
|
||||
my @toks = split / /, $line;
|
||||
die "Incompatible number of words in factor $factor on line $nr."
|
||||
if $#toks != $#intokens;
|
||||
$lines_of_extratoks{$factor} = \@toks;
|
||||
}
|
||||
my @outline = ();
|
||||
|
||||
# for every token, print the factors in the order the user requested
|
||||
for(my $i=0; $i<=$#intokens; $i++) {
|
||||
my $token = $intokens[$i];
|
||||
my @outtoken = ();
|
||||
my @factors = split /\|/, $token;
|
||||
foreach my $fid (@keepfactors) {
|
||||
my $f = @factors[$fid];
|
||||
die "Missed factor $fid in $token on line $nr"
|
||||
if !defined $f;
|
||||
push @outtoken, $factors[$fid];
|
||||
# print STDERR "Token: $token\n";
|
||||
foreach my $name (@requested_factors) {
|
||||
my $f = undef;
|
||||
if ($name =~ /^[0-9]+$/o) {
|
||||
# numeric factors should be copied from original corpus
|
||||
$f = $factors[$name];
|
||||
die "Missed factor $name in $token on line $nr"
|
||||
if !defined $f || $f eq "";
|
||||
} else {
|
||||
# named factors should be obtained from the streams
|
||||
$f = $lines_of_extratoks{$name}->[$i];
|
||||
die "Missed factor $name on line $nr"
|
||||
if !defined $f || $f eq "";
|
||||
}
|
||||
# print STDERR " Factor $name: $f\n";
|
||||
push @outtoken, $f;
|
||||
}
|
||||
push @outtoken, @{$extratokens->[$i]} if 0 < scalar @addfactors;
|
||||
push @outline, join("|", @outtoken);
|
||||
print " " if $i != 0;
|
||||
print join("|", @outtoken);
|
||||
}
|
||||
print join(" ", @outline)."\n";
|
||||
print "\n";
|
||||
}
|
||||
close CORP;
|
||||
print STDERR "Done.\n";
|
||||
|
Loading…
Reference in New Issue
Block a user