mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 15:04:05 +03:00
fa31d83421
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1561 1f5c12ca-751b-0410-a591-d2e778427230
122 lines
3.3 KiB
Perl
Executable File
122 lines
3.3 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
|
|
# $Id$
|
|
# given a pathname to a factored corpus, a list of (numeric) factors to keep
|
|
# and a list of (labelled) factors to attach from factors/<basename>.label
|
|
# produces new corpus on stdout
|
|
|
|
use strict;
|
|
use warnings;
|
|
use Getopt::Long;
|
|
use IO::File;
|
|
use File::Basename;
|
|
|
|
binmode(STDIN, ":utf8");
|
|
binmode(STDOUT, ":utf8");
|
|
binmode(STDERR, ":utf8");
|
|
|
|
my $factordir = "factors";
|
|
GetOptions(
|
|
"factordir=s" => \$factordir,
|
|
);
|
|
my $corppathname = shift;
|
|
|
|
my @requested_factors = split /[\+,]/, join("+", @ARGV);
|
|
die "usage: reduce_combine.pl corpusfile 0 add_factor_label1 2 add_factor_label2 ..."
|
|
if !defined $corppathname || 0 == scalar @requested_factors;
|
|
|
|
my @addfactors = grep { ! /^[0-9]+$/ } @requested_factors;
|
|
# these are the labelled factors we need to load;
|
|
|
|
|
|
my $corp_stream;
|
|
if ($corppathname eq "-") {
|
|
$corp_stream=*STDIN;
|
|
|
|
die "Won't add factors to corpus coming from stdin." if scalar @addfactors;
|
|
} else {
|
|
$corp_stream = my_open($corppathname);
|
|
}
|
|
|
|
my $corpdn = dirname($corppathname);
|
|
my $corpbn = basename($corppathname);
|
|
$corpbn =~ s/\.gz$//;
|
|
my %streams = map {
|
|
my $fn = "$corpdn/$factordir/$corpbn.$_";
|
|
$fn .= ".gz" if ! -e $fn;
|
|
my $stream = my_open($fn);
|
|
die "Can't read '$fn'" if !defined $stream;
|
|
( $_, $stream ); # define a mapping factorlabel->stream
|
|
} @addfactors;
|
|
|
|
my $nr=0;
|
|
while (<$corp_stream>) {
|
|
$nr++;
|
|
print STDERR "." if $nr % 10000 == 0;
|
|
print STDERR "($nr)" if $nr % 100000 == 0;
|
|
chomp;
|
|
my @intokens = split / /;
|
|
# load lines of corresponding streams and ensure equal number of words
|
|
my %lines_of_extratoks;
|
|
foreach my $factor (keys %streams) {
|
|
my $line = readline($streams{$factor});
|
|
die "Additional factor file $factor contains too few sentences!"
|
|
if !defined $line;
|
|
chomp($line);
|
|
my @toks = split / /, $line;
|
|
die "Incompatible number of words in factor $factor on line $nr."
|
|
if $#toks != $#intokens;
|
|
$lines_of_extratoks{$factor} = \@toks;
|
|
}
|
|
|
|
# for every token, print the factors in the order as user wished
|
|
for(my $i=0; $i<=$#intokens; $i++) {
|
|
my $token = $intokens[$i];
|
|
my @outtoken = ();
|
|
my @factors = split /\|/, $token;
|
|
# print STDERR "Token: $token\n";
|
|
foreach my $name (@requested_factors) {
|
|
my $f = undef;
|
|
if ($name =~ /^[0-9]+$/o) {
|
|
# numeric factors should be copied from original corpus
|
|
$f = $factors[$name];
|
|
die "Missed factor $name in $token on line $nr"
|
|
if !defined $f || $f eq "";
|
|
} else {
|
|
# named factors should be obtained from the streams
|
|
$f = $lines_of_extratoks{$name}->[$i];
|
|
die "Missed factor $name on line $nr"
|
|
if !defined $f || $f eq "";
|
|
}
|
|
# print STDERR " Factor $name: $f\n";
|
|
push @outtoken, $f;
|
|
}
|
|
print " " if $i != 0;
|
|
print join("|", @outtoken);
|
|
}
|
|
print "\n";
|
|
}
|
|
close $corp_stream;
|
|
print STDERR "Done.\n";
|
|
|
|
|
|
sub my_open {
|
|
my $f = shift;
|
|
die "Not found: $f" if ! -e $f;
|
|
|
|
my $opn;
|
|
my $hdl;
|
|
my $ft = `file $f`;
|
|
# file might not recognize some files!
|
|
if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
|
|
$opn = "zcat $f |";
|
|
} elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
|
|
$opn = "bzcat $f |";
|
|
} else {
|
|
$opn = "$f";
|
|
}
|
|
open $hdl, $opn or die "Can't open '$opn': $!";
|
|
binmode $hdl, ":utf8";
|
|
return $hdl;
|
|
}
|