mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
2a88fd0730
It should be sufficient to add a line like ttable-binarizer = "/home/bhaddow/moses/dist/bin/processPhraseTableMin" to your EMS config, and everything else will be taken care of. You can pass other arguments to processPhraseTableMin, for example for threading, by putting them inside the quotes. Note that this is not fully tested, since there are currently some issues with the compact phrase table introduced by the sparse feature merge.
368 lines
11 KiB
Perl
Executable File
368 lines
11 KiB
Perl
Executable File
#!/usr/bin/perl -w
|
|
|
|
# $Id$
|
|
# Given a moses.ini file and an input text prepare minimized translation
|
|
# tables and a new moses.ini, so that loading of tables is much faster.
|
|
|
|
# original code by Philipp Koehn
|
|
# changes by Ondrej Bojar
|
|
# adapted for hierarchical models by Phil Williams
|
|
|
|
use strict;
|
|
|
|
use FindBin qw($RealBin);
|
|
use Getopt::Long;
|
|
|
|
# Locate the root of the scripts directory.
# An explicit SCRIPTS_ROOTDIR environment variable wins.  Otherwise the
# directory of this script is used (minus a trailing "/training") and is
# exported through %ENV so that child processes see the same value.
my $SCRIPTS_ROOTDIR;
if (defined($ENV{"SCRIPTS_ROOTDIR"})) {
    $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"};
} else {
    $SCRIPTS_ROOTDIR = $RealBin;
    if ($SCRIPTS_ROOTDIR eq '') {
        # dirname() is provided by File::Basename, which this script does not
        # import at the top; load it on demand and call it fully qualified so
        # this fallback no longer dies with "Undefined subroutine".
        require File::Basename;
        $SCRIPTS_ROOTDIR = File::Basename::dirname(__FILE__);
    }
    $SCRIPTS_ROOTDIR =~ s/\/training$//;
    $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR;
}
|
|
|
|
# Only phrases of up to $MAX_LENGTH words are collected from the input;
# in other words, all phrase-tables will be truncated at least to 10 words
# per phrase.
my $MAX_LENGTH = 10;

# utilities: command used to stream gzip-compressed files
my $ZCAT = "gzip -cd";

# optional parameters and their defaults
my ($binarizer, $opt_min_non_initial_rule_count);   # stay undefined unless given
my $opt_hierarchical = 0;
my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)

# option name => variable that receives the value
my @option_spec = (
    "gzip!"                    => \$opt_gzip,
    "Hierarchical"             => \$opt_hierarchical,
    "Binarizer=s"              => \$binarizer,
    "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count,
);
GetOptions(@option_spec) or exit(1);
|
|
|
|
# Positional command line parameters, taken off the front of @ARGV:
# target directory, moses.ini to filter, and the input text.
my ($dir, $config, $input) = splice(@ARGV, 0, 3);

# All three are mandatory; otherwise print the usage line and give up.
unless (defined $dir && defined $config && defined $input) {
    print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical]\n";
    exit 1;
}

# Work with an absolute target directory from here on.
$dir = ensure_full_path($dir);
|
|
|
|
# buggy directory in place?
# A target directory without an "info" file cannot be verified for re-use,
# so the user is asked to remove it.
if (-d $dir && ! -e "$dir/info") {
    print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n";
    exit(1);
}

# already filtered? check if it can be re-used
# The info file holds the config path on its first line and the input path
# on its second line.
if (-d $dir) {
    my $info_file = "$dir/info";

    # Read the file directly instead of shelling out to `cat`, so directory
    # names with spaces or shell metacharacters are handled correctly.
    my $info_fh;
    if (!open($info_fh, '<', $info_file)) {
        print STDERR "Can't read $info_file: $!\n";
        exit 1;
    }
    my @INFO = <$info_fh>;
    close($info_fh);

    # chomp only strips a trailing newline (chop would eat a real character
    # if the last line were not newline-terminated).
    chomp(@INFO);

    # A truncated info file is treated as a mismatch rather than producing
    # "uninitialized value" warnings.
    my $old_config = defined($INFO[0]) ? $INFO[0] : "";
    my $old_input  = defined($INFO[1]) ? $INFO[1] : "";

    if ($old_config ne $config
        || ($old_input ne $input &&
            $old_input.".tagged" ne $input)) {
        print STDERR "WARNING: directory exists but does not match parameters:\n";
        print STDERR " ($old_config ne $config || $old_input ne $input)\n";
        exit 1;
    }
    print STDERR "The filtered model was ready in $dir, not doing anything.\n";
    exit 0;
}
|
|
|
|
|
|
# filter the translation and distortion tables
safesystem("mkdir -p $dir") or die "Can't mkdir $dir";

# get tables to be filtered (and modify config file)
#
# The original moses.ini is copied line by line to $dir/moses.ini.  The
# entries that follow a [ttable-file] or [distortion-file] header are
# rewritten to point at the filtered copies inside $dir, and every table that
# has to be filtered is recorded in the parallel arrays below:
#   @TABLE            original table file
#   @TABLE_FACTORS    source factor specification of the table
#   @TABLE_NEW_NAME   file name of the filtered table
#   @TABLE_WEIGHTS    fourth field of the ttable entry (passed as -nscores)
#   %KNOWN_TTABLE     indices into @TABLE that are translation tables
#   %CONSIDER_FACTORS set of source factor specifications seen
my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%KNOWN_TTABLE,@TABLE_WEIGHTS,%TABLE_NUMBER);
my %new_name_used = ();
open(my $ini_out, '>', "$dir/moses.ini") or die "Can't write $dir/moses.ini";
open(my $ini_in, '<', $config) or die "Can't read $config";
while (my $ini_line = <$ini_in>) {
    print {$ini_out} $ini_line;
    if ($ini_line =~ /ttable-file\]/) {
        while (1) {
            my $table_spec = <$ini_in>;
            # The section may be the last thing in the file.
            last if !defined($table_spec);
            if ($table_spec !~ /^(\d+) ([\d\,\-]+) ([\d\,\-]+) (\d+) (\S+)( \S+)?$/) {
                # First line that is not a table entry: end of the section.
                print {$ini_out} $table_spec;
                last;
            }
            my ($phrase_table_impl,$source_factor,$t,$w,$file,$table_flag) = ($1,$2,$3,$4,$5,$6);
            $table_flag = "" if (!defined($table_flag));

            if (($phrase_table_impl ne "0" && $phrase_table_impl ne "6") || $file =~ /glue-grammar/) {
                # Only Memory ("0") and NewFormat ("6") can be filtered.
                print {$ini_out} $table_spec;
                next;
            }

            chomp($file);
            push @TABLE, $file;
            push @TABLE_WEIGHTS,$w;
            $KNOWN_TTABLE{$#TABLE}++;

            # Build a unique name for the filtered table.
            my $new_name = "$dir/phrase-table.$source_factor-$t.".(++$TABLE_NUMBER{"$source_factor-$t"});
            my $cnt = 1;
            $cnt ++ while (defined $new_name_used{"$new_name.$cnt"});
            $new_name .= ".$cnt";
            $new_name_used{$new_name} = 1;

            # The implementation id written to the new config depends on
            # whether (and with which tool) the table will be binarized.
            if ($binarizer && $phrase_table_impl == 6) {
                print {$ini_out} "2 $source_factor $t $w $new_name.bin$table_flag\n";
            }
            elsif ($binarizer && $phrase_table_impl == 0) {
                if ($binarizer =~ /processPhraseTableMin/) {
                    print {$ini_out} "12 $source_factor $t $w $new_name$table_flag\n";
                } else {
                    print {$ini_out} "1 $source_factor $t $w $new_name$table_flag\n";
                }
            } else {
                $new_name .= ".gz" if $opt_gzip;
                print {$ini_out} "$phrase_table_impl $source_factor $t $w $new_name$table_flag\n";
            }
            push @TABLE_NEW_NAME,$new_name;

            $CONSIDER_FACTORS{$source_factor} = 1;
            print STDERR "Considering factor $source_factor\n";
            push @TABLE_FACTORS, $source_factor;
        }
    }
    elsif ($ini_line =~ /distortion-file/) {
        while (1) {
            my $table_spec = <$ini_in>;
            # The section may be the last thing in the file.
            last if !defined($table_spec);
            if ($table_spec !~ /^([\d\,\-]+) (\S+) (\d+) (\S+)$/) {
                # First line that is not a table entry: end of the section.
                print {$ini_out} $table_spec;
                last;
            }
            my ($factors,$t,$w,$file) = ($1,$2,$3,$4);
            # Keep only the part before the "-<target factors>" suffix.
            my $source_factor = $factors;
            $source_factor =~ s/\-[\d,]+$//;

            chomp($file);
            push @TABLE,$file;

            # The filtered table keeps the base name of the original file.
            $file =~ s/^.*\/+([^\/]+)/$1/g;
            my $new_name = "$dir/$file";
            # Drop a trailing ".gz" only; an unanchored substitution would
            # hit the first ".gz" anywhere in the path, including inside $dir.
            $new_name =~ s/\.gz$//;
            print {$ini_out} "$factors $t $w $new_name\n";
            push @TABLE_NEW_NAME,$new_name;

            $CONSIDER_FACTORS{$source_factor} = 1;
            print STDERR "Considering factor $source_factor\n";
            push @TABLE_FACTORS,$source_factor;
        }
    }
}
close($ini_in);
# Buffered write errors only surface at close time.
close($ini_out) or die "Can't write $dir/moses.ini";
|
|
|
|
# Maps a source factor specification to the temporary, factor-reduced copy
# of the input text (only filled in hierarchical mode).
my %TMP_INPUT_FILENAME;

if ($opt_hierarchical)
{
    # Write a separate, temporary input file for each combination of source
    # factors
    foreach my $key (keys %CONSIDER_FACTORS) {
        my $filename = "$dir/input-$key";
        open(my $reduced_out, '>', $filename) or die "Can't open $filename for writing";
        $TMP_INPUT_FILENAME{$key} = $filename;
        my @FACTOR = split(/,/, $key);

        # Run reduce_combine.pl without going through a shell, so input
        # paths containing spaces or metacharacters are passed intact.
        my @reduce_cmd = ("$SCRIPTS_ROOTDIR/training/reduce_combine.pl", $input, @FACTOR);
        open(my $reduce_pipe, '-|', @reduce_cmd) or die "Can't run @reduce_cmd: $!";
        while (my $line = <$reduce_pipe>) {
            print {$reduced_out} $line;
        }
        # Closing a pipe waits for the child and reports its exit status; a
        # failed run must not silently leave an empty input file behind.
        close($reduce_pipe) or die "Command failed (status $?): @reduce_cmd";
        close($reduced_out) or die "Can't write $filename";
    }
}
|
|
|
|
# $PHRASE_USED{factor spec}{phrase} counts every phrase of the input text
# (only filled in non-hierarchical mode).
my %PHRASE_USED;
unless ($opt_hierarchical) {
    # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
    open(my $input_fh, mk_open_string($input)) or die "Can't read $input";
    while (my $sentence = <$input_fh>) {
        chomp($sentence);
        my @WORD = split(/ +/, $sentence);

        # every start position ...
        foreach my $start (0 .. $#WORD) {
            # ... combined with every phrase length up to $MAX_LENGTH words
            foreach my $extra (0 .. $MAX_LENGTH - 1) {
                my $end = $start + $extra;
                last if $end > $#WORD;

                foreach my $factor_key (keys %CONSIDER_FACTORS) {
                    my @FACTOR = split(/,/, $factor_key);
                    my $phrase = "";
                    foreach my $word (@WORD[$start .. $end]) {
                        my @WORD_FACTOR = split(/\|/, $word);
                        # keep only the requested factors, "|"-separated
                        foreach my $factor (@FACTOR) {
                            $phrase .= $WORD_FACTOR[$factor]."|";
                        }
                        chop($phrase);      # drop the trailing "|"
                        $phrase .= " ";
                    }
                    chop($phrase);          # drop the trailing space
                    $PHRASE_USED{$factor_key}{$phrase}++;
                }
            }
        }
    }
    close($input_fh);
}
|
|
|
|
# Build the string handed to a two-argument open() for reading a file.
# A gzip-compressed file (either named *.gz, or present as "<name>.gz" next
# to a name without that suffix) is read through $ZCAT.  In hierarchical
# mode a plain file is read through "cat" so the result is always a pipe;
# otherwise it is opened directly.
sub mk_open_string {
    my ($path) = @_;
    my $is_gzipped = ($path =~ /\.gz$/);

    return "$ZCAT $path.gz |" if !$is_gzipped && -e "$path.gz";
    return "$ZCAT $path |"    if $is_gzipped;
    return "cat $path |"      if $opt_hierarchical;
    return "< $path";
}
|
|
|
|
|
|
# filter files
# Every table recorded in @TABLE is copied to its new name, keeping only the
# entries relevant for the input, and is binarized afterwards if a binarizer
# was given.
for my $i (0 .. $#TABLE) {
    my ($used,$total) = (0,0);
    my $file = $TABLE[$i];
    my $factors = $TABLE_FACTORS[$i];
    my $new_file = $TABLE_NEW_NAME[$i];
    print STDERR "filtering $file -> $new_file...\n";

    my $openstring = mk_open_string($file);

    # Output goes through gzip when the new name ends in ".gz".
    my $new_openstring;
    if ($new_file =~ /\.gz$/) {
        $new_openstring = "| gzip -c > $new_file";
    } else {
        $new_openstring = ">$new_file";
    }

    open(my $file_out, $new_openstring) or die "Can't write to $new_openstring";

    if ($opt_hierarchical) {
        # In hierarchical mode $openstring is always a command ending in "|",
        # so the table is piped into the external rule-table filter.
        my $tmp_input = $TMP_INPUT_FILENAME{$factors};
        my $options = "";
        $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
        my $filter_cmd = "$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |";
        open(my $filter_pipe, $filter_cmd) or die "Can't run '$filter_cmd'";
        while (my $line = <$filter_pipe>) {
            print {$file_out} $line;
        }
        # Close the pipe that was actually opened (this used to close an
        # unrelated handle) and notice a failing filter instead of silently
        # keeping a truncated table.
        close($filter_pipe) or die "Command failed (status $?): $filter_cmd";
    } else {
        open(my $file_in, $openstring) or die "Can't open '$openstring'";
        while (my $entry = <$file_in>) {
            # The source side is everything before the first " ||| ".
            my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
            $foreign =~ s/ $//;
            if (defined($PHRASE_USED{$factors}{$foreign})) {
                print {$file_out} $entry;
                $used++;
            }
            $total++;
        }
        close($file_in);
        die "No phrases found in $file!" if $total == 0;
        printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
    }

    # Finish the filtered file before any binarizer reads it; write errors
    # (including a failing gzip) only surface at close time.
    close($file_out) or die "Can't finish writing $new_file";

    if(defined($binarizer)) {
        print STDERR "binarizing...";
        # translation model
        if ($KNOWN_TTABLE{$i}) {
            # ... hierarchical translation model
            if ($opt_hierarchical) {
                my $cmd = "$binarizer $new_file $new_file.bin";
                print STDERR $cmd."\n";
                print STDERR `$cmd`;
            }
            # ... phrase translation model
            elsif ($binarizer =~ /processPhraseTableMin/) {
                #compact phrase table
                my $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
                print STDERR $cmd."\n";
                print STDERR `$cmd`;
            } else {
                my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
                print STDERR $cmd."\n";
                print STDERR `$cmd`;
            }
        }
        # reordering model
        else {
            # The lexical-table tool name is derived from the phrase-table
            # binarizer name.
            my $lexbin = $binarizer;
            $lexbin =~ s/PhraseTable/LexicalTable/;
            my $cmd;
            if ($lexbin =~ /processLexicalTableMin/) {
                $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
            } else {
                $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
                $cmd = "$lexbin -in $new_file -out $new_file";
            }
            print STDERR $cmd."\n";
            print STDERR `$cmd`;
        }
    }
}
|
|
|
|
if ($opt_hierarchical)
{
    # Remove the temporary input files
    unlink values %TMP_INPUT_FILENAME;
}

# Record the parameters of this run (config on the first line, input on the
# second).  The file marks the directory as a complete filtered model that a
# later run may re-use, so failing to write it must not go unnoticed.
open(my $info_fh, '>', "$dir/info") or die "Can't write $dir/info: $!";
print {$info_fh} "$config\n$input\n" or die "Can't write $dir/info: $!";
close($info_fh) or die "Can't write $dir/info: $!";


print "To run the decoder, please call:
moses -f $dir/moses.ini -i $input\n";
|
|
|
|
# Run a command through system() and log it on STDERR.
# Exits the whole script with status 1 when the command could not be started
# or was killed by a signal.  Otherwise returns true if the command exited
# with status 0 and false if not (the exit code is then reported on STDERR).
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    my $status = $?;

    # the command could not be executed at all
    if ($status == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }

    # the low seven bits hold the terminating signal, bit 128 the core flag
    my $signal = $status & 127;
    if ($signal) {
        printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
            $signal, ($status & 128) ? 'with' : 'without';
        exit(1);
    }

    # normal termination: the exit code lives in the high byte
    my $exitcode = $status >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
}
|
|
# Turn a possibly relative path into a normalized absolute one.
# A path that already starts with "/" is returned untouched.  Otherwise the
# current working directory is prepended and "." components, duplicate
# slashes, "name/.." pairs and trailing slashes are removed textually (no
# symlinks are resolved).
sub ensure_full_path {
    my $PATH = shift;
    return $PATH if $PATH =~ /^\//;

    # NOTE(review): `pawd` is presumably the automounter-aware variant of
    # pwd; plain `pwd` is the fallback when it prints nothing.
    my $dir = `pawd 2>/dev/null`;
    if (!$dir) {$dir = `pwd`;}
    chomp $dir;
    $PATH = $dir."/".$PATH;
    $PATH =~ s/[\r\n]//g;

    # Remove "." components.  Repeat until nothing changes: matches of a
    # single /g pass cannot overlap, so "a/././b" would keep one "/./".
    1 while $PATH =~ s/\/\.\//\//;
    $PATH =~ s/\/\.$//;         # ... including a trailing "/."
    $PATH =~ s/\/+/\//g;

    # Resolve "name/../" pairs; the counter bounds the loop for paths that
    # cannot be reduced any further (e.g. a leading "/../").
    my $sanity = 0;
    while($PATH =~ /\/\.\.\// && $sanity++<10) {
        $PATH =~ s/\/+/\//g;
        $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
    }
    $PATH =~ s/\/[^\/]+\/\.\.$//;
    $PATH =~ s/\/+$//;
    return $PATH;
}
|
|
|