mosesdecoder/scripts/training/filter-model-given-input.pl
Barry Haddow 2a88fd0730 Support for compact phrase table in EMS
It should be sufficient to add a line like
ttable-binarizer = "/home/bhaddow/moses/dist/bin/processPhraseTableMin"
to your EMS config, and everything else will be taken care of. You can
add other arguments to the processPhraseTableMin, for example for
threading, by putting them in the quotes.

Note that this is not fully tested, since there are currently some
issues with the compact phrase table introduced by the sparse feature
merge.
2012-11-16 15:07:07 +00:00

368 lines
11 KiB
Perl
Executable File

#!/usr/bin/perl -w
# $Id$
# Given a moses.ini file and an input text prepare minimized translation
# tables and a new moses.ini, so that loading of tables is much faster.
# original code by Philipp Koehn
# changes by Ondrej Bojar
# adapted for hierarchical models by Phil Williams
use strict;
use FindBin qw($RealBin);
use Getopt::Long;
my $SCRIPTS_ROOTDIR;
if (defined($ENV{"SCRIPTS_ROOTDIR"})) {
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"};
} else {
$SCRIPTS_ROOTDIR = $RealBin;
if ($SCRIPTS_ROOTDIR eq '') {
$SCRIPTS_ROOTDIR = dirname(__FILE__);
}
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR;
}
# consider phrases in input up to $MAX_LENGTH
# in other words, all phrase-tables will be truncated at least to 10 words per
# phrase.
my $MAX_LENGTH = 10;
# utilities
my $ZCAT = "gzip -cd";
# get optional parameters
my $opt_hierarchical = 0;
my $binarizer = undef;
my $opt_min_non_initial_rule_count = undef;
my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
GetOptions(
"gzip!" => \$opt_gzip,
"Hierarchical" => \$opt_hierarchical,
"Binarizer=s" => \$binarizer,
"MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count
) or exit(1);
# get command line parameters
my $dir = shift;
my $config = shift;
my $input = shift;
if (!defined $dir || !defined $config || !defined $input) {
print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical]\n";
exit 1;
}
$dir = ensure_full_path($dir);
# buggy directory in place?
if (-d $dir && ! -e "$dir/info") {
print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n";
exit(1);
}
# already filtered? check if it can be re-used
if (-d $dir) {
my @INFO = `cat $dir/info`;
chop(@INFO);
if($INFO[0] ne $config
|| ($INFO[1] ne $input &&
$INFO[1].".tagged" ne $input)) {
print STDERR "WARNING: directory exists but does not match parameters:\n";
print STDERR " ($INFO[0] ne $config || $INFO[1] ne $input)\n";
exit 1;
}
print STDERR "The filtered model was ready in $dir, not doing anything.\n";
exit 0;
}
# filter the translation and distortion tables
safesystem("mkdir -p $dir") or die "Can't mkdir $dir";
# get tables to be filtered (and modify config file)
my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%KNOWN_TTABLE,@TABLE_WEIGHTS,%TABLE_NUMBER);
my %new_name_used = ();
open(INI_OUT,">$dir/moses.ini") or die "Can't write $dir/moses.ini";
open(INI,$config) or die "Can't read $config";
while(<INI>) {
print INI_OUT $_;
if (/ttable-file\]/) {
while(1) {
my $table_spec = <INI>;
if ($table_spec !~ /^(\d+) ([\d\,\-]+) ([\d\,\-]+) (\d+) (\S+)( \S+)?$/) {
print INI_OUT $table_spec;
last;
}
my ($phrase_table_impl,$source_factor,$t,$w,$file,$table_flag) = ($1,$2,$3,$4,$5,$6);
$table_flag = "" if (!defined($table_flag));
if (($phrase_table_impl ne "0" && $phrase_table_impl ne "6") || $file =~ /glue-grammar/) {
# Only Memory ("0") and NewFormat ("6") can be filtered.
print INI_OUT $table_spec;
next;
}
chomp($file);
push @TABLE, $file;
push @TABLE_WEIGHTS,$w;
$KNOWN_TTABLE{$#TABLE}++;
my $new_name = "$dir/phrase-table.$source_factor-$t.".(++$TABLE_NUMBER{"$source_factor-$t"});
my $cnt = 1;
$cnt ++ while (defined $new_name_used{"$new_name.$cnt"});
$new_name .= ".$cnt";
$new_name_used{$new_name} = 1;
if ($binarizer && $phrase_table_impl == 6) {
print INI_OUT "2 $source_factor $t $w $new_name.bin$table_flag\n";
}
elsif ($binarizer && $phrase_table_impl == 0) {
if ($binarizer =~ /processPhraseTableMin/) {
print INI_OUT "12 $source_factor $t $w $new_name$table_flag\n";
} else {
print INI_OUT "1 $source_factor $t $w $new_name$table_flag\n";
}
} else {
$new_name .= ".gz" if $opt_gzip;
print INI_OUT "$phrase_table_impl $source_factor $t $w $new_name$table_flag\n";
}
push @TABLE_NEW_NAME,$new_name;
$CONSIDER_FACTORS{$source_factor} = 1;
print STDERR "Considering factor $source_factor\n";
push @TABLE_FACTORS, $source_factor;
}
}
elsif (/distortion-file/) {
while(1) {
my $table_spec = <INI>;
if ($table_spec !~ /^([\d\,\-]+) (\S+) (\d+) (\S+)$/) {
print INI_OUT $table_spec;
last;
}
my ($factors,$t,$w,$file) = ($1,$2,$3,$4);
my $source_factor = $factors;
$source_factor =~ s/\-[\d,]+$//;
chomp($file);
push @TABLE,$file;
$file =~ s/^.*\/+([^\/]+)/$1/g;
my $new_name = "$dir/$file";
$new_name =~ s/\.gz//;
print INI_OUT "$factors $t $w $new_name\n";
push @TABLE_NEW_NAME,$new_name;
$CONSIDER_FACTORS{$source_factor} = 1;
print STDERR "Considering factor $source_factor\n";
push @TABLE_FACTORS,$source_factor;
}
}
}
close(INI);
close(INI_OUT);
my %TMP_INPUT_FILENAME;
if ($opt_hierarchical)
{
# Write a separate, temporary input file for each combination of source
# factors
foreach my $key (keys %CONSIDER_FACTORS) {
my $filename = "$dir/input-$key";
open(FILEHANDLE,">$filename") or die "Can't open $filename for writing";
$TMP_INPUT_FILENAME{$key} = $filename;
my @FACTOR = split(/,/, $key);
open(PIPE,"$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |");
while (my $line = <PIPE>) {
print FILEHANDLE $line
}
close(FILEHANDLE);
}
}
my %PHRASE_USED;
if (!$opt_hierarchical) {
# get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
open(INPUT,mk_open_string($input)) or die "Can't read $input";
while(my $line = <INPUT>) {
chomp($line);
my @WORD = split(/ +/,$line);
for(my $i=0;$i<=$#WORD;$i++) {
for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
foreach (keys %CONSIDER_FACTORS) {
my @FACTOR = split(/,/);
my $phrase = "";
for(my $k=$i;$k<=$i+$j;$k++) {
my @WORD_FACTOR = split(/\|/,$WORD[$k]);
for(my $f=0;$f<=$#FACTOR;$f++) {
$phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
}
chop($phrase);
$phrase .= " ";
}
chop($phrase);
$PHRASE_USED{$_}{$phrase}++;
}
}
}
}
close(INPUT);
}
sub mk_open_string {
my $file = shift;
my $openstring;
if ($file !~ /\.gz$/ && -e "$file.gz") {
$openstring = "$ZCAT $file.gz |";
} elsif ($file =~ /\.gz$/) {
$openstring = "$ZCAT $file |";
} elsif ($opt_hierarchical) {
$openstring = "cat $file |";
} else {
$openstring = "< $file";
}
return $openstring;
}
# filter files
for(my $i=0;$i<=$#TABLE;$i++) {
my ($used,$total) = (0,0);
my $file = $TABLE[$i];
my $factors = $TABLE_FACTORS[$i];
my $new_file = $TABLE_NEW_NAME[$i];
print STDERR "filtering $file -> $new_file...\n";
my $openstring = mk_open_string($file);
my $new_openstring;
if ($new_file =~ /\.gz$/) {
$new_openstring = "| gzip -c > $new_file";
} else {
$new_openstring = ">$new_file";
}
open(FILE_OUT,$new_openstring) or die "Can't write to $new_openstring";
if ($opt_hierarchical) {
my $tmp_input = $TMP_INPUT_FILENAME{$factors};
my $options = "";
$options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |");
while (my $line = <PIPE>) {
print FILE_OUT $line
}
close(FILEHANDLE);
} else {
open(FILE,$openstring) or die "Can't open '$openstring'";
while(my $entry = <FILE>) {
my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
$foreign =~ s/ $//;
if (defined($PHRASE_USED{$factors}{$foreign})) {
print FILE_OUT $entry;
$used++;
}
$total++;
}
close(FILE);
die "No phrases found in $file!" if $total == 0;
printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
}
if(defined($binarizer)) {
print STDERR "binarizing...";
# translation model
if ($KNOWN_TTABLE{$i}) {
# ... hierarchical translation model
if ($opt_hierarchical) {
my $cmd = "$binarizer $new_file $new_file.bin";
print STDERR $cmd."\n";
print STDERR `$cmd`;
}
# ... phrase translation model
elsif ($binarizer =~ /processPhraseTableMin/) {
#compact phrase table
my $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
print STDERR $cmd."\n";
print STDERR `$cmd`;
} else {
my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
print STDERR $cmd."\n";
print STDERR `$cmd`;
}
}
# reordering model
else {
my $lexbin = $binarizer;
$lexbin =~ s/PhraseTable/LexicalTable/;
my $cmd;
if ($lexbin =~ /processLexicalTableMin/) {
$cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
} else {
$lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
$cmd = "$lexbin -in $new_file -out $new_file";
}
print STDERR $cmd."\n";
print STDERR `$cmd`;
}
}
close(FILE_OUT);
}
if ($opt_hierarchical)
{
# Remove the temporary input files
unlink values %TMP_INPUT_FILENAME;
}
open(INFO,">$dir/info");
print INFO "$config\n$input\n";
close(INFO);
print "To run the decoder, please call:
moses -f $dir/moses.ini -i $input\n";
sub safesystem {
print STDERR "Executing: @_\n";
system(@_);
if ($? == -1) {
print STDERR "Failed to execute: @_\n $!\n";
exit(1);
}
elsif ($? & 127) {
printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without';
exit(1);
}
else {
my $exitcode = $? >> 8;
print STDERR "Exit code: $exitcode\n" if $exitcode;
return ! $exitcode;
}
}
sub ensure_full_path {
my $PATH = shift;
return $PATH if $PATH =~ /^\//;
my $dir = `pawd 2>/dev/null`;
if (!$dir) {$dir = `pwd`;}
chomp $dir;
$PATH = $dir."/".$PATH;
$PATH =~ s/[\r\n]//g;
$PATH =~ s/\/\.\//\//g;
$PATH =~ s/\/+/\//g;
my $sanity = 0;
while($PATH =~ /\/\.\.\// && $sanity++<10) {
$PATH =~ s/\/+/\//g;
$PATH =~ s/\/[^\/]+\/\.\.\//\//g;
}
$PATH =~ s/\/[^\/]+\/\.\.$//;
$PATH =~ s/\/+$//;
return $PATH;
}