#!/usr/bin/perl -w # $Id$ # Given a moses.ini file and an input text prepare minimized translation # tables and a new moses.ini, so that loading of tables is much faster. # original code by Philipp Koehn # changes by Ondrej Bojar # adapted for hierarchical models by Phil Williams use strict; use FindBin qw($Bin); use Getopt::Long; my $SCRIPTS_ROOTDIR; if (defined($ENV{"SCRIPTS_ROOTDIR"})) { $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"}; } else { $SCRIPTS_ROOTDIR = $Bin; if ($SCRIPTS_ROOTDIR eq '') { $SCRIPTS_ROOTDIR = dirname(__FILE__); } $SCRIPTS_ROOTDIR =~ s/\/training$//; $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR; } # consider phrases in input up to $MAX_LENGTH # in other words, all phrase-tables will be truncated at least to 10 words per # phrase. my $MAX_LENGTH = 10; # utilities my $ZCAT = "gzip -cd"; # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; my $opt_min_non_initial_rule_count = undef; GetOptions( "Hierarchical" => \$opt_hierarchical, "Binarizer=s" => \$binarizer, "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count ) or exit(1); # get command line parameters my $dir = shift; my $config = shift; my $input = shift; if (!defined $dir || !defined $config || !defined $input) { print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical]\n"; exit 1; } $dir = ensure_full_path($dir); # buggy directory in place? if (-d $dir && ! -e "$dir/info") { print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n"; exit(1); } # already filtered? check if it can be re-used if (-d $dir) { my @INFO = `cat $dir/info`; chop(@INFO); if($INFO[0] ne $config || ($INFO[1] ne $input && $INFO[1].".tagged" ne $input)) { print STDERR "WARNING: directory exists but does not match parameters:\n"; print STDERR " ($INFO[0] ne $config || $INFO[1] ne $input)\n"; exit 1; } print STDERR "The filtered model was ready in $dir, not doing anything.\n"; exit 0; } # filter the translation and distortion tables safesystem("mkdir -p $dir") or die "Can't mkdir $dir"; # get tables to be filtered (and modify config file) my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%KNOWN_TTABLE,@TABLE_WEIGHTS,%TABLE_NUMBER); my %new_name_used = (); open(INI_OUT,">$dir/moses.ini") or die "Can't write $dir/moses.ini"; open(INI,$config) or die "Can't read $config"; while() { print INI_OUT $_; if (/ttable-file\]/) { while(1) { my $table_spec = ; if ($table_spec !~ /^(\d+) ([\d\,\-]+) ([\d\,\-]+) (\d+) (\S+)$/) { print INI_OUT $table_spec; last; } my ($phrase_table_impl,$source_factor,$t,$w,$file) = ($1,$2,$3,$4,$5); if (($phrase_table_impl ne "0" && $phrase_table_impl ne "6") || $file =~ /glue-grammar/) { # Only Memory ("0") and NewFormat ("6") can be filtered. print INI_OUT $table_spec; next; } chomp($file); push @TABLE, $file; push @TABLE_WEIGHTS,$w; $KNOWN_TTABLE{$#TABLE}++; my $new_name = "$dir/phrase-table.$source_factor-$t.".(++$TABLE_NUMBER{"$source_factor-$t"}); my $cnt = 1; $cnt ++ while (defined $new_name_used{"$new_name.$cnt"}); $new_name .= ".$cnt"; $new_name_used{$new_name} = 1; if ($binarizer && $phrase_table_impl == 6) { print INI_OUT "2 $source_factor $t $w $new_name.bin\n"; } elsif ($binarizer && $phrase_table_impl == 0) { print INI_OUT "1 $source_factor $t $w $new_name\n"; } else { print INI_OUT "$phrase_table_impl $source_factor $t $w $new_name\n"; } push @TABLE_NEW_NAME,$new_name; $CONSIDER_FACTORS{$source_factor} = 1; print STDERR "Considering factor $source_factor\n"; push @TABLE_FACTORS, $source_factor; } } elsif (/distortion-file/) { while(1) { my $table_spec = ; if ($table_spec !~ /^([\d\,\-]+) (\S+) (\d+) (\S+)$/) { print INI_OUT $table_spec; last; } my ($factors,$t,$w,$file) = ($1,$2,$3,$4); my $source_factor = $factors; $source_factor =~ s/\-[\d,]+$//; chomp($file); push @TABLE,$file; $file =~ s/^.*\/+([^\/]+)/$1/g; my $new_name = "$dir/$file"; $new_name =~ s/\.gz//; print INI_OUT "$factors $t $w $new_name\n"; push @TABLE_NEW_NAME,$new_name; $CONSIDER_FACTORS{$source_factor} = 1; print STDERR "Considering factor $source_factor\n"; push @TABLE_FACTORS,$source_factor; } } } close(INI); close(INI_OUT); my %TMP_INPUT_FILENAME; if ($opt_hierarchical) { # Write a separate, temporary input file for each combination of source # factors foreach my $key (keys %CONSIDER_FACTORS) { my $filename = "$dir/input-$key"; open(FILEHANDLE,">$filename") or die "Can't open $filename for writing"; $TMP_INPUT_FILENAME{$key} = $filename; my @FACTOR = split(/,/, $key); open(PIPE,"$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |"); while (my $line = ) { print FILEHANDLE $line } close(FILEHANDLE); } } my %PHRASE_USED; if (!$opt_hierarchical) { # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH open(INPUT,$input) or die "Can't read $input"; while(my $line = ) { chomp($line); my @WORD = split(/ +/,$line); for(my $i=0;$i<=$#WORD;$i++) { for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) { foreach (keys %CONSIDER_FACTORS) { my @FACTOR = split(/,/); my $phrase = ""; for(my $k=$i;$k<=$i+$j;$k++) { my @WORD_FACTOR = split(/\|/,$WORD[$k]); for(my $f=0;$f<=$#FACTOR;$f++) { $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|"; } chop($phrase); $phrase .= " "; } chop($phrase); $PHRASE_USED{$_}{$phrase}++; } } } } close(INPUT); } # filter files for(my $i=0;$i<=$#TABLE;$i++) { my ($used,$total) = (0,0); my $file = $TABLE[$i]; my $factors = $TABLE_FACTORS[$i]; my $new_file = $TABLE_NEW_NAME[$i]; print STDERR "filtering $file -> $new_file...\n"; my $openstring; if ($file !~ /\.gz$/ && -e "$file.gz") { $openstring = "$ZCAT $file.gz |"; } elsif ($file =~ /\.gz$/) { $openstring = "$ZCAT $file |"; } elsif ($opt_hierarchical) { $openstring = "cat $file |"; } else { $openstring = "< $file"; } open(FILE_OUT,">$new_file") or die "Can't write $new_file"; if ($opt_hierarchical) { my $tmp_input = $TMP_INPUT_FILENAME{$factors}; my $options = ""; $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count); open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |"); while (my $line = ) { print FILE_OUT $line } close(FILEHANDLE); } else { open(FILE,$openstring) or die "Can't open '$openstring'"; while(my $entry = ) { my ($foreign,$rest) = split(/ \|\|\| /,$entry,2); $foreign =~ s/ $//; if (defined($PHRASE_USED{$factors}{$foreign})) { print FILE_OUT $entry; $used++; } $total++; } close(FILE); die "No phrases found in $file!" if $total == 0; printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%'; } if(defined($binarizer)) { print STDERR "binarizing..."; # translation model if ($KNOWN_TTABLE{$i}) { # ... hierarchical translation model if ($opt_hierarchical) { my $cmd = "$binarizer $new_file $new_file.bin"; print STDERR $cmd."\n"; print STDERR `$cmd`; } # ... phrase translation model else { my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file"; print STDERR $cmd."\n"; print STDERR `$cmd`; } } # reordering model else { my $lexbin = $binarizer; $lexbin =~ s/PhraseTable/LexicalTable/; $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options my $cmd = "$lexbin -in $new_file -out $new_file"; print STDERR $cmd."\n"; print STDERR `$cmd`; } } close(FILE_OUT); } if ($opt_hierarchical) { # Remove the temporary input files unlink values %TMP_INPUT_FILENAME; } open(INFO,">$dir/info"); print INFO "$config\n$input\n"; close(INFO); print "To run the decoder, please call: moses -f $dir/moses.ini < $input\n"; sub safesystem { print STDERR "Executing: @_\n"; system(@_); if ($? == -1) { print STDERR "Failed to execute: @_\n $!\n"; exit(1); } elsif ($? & 127) { printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", ($? & 127), ($? & 128) ? 'with' : 'without'; exit(1); } else { my $exitcode = $? >> 8; print STDERR "Exit code: $exitcode\n" if $exitcode; return ! $exitcode; } } sub ensure_full_path { my $PATH = shift; return $PATH if $PATH =~ /^\//; my $dir = `pawd 2>/dev/null`; if (!$dir) {$dir = `pwd`;} chomp $dir; $PATH = $dir."/".$PATH; $PATH =~ s/[\r\n]//g; $PATH =~ s/\/\.\//\//g; $PATH =~ s/\/+/\//g; my $sanity = 0; while($PATH =~ /\/\.\.\// && $sanity++<10) { $PATH =~ s/\/+/\//g; $PATH =~ s/\/[^\/]+\/\.\.\//\//g; } $PATH =~ s/\/[^\/]+\/\.\.$//; $PATH =~ s/\/+$//; return $PATH; }