#!/usr/bin/perl -w # $Id$ # Given a moses.ini file and an input text prepare minimized translation # tables and a new moses.ini, so that loading of tables is much faster. # original code by Philipp Koehn # changes by Ondrej Bojar # adapted for hierarchical models by Phil Williams use strict; use FindBin qw($RealBin); use Getopt::Long; my $SCRIPTS_ROOTDIR; if (defined($ENV{"SCRIPTS_ROOTDIR"})) { $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"}; } else { $SCRIPTS_ROOTDIR = $RealBin; if ($SCRIPTS_ROOTDIR eq '') { $SCRIPTS_ROOTDIR = dirname(__FILE__); } $SCRIPTS_ROOTDIR =~ s/\/training$//; $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR; } # consider phrases in input up to $MAX_LENGTH # in other words, all phrase-tables will be truncated at least to 10 words per # phrase. my $MAX_LENGTH = 10; # utilities my $ZCAT = "gzip -cd"; # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; my $min_score = undef; my $opt_min_non_initial_rule_count = undef; my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats) my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice GetOptions( "gzip!" => \$opt_gzip, "filter!" => \$opt_filter, "Hierarchical" => \$opt_hierarchical, "Binarizer=s" => \$binarizer, "MinScore=s" => \$min_score, "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count ) or exit(1); # get command line parameters my $dir = shift; my $config = shift; my $input = shift; if (!defined $dir || !defined $config || !defined $input) { print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*]\n"; exit 1; } $dir = ensure_full_path($dir); # decode min-score definitions my %MIN_SCORE; if ($min_score) { foreach (split(/ *, */,$min_score)) { my ($id,$score) = split(/ *: */); $MIN_SCORE{$id} = $score; print STDERR "score $id must be at least $score\n"; } } # buggy directory in place? if (-d $dir && ! -e "$dir/info") { print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n"; exit(1); } # already filtered? check if it can be re-used if (-d $dir) { my @INFO = `cat $dir/info`; chop(@INFO); if($INFO[0] ne $config || ($INFO[1] ne $input && $INFO[1].".tagged" ne $input)) { print STDERR "WARNING: directory exists but does not match parameters:\n"; print STDERR " ($INFO[0] ne $config || $INFO[1] ne $input)\n"; exit 1; } print STDERR "The filtered model was ready in $dir, not doing anything.\n"; exit 0; } # filter the translation and distortion tables safesystem("mkdir -p $dir") or die "Can't mkdir $dir"; my $inputStrippedXML = "$dir/input.$$"; my $cmd = "$RealBin/../generic/strip-xml.perl < $input > $inputStrippedXML"; print STDERR "Stripping XML: $cmd \n"; print `$cmd`; $input = $inputStrippedXML; # get tables to be filtered (and modify config file) my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%KNOWN_TTABLE,@TABLE_WEIGHTS,%TABLE_NUMBER); my %new_name_used = (); open(INI_OUT,">$dir/moses.ini") or die "Can't write $dir/moses.ini"; open(INI,$config) or die "Can't read $config"; while(my $line = ) { chomp($line); my @toks = split(/ /, $line); if ($line =~ /PhraseDictionaryMemory / || $line =~ /PhraseDictionaryBinary / || $line =~ /PhraseDictionaryOnDisk / || $line =~ /PhraseDictionarySCFG / ) { print STDERR "pt:$line\n"; my ($phrase_table_impl,$source_factor,$t,$w,$file,$table_flag,$skip); # = ($1,$2,$3,$4,$5,$6,$7); $table_flag = ""; $phrase_table_impl = $toks[0]; $skip = 0; for (my $i = 1; $i < scalar(@toks); ++$i) { my @args = split(/=/, $toks[$i]); chomp($args[0]); chomp($args[1]); if ($args[0] eq "num-features") { $w = $args[1]; } elsif ($args[0] eq "input-factor") { $source_factor = $args[1]; } elsif ($args[0] eq "output-factor") { $t = $args[1]; } elsif ($args[0] eq "path") { $file = $args[1]; } elsif ($args[0] eq "filterable" && $args[1] eq "false") { $skip = 1; } } #for (my $i = 1; $i < scalar(@toks); ++$i) { if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG") || $file =~ /glue-grammar/ || $skip) { # Only Memory ("0") and NewFormat ("6") can be filtered. print INI_OUT "$line\n"; next; } push @TABLE, $file; push @TABLE_WEIGHTS,$w; $KNOWN_TTABLE{$#TABLE}++; my $new_name = "$dir/phrase-table.$source_factor-$t.".(++$TABLE_NUMBER{"$source_factor-$t"}); my $cnt = 1; $cnt ++ while (defined $new_name_used{"$new_name.$cnt"}); $new_name .= ".$cnt"; $new_name_used{$new_name} = 1; if ($binarizer && $phrase_table_impl eq "PhraseDictionarySCFG") { $phrase_table_impl = "PhraseDictionaryOnDisk"; @toks = set_value(\@toks, "path", "$new_name.bin$table_flag"); } elsif ($binarizer && $phrase_table_impl eq "PhraseDictionaryMemory") { if ($binarizer =~ /processPhraseTableMin/) { $phrase_table_impl = "PhraseDictionaryCompact"; @toks = set_value(\@toks, "path", "$new_name$table_flag"); } elsif ($binarizer =~ /CreateOnDiskPt/) { $phrase_table_impl = "PhraseDictionaryOnDisk"; @toks = set_value(\@toks, "path", "$new_name.bin$table_flag"); } else { $phrase_table_impl = "PhraseDictionaryBinary"; @toks = set_value(\@toks, "path", "$new_name$table_flag"); } } else { $new_name .= ".gz" if $opt_gzip; @toks = set_value(\@toks, "path", "$new_name$table_flag"); } $toks[0] = $phrase_table_impl; print INI_OUT join_array(\@toks)."\n"; push @TABLE_NEW_NAME,$new_name; $CONSIDER_FACTORS{$source_factor} = 1; print STDERR "Considering factor $source_factor\n"; push @TABLE_FACTORS, $source_factor; } #if (/PhraseModel /) { elsif ($line =~ /LexicalReordering /) { print STDERR "ro:$line\n"; my ($source_factor, $t, $w, $file); # = ($1,$2,$3,$4); for (my $i = 1; $i < scalar(@toks); ++$i) { my @args = split(/=/, $toks[$i]); chomp($args[0]); chomp($args[1]); if ($args[0] eq "num-features") { $w = $args[1]; } elsif ($args[0] eq "input-factor") { $source_factor = chomp($args[1]); } elsif ($args[0] eq "output-factor") { #$t = chomp($args[1]); } elsif ($args[0] eq "type") { $t = $args[1]; } elsif ($args[0] eq "path") { $file = $args[1]; } } # for (my $i = 1; $i < scalar(@toks); ++$i) { push @TABLE, $file; push @TABLE_WEIGHTS,$w; $file =~ s/^.*\/+([^\/]+)/$1/g; my $new_name = "$dir/$file"; $new_name =~ s/\.gz//; #print INI_OUT "$source_factor $t $w $new_name\n"; @toks = set_value(\@toks, "path", "$new_name"); print INI_OUT join_array(\@toks)."\n"; push @TABLE_NEW_NAME,$new_name; $CONSIDER_FACTORS{$source_factor} = 1; print STDERR "Considering factor $source_factor\n"; push @TABLE_FACTORS,$source_factor; } #elsif (/LexicalReordering /) { else { print INI_OUT "$line\n"; } } # while() { close(INI); close(INI_OUT); my %TMP_INPUT_FILENAME; if ($opt_hierarchical) { # Write a separate, temporary input file for each combination of source # factors foreach my $key (keys %CONSIDER_FACTORS) { my $filename = "$dir/input-$key"; open(FILEHANDLE,">$filename") or die "Can't open $filename for writing"; $TMP_INPUT_FILENAME{$key} = $filename; my @FACTOR = split(/,/, $key); my $cmd = "$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |"; print STDERR "Executing: $cmd\n"; open(PIPE,$cmd); while (my $line = ) { print FILEHANDLE $line } close(FILEHANDLE); } # foreach my $key (keys %CONSIDER_FACTORS) { } #if ($opt_hierarchical) { my %PHRASE_USED; if ($opt_filter) { if (!$opt_hierarchical) { # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH open(INPUT,mk_open_string($input)) or die "Can't read $input"; while(my $line = ) { chomp($line); my @WORD = split(/ +/,$line); for(my $i=0;$i<=$#WORD;$i++) { for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) { foreach (keys %CONSIDER_FACTORS) { my @FACTOR = split(/,/); my $phrase = ""; for(my $k=$i;$k<=$i+$j;$k++) { my @WORD_FACTOR = split(/\|/,$WORD[$k]); for(my $f=0;$f<=$#FACTOR;$f++) { $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|"; } chop($phrase); $phrase .= " "; } chop($phrase); $PHRASE_USED{$_}{$phrase}++; } } } } close(INPUT); } } # filter files print STDERR "Filtering files...\n"; for(my $i=0;$i<=$#TABLE;$i++) { my ($used,$total) = (0,0); my $file = $TABLE[$i]; my $factors = $TABLE_FACTORS[$i]; my $new_file = $TABLE_NEW_NAME[$i]; print STDERR "filtering $file -> $new_file...\n"; my $mid_file = $new_file; # used when both filtering and binarizing if (!$opt_filter) { # check if original file was gzipped if ($file !~ /\.gz$/ && -e "$file.gz") { $file .= ".gz"; } $mid_file .= ".gz" if $file =~ /\.gz$/; $cmd = "ln -s $file $mid_file"; print STDERR "Executing: $cmd\n"; safesystem($cmd); } else { $mid_file .= ".gz" if $mid_file !~ /\.gz/ && $binarizer && $binarizer =~ /processPhraseTable/; my $openstring = mk_open_string($file); my $mid_openstring; if ($mid_file =~ /\.gz$/) { $mid_openstring = "| gzip -c > $mid_file"; } else { $mid_openstring = ">$mid_file"; } open(FILE_OUT,$mid_openstring) or die "Can't write to $mid_openstring"; if ($opt_hierarchical) { my $tmp_input = $TMP_INPUT_FILENAME{$factors}; my $options = ""; $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count); $cmd = "$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |"; print STDERR "Executing: $cmd\n"; open(PIPE,$cmd); while (my $line = ) { print FILE_OUT $line } close(FILEHANDLE); } else { open(FILE,$openstring) or die "Can't open '$openstring'"; while(my $entry = ) { my ($foreign,$rest) = split(/ \|\|\| /,$entry,2); $foreign =~ s/ $//; if (defined($PHRASE_USED{$factors}{$foreign})) { # handle min_score thresholds if ($min_score) { my @ITEM = split(/ *\|\|\| */,$rest); if(scalar (@ITEM)>2) { # do not filter reordering table my @SCORE = split(/ /,$ITEM[1]); my $okay = 1; foreach my $id (keys %MIN_SCORE) { $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id}; } next unless $okay; } } print FILE_OUT $entry; $used++; } $total++; } close(FILE); die "No phrases found in $file!" if $total == 0; printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%'; } close(FILE_OUT); } my $catcmd = ($mid_file =~ /\.gz$/ ? "$ZCAT" : "cat"); if(defined($binarizer)) { print STDERR "binarizing..."; # translation model if ($KNOWN_TTABLE{$i}) { # ... hierarchical translation model if ($opt_hierarchical) { my $cmd = "$binarizer $mid_file $new_file.bin"; print STDERR $cmd."\n"; print STDERR `$cmd`; } # ... phrase translation model elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $dir > $mid_file.sorted; $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $mid_file.sorted"; print STDERR $cmd."\n"; print STDERR `$cmd`; } elsif ($binarizer =~ /CreateOnDiskPt/) { my $cmd = "$binarizer $mid_file $new_file.bin"; print STDERR $cmd."\n"; print STDERR `$cmd`; } else { my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file"; print STDERR $cmd."\n"; print STDERR `$cmd`; } } # reordering model else { my $lexbin; $lexbin = $binarizer; if ($binarizer =~ /CreateOnDiskPt/) { $lexbin =~ s/CreateOnDiskPt/processLexicalTable/; } $lexbin =~ s/PhraseTable/LexicalTable/; my $cmd; if ($lexbin =~ /processLexicalTableMin/) { $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $dir > $mid_file.sorted; $lexbin -in $mid_file.sorted -out $new_file; rm $mid_file.sorted"; } else { $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options $cmd = "$lexbin -in $mid_file -out $new_file"; } print STDERR $cmd."\n"; print STDERR `$cmd`; } } } if ($opt_hierarchical) { # Remove the temporary input files unlink values %TMP_INPUT_FILENAME; } open(INFO,">$dir/info"); print INFO "$config\n$input\n"; close(INFO); print "To run the decoder, please call: moses -f $dir/moses.ini -i $input\n"; # functions sub mk_open_string { my $file = shift; my $openstring; if ($file !~ /\.gz$/ && -e "$file.gz") { $openstring = "$ZCAT $file.gz |"; } elsif ($file =~ /\.gz$/) { $openstring = "$ZCAT $file |"; } elsif ($opt_hierarchical) { $openstring = "cat $file |"; } else { $openstring = "< $file"; } return $openstring; } sub safesystem { print STDERR "Executing: @_\n"; system(@_); if ($? == -1) { print STDERR "Failed to execute: @_\n $!\n"; exit(1); } elsif ($? & 127) { printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", ($? & 127), ($? & 128) ? 'with' : 'without'; exit(1); } else { my $exitcode = $? >> 8; print STDERR "Exit code: $exitcode\n" if $exitcode; return ! $exitcode; } } sub ensure_full_path { my $PATH = shift; return $PATH if $PATH =~ /^\//; my $dir = `pawd 2>/dev/null`; if (!$dir) {$dir = `pwd`;} chomp $dir; $PATH = $dir."/".$PATH; $PATH =~ s/[\r\n]//g; $PATH =~ s/\/\.\//\//g; $PATH =~ s/\/+/\//g; my $sanity = 0; while($PATH =~ /\/\.\.\// && $sanity++<10) { $PATH =~ s/\/+/\//g; $PATH =~ s/\/[^\/]+\/\.\.\//\//g; } $PATH =~ s/\/[^\/]+\/\.\.$//; $PATH =~ s/\/+$//; return $PATH; } sub join_array { my @outside = @{$_[0]}; my $ret = ""; for (my $i = 0; $i < scalar(@outside); ++$i) { my $tok = $outside[$i]; $ret .= "$tok "; } return $ret; } sub set_value { my @arr = @{$_[0]}; my $keySought = $_[1]; my $newValue = $_[2]; for (my $i = 1; $i < scalar(@arr); ++$i) { my @inside = split(/=/, $arr[$i]); my $key = $inside[0]; if ($key eq $keySought) { $arr[$i] = "$key=$newValue"; return @arr; } } return @arr; }