A large number of changes. Besides little tweaks:

* training script now has proper default behaviour for single-factor models
* mert script has better handling of default lambda parameters that now
  works with lexicalized reordering models, and also with multiple
  model files (e.g. multiple language models)
* parallel mert script is more robust when single jobs fail: detects it
  and resubmits the crashed (or killed) jobs
* recaser added that builds on moses
* filtering script added that also binarizes filtered model files
  (this will eventually be replaced when the lexicalized reordering
  model also uses the binary format)


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1210 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
phkoehn 2007-02-13 19:22:35 +00:00
parent e247f1da6f
commit 14839768c8
7 changed files with 583 additions and 157 deletions

View File

@ -19,9 +19,8 @@ use strict;
#######################
#Customizable parameters
#parameters for submiiting processes through SGE
#NOTE: group name is ws06ossmt (with 2 's') and not ws06osmt (with 1 's')
my $queueparameters="-l ws06ossmt=true -l mem_free=0.5G -hard";
#parameters for submitting processes through Sun GridEngine
my $queueparameters="-l mem_free=0.5G -hard";
# look for the correct pwdcmd
my $pwdcmd = getPwdCmd();
@ -53,6 +52,7 @@ my $nbestfile=undef;
my $orinbestfile=undef;
my $nbest=undef;
my $nbestflag=0;
my $robust=1; # undef; # resubmit crashed jobs
my $orilogfile="";
my $logflag="";
my $qsubname="MOSES";
@ -69,6 +69,7 @@ sub init(){
'debug'=>\$dbg,
'jobs=i'=>\$jobs,
'decoder=s'=> \$mosescmd,
'robust' => \$robust,
'decoder-parameters=s'=> \$mosesparameters,
'logfile=s'=> \$orilogfile,
'i|inputfile|input-file=s'=> \$orifile,
@ -284,8 +285,16 @@ preparing_script();
#launching process through the queue
my @sgepids =();
my $failure=0;
foreach my $idx (@idxlist){
# if robust switch is used, redo jobs that crashed
my @idx_todo = ();
foreach (@idxlist) { push @idx_todo,$_; }
my $looped_once = 0;
while((!$robust && !$looped_once) || ($robust && scalar @idx_todo)) {
$looped_once = 1;
my $failure=0;
foreach my $idx (@idx_todo){
print STDERR "qsub $queueparameters -b no -j yes -o $qsubout$idx -e $qsuberr$idx -N $qsubname$idx ${jobscript}${idx}.bash\n" if $dbg;
$cmd="qsub $queueparameters -b no -j yes -o $qsubout$idx -e $qsuberr$idx -N $qsubname$idx ${jobscript}${idx}.bash >& ${jobscript}${idx}.log";
@ -302,16 +311,17 @@ foreach my $idx (@idxlist){
close(IN);
push @sgepids, $id;
}
}
#waiting until all jobs have finished
my $hj = "-hold_jid " . join(" -hold_jid ", @sgepids);
#waiting until all jobs have finished
my $hj = "-hold_jid " . join(" -hold_jid ", @sgepids);
if ($old_sge) {
if ($old_sge) {
# we need to implement our own waiting script
safesystem("echo 'date' > sync_workaround_script.sh") or kill_all_and_quit();
my $pwd = `$pwdcmd`; chomp $pwd;
my $checkpointfile = "sync_workaround_checkpoint";
# delete previous checkpoint, if left from previous runs
@ -344,18 +354,36 @@ if ($old_sge) {
print STDERR "Extra wait ($nr) for possibly unfinished processes.\n";
sleep 10;
}
} else {
} else {
# use the -sync option for qsub
$cmd="qsub $queueparameters -sync y $hj -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls >& $qsubname.W.log";
safesystem($cmd) or kill_all_and_quit();
$failure=&check_exit_status();
}
kill_all_and_quit() if $failure && !$robust;
# check if some translations failed
my @idx_still_todo = check_translation();
if ($robust) {
# if robust, redo crashed jobs
if ((scalar @idx_still_todo) == (scalar @idxlist)) {
# ... but not if all crashed
print STDERR "everything crashed, not trying to resubmit jobs\n";
kill_all_and_quit();
}
@idx_todo = @idx_still_todo;
}
else {
if (scalar (@idx_still_todo)) {
print STDERR "some jobs crashed: ".join(" ",@idx_still_todo)."\n";
kill_all_and_quit();
}
}
}
kill_all_and_quit() if $failure;
check_translation();
#concatenating translations and removing temporary files
concatenate_1best();
concatenate_logs() if $logflag;
@ -509,7 +537,8 @@ sub check_translation(){
#checking if all sentences were translated
my $inputN;
my $outputN;
foreach my $idx (@idxlist){
my @failed = ();
foreach my $idx (@idx_todo){
if ($inputtype==0){#text input
chomp($inputN=`wc -l ${testfile}.$splitpfx$idx | cut -d' ' -f1`);
}
@ -522,10 +551,10 @@ sub check_translation(){
print STDERR "Split ($idx) were not entirely translated\n";
print STDERR "outputN=$outputN inputN=$inputN\n";
print STDERR "outputfile=${testfile}.$splitpfx$idx.trans inputfile=${testfile}.$splitpfx$idx\n";
return 0;
push @failed,$idx;
}
}
return 1;
return @failed;
}
sub remove_temporary_files(){

78
scripts/recaser/recase.perl Executable file
View File

@ -0,0 +1,78 @@
#!/usr/bin/perl -w
use strict;
use Getopt::Long "GetOptions";
# Command-line options:
#   --in FILE        input text handed to the decoder with -i (required)
#   --model INI      decoder configuration handed over with -f (required)
#   --headline FILE  file with <hl> markup and "<seg" lines, used to decide
#                    which sentences are headlines (optional)
#   --moses CMD      decoder command (default "moses")
my ($SRC,$INFILE,$RECASE_MODEL);
my $MOSES = "moses";
die("recase.perl --in file --model ini-file > out")
    unless &GetOptions('in=s' => \$INFILE,
                       'headline=s' => \$SRC,
                       'moses=s' => \$MOSES,
                       'model=s' => \$RECASE_MODEL)
        && defined($INFILE)
        && defined($RECASE_MODEL);

# lowercase even in headline
# The entries are regular-expression fragments ("al-.+" and "el-.+" contain
# wildcards), so they are compiled into one anchored pattern. An exact hash
# lookup would never match the two wildcard entries.
my @ALWAYS_LOWER = ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with");
my $ALWAYS_LOWER_RE = join("|",@ALWAYS_LOWER);
$ALWAYS_LOWER_RE = qr/^(?:$ALWAYS_LOWER_RE)$/;

# find out about the headlines
# @HEADLINE gets one flag per "<seg" line: 1 if the line is inside <hl> markup.
my @HEADLINE;
if (defined($SRC)) {
    open(my $src_fh, '<', $SRC) or die("Can't read headline file $SRC: $!");
    my $headline_flag = 0;
    while(<$src_fh>) {
        $headline_flag = 1 if /<hl>/;
        $headline_flag = 0 if /<.hl>/;
        next unless /^<seg/;
        push @HEADLINE, $headline_flag;
    }
    close($src_fh);
}

my $sentence = 0;
my $infile = $INFILE;
$infile =~ s/[\.\/]/_/g;

# Run the decoder and post-process its output line by line.
open(my $model_fh, '-|', "$MOSES -f $RECASE_MODEL -i $INFILE -dl 1")
    or die("Can't run $MOSES: $!");
while(<$model_fh>) {
    chomp;
    s/\s+$//;
    my @WORD = split(/ /);

    # uppercase initial word (skipped for blank lines, so that no undefined
    # element is created in @WORD)
    &uppercase(\$WORD[0]) if @WORD;

    # uppercase after period
    foreach my $i (1 .. $#WORD) {
        &uppercase(\$WORD[$i]) if $WORD[$i-1] eq '.';
    }

    # uppercase headlines, except for the always-lowercase words
    if (defined($SRC) && $HEADLINE[$sentence]) {
        foreach my $word (@WORD) {
            &uppercase(\$word) unless $word =~ $ALWAYS_LOWER_RE;
        }
    }

    # output
    print join(" ",@WORD);
    print "\n";
    $sentence++;
}
# closing a pipe reports a failed or non-zero-exit command; make that visible
close($model_fh)
    or print STDERR "WARNING: decoder command failed or exited with non-zero status ($?)\n";

# NOTE(review): nothing visible in this script creates /tmp/filter.*;
# presumably a leftover from a filtering step - confirm before removing.
`rm -rf /tmp/filter.$infile`;
# Upper-case the first character of a word, in place.
#   $W - reference to the scalar holding the word
# Undefined or empty words are left untouched, which avoids
# uninitialized-value warnings under -w (e.g. for blank decoder output lines).
sub uppercase {
    my ($W) = @_;
    return unless defined($$W) && length($$W);
    # ASCII letters
    substr($$W,0,1) =~ tr/a-z/A-Z/;
    # accented letters in the range shown below
    # NOTE(review): this relies on the encoding of this source file and of the
    # input agreeing (presumably a single-byte Latin-1 encoding) - confirm.
    substr($$W,0,1) =~ tr/à-þ/À-Þ/;
}

View File

@ -0,0 +1,98 @@
#!/usr/bin/perl -w
use strict;
use Getopt::Long "GetOptions";
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
# apply switches
# NOTE(review): --config is parsed into $CONFIG but not used by any code
# visible in this script.
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG);
my $NGRAM_COUNT = "ngram-count";
my $TRAIN_SCRIPT = "train-factored-phrase-model.perl";
my $MAX_LEN = 1;
my $FIRST_STEP = 1;
my $LAST_STEP = 11;
die("train-recaser.perl --dir recaser --corpus cased")
    unless &GetOptions('first-step=i' => \$FIRST_STEP,
                       'last-step=i' => \$LAST_STEP,
                       'corpus=s' => \$CORPUS,
                       'config=s' => \$CONFIG,
                       'dir=s' => \$DIR,
                       'ngram-count=s' => \$NGRAM_COUNT,
                       'train-script=s' => \$TRAIN_SCRIPT,
                       'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
                       'max-len=i' => \$MAX_LEN);

# check and set default to unset parameters
die("please specify working dir --dir") unless defined($DIR);
# The corpus is read by train_lm (which runs whenever first-step <= 2) and by
# prepare_data (step 3), so it is required whenever either of them will run.
die("please specify --corpus") if !defined($CORPUS)
    && ($FIRST_STEP <= 2 || ($FIRST_STEP <= 3 && $LAST_STEP >= 3));

# main loop
`mkdir -p $DIR`;
# truecasing is not implemented yet, hence the hard-wired "0 &&"
&truecase() if 0 && $FIRST_STEP == 1;
&train_lm() if $FIRST_STEP <= 2;
&prepare_data() if $FIRST_STEP <= 3 && $LAST_STEP >= 3;
&train_recase_model() if $FIRST_STEP <= 10 && $LAST_STEP >= 3;
&cleanup() if $LAST_STEP == 11;
### subs ###
# Placeholder for the truecasing step: not implemented, does nothing and
# returns nothing.
sub truecase {
    # to do
    return;
}
# Step 2: run the n-gram counting tool ($NGRAM_COUNT) on the cased corpus and
# write the language model to $DIR/cased.srilm.gz. The command line and the
# command's standard output are echoed to STDERR.
sub train_lm {
    my $started_at = `date`;
    print STDERR "(2) Train language model on cased data @ ".$started_at;

    my $cmd = "$NGRAM_COUNT -text $CORPUS -lm $DIR/cased.srilm.gz -interpolate -kndiscount";
    print STDERR "$cmd\n";

    # list context on purpose: an unstartable command yields an empty list
    my @output = `$cmd`;
    print STDERR @output;
}
# Step 3: turn the cased corpus into a trivially aligned parallel corpus.
# Reads $CORPUS and writes three files into $DIR:
#   aligned.cased       the cleaned-up cased lines
#   aligned.lowercased  the same lines passed through lc()
#   aligned.a           a one-to-one word alignment ("0-0 1-1 ...") per line
# Dies if any of the files cannot be opened or written.
sub prepare_data {
    print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;
    open(my $corpus_fh, '<', $CORPUS)
        or die("Can't read corpus $CORPUS: $!");
    open(my $cased_fh, '>', "$DIR/aligned.cased")
        or die("Can't write $DIR/aligned.cased: $!");
    # NOTE(review): this goes to STDOUT and looks like leftover debugging
    # output; kept because callers may rely on it.
    print "$DIR/aligned.lowercased\n";
    open(my $lowercased_fh, '>', "$DIR/aligned.lowercased")
        or die("Can't write $DIR/aligned.lowercased: $!");
    open(my $alignment_fh, '>', "$DIR/aligned.a")
        or die("Can't write $DIR/aligned.a: $!");

    while(<$corpus_fh>) {
        # skip over-long lines
        next if length($_)>2000;
        # drop NUL bytes and "|" characters
        s/\x{0}//g;
        s/\|//g;
        # normalize spaces: squeeze runs, strip one leading and trailing space
        s/ +/ /g;
        s/^ //;
        s/ [\r\n]*$/\n/;
        next if /^$/;

        print {$cased_fh} $_;
        # NOTE(review): the handles carry no encoding layer, so lc() works on
        # raw bytes here - confirm this is intended for non-ASCII corpora.
        print {$lowercased_fh} lc($_);

        # every token is aligned to the token at the same position
        my @tokens = split;
        print {$alignment_fh} join("", map { "$_-$_ " } 0 .. $#tokens), "\n";
    }

    close($corpus_fh);
    # buffered write errors only surface at close time
    close($cased_fh)      or die("Can't finish writing $DIR/aligned.cased: $!");
    close($lowercased_fh) or die("Can't finish writing $DIR/aligned.lowercased: $!");
    close($alignment_fh)  or die("Can't finish writing $DIR/aligned.a: $!");
}
# Steps 4 and later: call the training script ($TRAIN_SCRIPT) on the aligned
# lowercased/cased corpus in $DIR, using the language model written to
# $DIR/cased.srilm.gz. The command line and the command's standard output are
# echoed to STDERR.
sub train_recase_model {
    # never hand the training script a first step below 4
    my $start_step = ($FIRST_STEP < 4) ? 4 : $FIRST_STEP;

    my $started_at = `date`;
    print STDERR "\n(4) Training recasing model @ ".$started_at;

    my $cmd = "$TRAIN_SCRIPT --root-dir $DIR --model-dir $DIR --first-step $start_step --alignment a --corpus $DIR/aligned --f lowercased --e cased --max-phrase-length $MAX_LEN --lm 0:3:$DIR/cased.srilm.gz:0";
    if ($SCRIPTS_ROOT_DIR) {
        $cmd .= " -scripts-root-dir $SCRIPTS_ROOT_DIR";
    }
    print STDERR "$cmd\n";

    # list context on purpose: an unstartable command yields an empty list
    my @output = `$cmd`;
    print STDERR @output;
}
# Step 11: remove intermediate files from $DIR, i.e. everything whose name
# starts with "extract", "aligned" or "lex".
sub cleanup {
    my $started_at = `date`;
    print STDERR "\n(11) Cleaning up @ ".$started_at;
    foreach my $prefix ("extract", "aligned", "lex") {
        `rm -f $DIR/$prefix*`;
    }
}

View File

@ -0,0 +1,224 @@
#!/usr/bin/perl -w
# Given a moses.ini file and an input text prepare minimized translation
# tables and a new moses.ini, so that loading of tables is much faster.
# original code by Philipp Koehn
# changes by Ondrej Bojar
use strict;
my $MAX_LENGTH = 10;
# consider phrases in input up to this length
# in other words, all phrase-tables will be truncated at least to 10 words per
# phrase
# Positional arguments, in this order: binarizer command, target directory,
# moses.ini to filter, input text.
my $binarizer = shift;
my $dir = shift;
my $config = shift;
my $input = shift;
if (!defined $dir || !defined $config || !defined $input) {
    # the usage line must list the binarizer, which is taken as first argument
    print STDERR "usage: $0 binarizer targetdir moses.ini input.text\n";
    exit 1;
}
# Work with an absolute target directory from here on.
$dir = ensure_full_path($dir);

if (-d $dir) {
    # buggy directory in place? (no info file inside it)
    unless (-e "$dir/info") {
        print STDERR "The directory $dir exists but does not belong to me. Delete $dir!\n";
        exit(1);
    }

    # already filtered? check if it can be re-used: the info file holds the
    # config on its first line and the input on its second line
    my @INFO = `cat $dir/info`;
    chop(@INFO);
    my $reusable = $INFO[0] eq $config
        && ($INFO[1] eq $input || $INFO[1].".tagged" eq $input);
    if (!$reusable) {
        print STDERR "WARNING: directory exists but does not match parameters:\n";
        print STDERR " ($INFO[0] ne $config || $INFO[1] ne $input)\n";
        exit 1;
    }
    print STDERR "The filtered model was ready in $dir, not doing anything.\n";
    exit 0;
}
# Create the target directory, then copy the config to $dir/moses.ini while
# rewriting the table entries of the ttable-file and distortion-file sections
# so that they point to the filtered copies inside $dir.
safesystem("mkdir -p $dir") or die "Can't mkdir $dir";
# get tables to be filtered (and modify config file)
# Parallel arrays, one entry per table found in the config:
#   @TABLE           original table file
#   @TABLE_WEIGHTS   third field of the table entry
#   @TABLE_FACTORS   source factor specification used for phrase lookup
#   @TABLE_NEW_NAME  filtered file to be written into $dir
# %CONSIDER_FACTORS collects the distinct source factor specifications;
# %BINARIZABLE marks (by index into @TABLE) the ttable-file entries only.
my (@TABLE,@TABLE_WEIGHTS,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%BINARIZABLE);
open(INI_OUT,">$dir/moses.ini") or die "Can't write $dir/moses.ini";
open(INI,$config) or die "Can't read $config";
while(<INI>) {
# every line read by the outer loop is copied unchanged
print INI_OUT $_;
if (/ttable-file\]/) {
# consume the table entries that follow the section header
while(1) {
my $table_spec = <INI>;
# The first line that is not a four-field table entry ends the section; it is
# echoed but not examined any further.
# NOTE(review): a section header placed directly after the last entry would
# be skipped this way, and at end of file $table_spec is undefined - confirm
# that configs always have a separating line here.
if ($table_spec !~ /^([\d\,\-]+) ([\d\-]+) (\d+) (\S+)$/) {
print INI_OUT $table_spec;
last;
}
my ($source_factor,$t,$weights,$file) = ($1,$2,$3,$4);
chomp($file);
push @TABLE, $file;
push @TABLE_WEIGHTS,$weights;
# the table just pushed is a phrase table, so it gets binarized later
$BINARIZABLE{$#TABLE}++;
my $new_name = "$dir/phrase-table.$source_factor-$t";
# the entry is written with the new file name instead of the original one
print INI_OUT "$source_factor $t $weights $new_name\n";
push @TABLE_NEW_NAME,$new_name;
$CONSIDER_FACTORS{$source_factor} = 1;
print STDERR "Considering factor $source_factor\n";
push @TABLE_FACTORS, $source_factor;
}
}
elsif (/distortion-file/) {
# same procedure for the distortion (reordering) tables
while(1) {
my $table_spec = <INI>;
if ($table_spec !~ /^([\d\,\-]+) (\S+) (\d+) (\S+)$/) {
print INI_OUT $table_spec;
last;
}
my ($factors,$t,$weights,$file) = ($1,$2,$3,$4);
# the source factors are the first field without its trailing "-<digits>"
my $source_factor = $factors;
$source_factor =~ s/\-\d+$//;
chomp($file);
push @TABLE,$file;
push @TABLE_WEIGHTS,$weights;
# the new name is the file's base name inside $dir, without ".gz"
$file =~ s/^.*\/+([^\/]+)/$1/g;
my $new_name = "$dir/$file";
$new_name =~ s/\.gz//;
print INI_OUT "$factors $t $weights $new_name\n";
push @TABLE_NEW_NAME,$new_name;
$CONSIDER_FACTORS{$source_factor} = 1;
print STDERR "Considering factor $source_factor\n";
push @TABLE_FACTORS,$source_factor;
}
}
}
close(INI);
close(INI_OUT);
# get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
# %PHRASE_USED{factor specification}{phrase} counts every word sequence of the
# input of up to $MAX_LENGTH words, reduced to the factors of each
# specification collected in %CONSIDER_FACTORS.
my %PHRASE_USED;
open(INPUT,$input) or die "Can't read $input";
while(my $sentence = <INPUT>) {
    chomp($sentence);
    my @tokens = split(/ +/,$sentence);
    foreach my $start (0 .. $#tokens) {
        foreach my $extra (0 .. $MAX_LENGTH-1) {
            my $end = $start + $extra;
            last if $end > $#tokens;
            foreach my $factor_spec (keys %CONSIDER_FACTORS) {
                my @factor_ids = split(/,/,$factor_spec);
                my $phrase = "";
                foreach my $position ($start .. $end) {
                    # factors of a word are separated by "|"
                    my @word_factors = split(/\|/,$tokens[$position]);
                    foreach my $id (@factor_ids) {
                        $phrase .= $word_factors[$id]."|";
                    }
                    # replace the trailing "|" by the word separator
                    chop($phrase);
                    $phrase .= " ";
                }
                # remove the trailing word separator
                chop($phrase);
                $PHRASE_USED{$factor_spec}{$phrase}++;
            }
        }
    }
}
close(INPUT);
# filter files
# Each table is copied to its new name, keeping only the entries whose source
# side (the text before the first " ||| ") was seen in the input. Tables
# flagged in %BINARIZABLE are then sorted and passed to the binarizer.
foreach my $i (0 .. $#TABLE) {
    my $file = $TABLE[$i];
    my $factors = $TABLE_FACTORS[$i];
    my $new_file = $TABLE_NEW_NAME[$i];
    print STDERR "filtering $file -> $new_file...\n";

    # read through zcat if the file is gzipped or only a .gz version exists
    my $openstring = "< $file";
    if ($file =~ /\.gz$/) {
        $openstring = "zcat $file |";
    }
    elsif (-e "$file.gz") {
        $openstring = "zcat $file.gz |";
    }
    open(FILE,$openstring) or die "Can't open '$openstring'";
    open(FILE_OUT,">$new_file") or die "Can't write $new_file";

    my $kept = 0;
    my $seen = 0;
    while(my $entry = <FILE>) {
        $seen++;
        my ($foreign) = split(/ \|\|\| /,$entry,2);
        $foreign =~ s/ $//;
        next unless defined($PHRASE_USED{$factors}{$foreign});
        print FILE_OUT $entry;
        $kept++;
    }
    close(FILE);
    close(FILE_OUT);

    die "No phrases found in $file!" if $seen == 0;
    printf STDERR "$kept of $seen phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$kept/$seen),'%';

    next unless $BINARIZABLE{$i};
    print STDERR "binarizing...";
    my $cmd = "cat $new_file | sort | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
    print STDERR "$cmd\n";
    print STDERR `$cmd`;
}
# Record the config and input this directory was built for. An existing
# directory without this file is rejected on the next run, and its two lines
# are compared against the arguments to decide whether the directory can be
# re-used, so failing to write it must not go unnoticed.
open(my $info_fh, '>', "$dir/info") or die "Can't write $dir/info: $!";
print {$info_fh} "$config\n$input\n";
close($info_fh) or die "Can't finish writing $dir/info: $!";
print "To run the decoder, please call:
moses -f $dir/moses.ini < $input\n";
# Run a command via system() and report problems on STDERR.
#   @_ - the command, in any form accepted by system()
# Returns true if the command exited with status 0, false otherwise.
# Exits the whole script with status 1 if the command could not be started or
# was terminated by a signal.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    my $status = $?;
    if ($status == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }
    elsif ($status & 127) {
        # The command text is passed as an argument, not as part of the format
        # string: a "%" inside the command must not be read as a conversion.
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($status & 127), ($status & 128) ? 'with' : 'without';
        exit(1);
    }
    else {
        my $exitcode = $status >> 8;
        print STDERR "Exit code: $exitcode\n" if $exitcode;
        return ! $exitcode;
    }
}
# Turn a path into an absolute one.
#   first argument - the path to convert
# A path that already starts with "/" is returned unchanged. Otherwise the
# current directory (from "pawd", falling back to "pwd") is prepended and the
# result is tidied up: "/./" and repeated slashes are collapsed, "dir/.."
# pairs are removed and trailing slashes are stripped.
sub ensure_full_path {
    my $path = shift;
    if ($path =~ /^\//) {
        return $path;
    }

    my $cwd = `pawd 2>/dev/null`;
    $cwd = `pwd` unless $cwd;
    chomp $cwd;

    $path = $cwd."/".$path;
    $path =~ s/[\r\n]//g;
    $path =~ s/\/\.\//\//g;
    $path =~ s/\/+/\//g;

    # resolve "/dir/../" components, giving up after ten passes
    for (my $pass = 0; $path =~ /\/\.\.\// && $pass < 10; $pass++) {
        $path =~ s/\/+/\//g;
        $path =~ s/\/[^\/]+\/\.\.\//\//g;
    }
    $path =~ s/\/[^\/]+\/\.\.$//;
    $path =~ s/\/+$//;
    return $path;
}

View File

@ -141,7 +141,9 @@ for(my $i=0;$i<=$#TABLE;$i++) {
print STDERR "filtering $file -> $new_file...\n";
my $openstring;
if ($file =~ /\.gz$/) {
if ($file !~ /\.gz$/ && -e "$file.gz") {
$openstring = "zcat $file.gz |";
} elsif ($file =~ /\.gz$/) {
$openstring = "zcat $file |";
} else {
$openstring = "< $file";

View File

@ -9,6 +9,8 @@
# Revision history
# 13 Feb 2007 Better handling of default values for lambda, now works with multiple
# models and lexicalized reordering
# 11 Oct 2006 Handle different input types through parameter --inputype=[0|1]
# (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi)
# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table)
@ -32,25 +34,38 @@
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn
# for each _d_istortion, _l_anguage _m_odel, _t_ranslation _m_odel and _w_ord penalty, there is a list
# of [ default value, lower bound, upper bound ]-triples. In most cases, only one triple is used,
# but the translation model has currently 5 features
# defaults for initial values and ranges are:
my $default_triples = {
# for each _d_istortion, _l_anguage _m_odel, _t_ranslation _m_odel and _w_ord penalty, there is a list
# of [ default value, lower bound, upper bound ]-triples. In most cases, only one triple is used,
# but the translation model has currently 5 features
"d" => [ [ 1.0, 0.0, 2.0 ] ],
"lm" => [ [ 1.0, 0.0, 2.0 ] ],
"tm" => [
[ 0.3, 0.0, 0.5 ],
[ 0.2, 0.0, 0.5 ],
[ 0.3, 0.0, 0.5 ],
[ 0.2, 0.0, 0.5 ],
[ 0.0, -1.0, 1.0 ],
],
"g" => [
[ 1.0, 0.0, 2.0 ],
[ 1.0, 0.0, 2.0 ],
],
"w" => [ [ 0.0, -1.0, 1.0 ] ],
# these two basic models exist even if not specified, they are
# not associated with any model file
"d" => [ [ 1.0, 0.0, 2.0 ] ], # distance-based distortion
"w" => [ [ 0.0, -1.0, 1.0 ] ], # word penalty
};
my $additional_triples = {
# if more lambda parameters for the weights are needed
# (due to additional tables) use the following values for them
"d" => [ [ 1.0, 0.0, 2.0 ], # lexicalized reordering model
[ 1.0, 0.0, 2.0 ],
[ 1.0, 0.0, 2.0 ],
[ 1.0, 0.0, 2.0 ],
[ 1.0, 0.0, 2.0 ],
[ 1.0, 0.0, 2.0 ],
[ 1.0, 0.0, 2.0 ] ],
"lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model
"g" => [ [ 1.0, 0.0, 2.0 ], # generation model
[ 1.0, 0.0, 2.0 ] ],
"tm" => [ [ 0.3, 0.0, 0.5 ], # translation model
[ 0.2, 0.0, 0.5 ],
[ 0.3, 0.0, 0.5 ],
[ 0.2, 0.0, 0.5 ],
[ 0.0,-1.0, 1.0 ] ], # ... last weight is phrase penalty
};
# moses.ini file uses FULL names for lambdas, while this training script internally (and on the command line)
@ -66,13 +81,10 @@ my $TABLECONFIG_ABBR_MAP = "ttable-file=tm lmodel-file=lm distortion-file=d gene
my %TABLECONFIG2ABBR = map {split(/=/,$_,2)} split /\s+/, $TABLECONFIG_ABBR_MAP;
# There are weights that do not correspond to any input file, they just increase the total number of lambdas we optimize
my $extra_lambdas_for_model = {
"w" => 1, # word penalty
"d" => 1, # basic distortion
};
#my $extra_lambdas_for_model = {
# "w" => 1, # word penalty
# "d" => 1, # basic distortion
#};
my $minimum_required_change_in_weights = 0.00001;
# stop if no lambda changes more than this
@ -218,7 +230,7 @@ if ($___INPUTTYPE == 1)
%FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP;
push @{$default_triples -> {"I"}}, [ 1.0, 0.0, 2.0 ];
$extra_lambdas_for_model -> {"I"} = 1; #Confusion network posterior
#$extra_lambdas_for_model -> {"I"} = 1; #Confusion network posterior
}
# Check validity of input parameters and set defaults if needed
@ -230,9 +242,6 @@ if (!defined $SCRIPTS_ROOTDIR) {
print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";
# path of script for filtering phrase tables and running the decoder
$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd;
@ -250,8 +259,8 @@ $pythonpath = "$cmertdir/python" if !defined $pythonpath;
$ENV{PYTHONPATH} = $pythonpath; # other scripts need to know
die "Not executable: $filtercmd" if ! -x $filtercmd;
my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd);
die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd;
die "Not executable: $cmertcmd" if ! -x $cmertcmd;
die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd;
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
@ -298,13 +307,13 @@ $___CONFIG = $config_abs;
# check validity of moses.ini and collect number of models and lambdas per model
# need to make a copy of $extra_lambdas_for_model, scan_config spoils it
my %copy_of_extra_lambdas_for_model = %$extra_lambdas_for_model;
my ($lambdas_per_model, $models_used) = scan_config($___CONFIG, \%copy_of_extra_lambdas_for_model);
#my %copy_of_extra_lambdas_for_model = %$extra_lambdas_for_model;
my %used_triples = %{$default_triples};
my ($models_used) = scan_config($___CONFIG);
# Parse the lambda config string and convert it to a nice structure in the same format as $default_triples
my $use_triples = undef;
# Parse the lambda config string and convert it to a nice structure in the same format as $used_triples
if (defined $___LAMBDA) {
my %specified_triples;
# interpreting lambdas from command line
foreach (split(/\s+/,$___LAMBDA)) {
my ($name,$values) = split(/:/);
@ -314,43 +323,25 @@ if (defined $___LAMBDA) {
my $start = $1;
my $min = $2;
my $max = $3;
push @{$use_triples->{$name}}, [$start, $min, $max];
push @{$specified_triples{$name}}, [$start, $min, $max];
}
else {
die "Malformed feature range definition: $name => $startminmax\n";
}
}
}
} else {
# no lambdas supplied, use the default ones, but do not forget to repeat them accordingly
# first for or inherent models
foreach my $name (keys %$extra_lambdas_for_model) {
foreach (1..$extra_lambdas_for_model->{$name}) {
die "No default weights defined for -$name"
if !defined $default_triples->{$name};
# XXX here was a deadly bug: we need a deep copy of the default values
my @copy = ();
foreach my $triple (@{$default_triples->{$name}}) {
my @copy_triple = @$triple;
push @copy, [ @copy_triple ];
}
push @{$use_triples->{$name}}, @copy;
}
# sanity checks for specified lambda triples
foreach my $name (keys %used_triples) {
die "No lambdas specified for '$name', but ".($used_triples{$name})." needed.\n"
unless defined($specified_triples{$name});
die "Number of lambdas specified for '$name' (".($specified_triples{$name}).") does not match number needed (".($used_triples{$name}).")\n"
if scalar $used_triples{$name} != scalar $specified_triples{$name};
}
# and then for all models used
foreach my $name (keys %$models_used) {
foreach (1..$models_used->{$name}) {
die "No default weights defined for -$name"
if !defined $default_triples->{$name};
# XXX here was a deadly bug: we need a deep copy of the default values
my @copy = ();
foreach my $triple (@{$default_triples->{$name}}) {
my @copy_triple = @$triple;
push @copy, [ @copy_triple ];
}
push @{$use_triples->{$name}}, @copy;
}
foreach my $name (keys %specified_triples) {
die "Lambdas specified for '$name' ".($specified_triples{$name}).", but none needed.\n"
unless defined($used_triples{$name});
}
%used_triples = %specified_triples;
}
# moses should use our config
@ -363,24 +354,6 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files.";
}
# walk through all lambdas the user wishes to optimize and check
# if the number of lambdas matches
foreach my $name (keys %$use_triples) {
my $expected_lambdas = $lambdas_per_model->{$name};
$expected_lambdas = 0 if !defined $expected_lambdas;
my $got_lambdas = defined $use_triples->{$name} ? scalar @{$use_triples->{$name}} : 0;
if ($got_lambdas != $expected_lambdas) {
if ($allow_unknown_lambdas && $expected_lambdas == 0) {
print STDERR "Allowing to optimize $name, although I have no idea what it is.\n";
} else {
print STDERR "Wrong number of lambdas for $name. Expected (given the config file): $expected_lambdas, got: $got_lambdas.
Use --allow-unknown-lambdas to optimize lambdas that you are just introducing
and I cannot validate against the models mentioned in moses.ini.\n";
exit 1;
}
}
}
# as weights are normalized in the next steps (by cmert)
# normalize initial LAMBDAs, too
my $need_to_normalize = 1;
@ -399,6 +372,7 @@ my @order_of_lambdas_from_decoder = ();
my $cwd = `pawd 2>/dev/null`;
if(!$cwd){$cwd = `pwd`;}
chomp($cwd);
safesystem("mkdir -p $___WORKING_DIR") or die "Can't mkdir $___WORKING_DIR";
{
@ -440,12 +414,11 @@ if ($continue) {
close IN;
my @newweights = split /\s+/, $newweights;
# dump_triples($use_triples);
$use_triples = store_new_lambda_values($use_triples, \@order_of_lambdas_from_decoder, \@newweights);
# dump_triples($use_triples);
#dump_triples(\%used_triples);
store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);
#dump_triples(\%used_triples);
}
if ($___FILTER_PHRASE_TABLE){
# filter the phrase tables with respect to input, use --decoder-flags
print "filtering the phrase tables... ".`date`;
@ -480,13 +453,13 @@ while(1) {
print "run $run start at ".`date`;
# In case something dies later, we might wish to have a copy
create_config($___CONFIG, "./run$run.moses.ini", $use_triples, $run, (defined$devbleu?$devbleu:"--not-estimated--"));
create_config($___CONFIG, "./run$run.moses.ini", \%used_triples, $run, (defined$devbleu?$devbleu:"--not-estimated--"));
# skip if the user wanted
if (!$skip_decoder) {
print "($run) run decoder to produce n-best lists\n";
@order_of_lambdas_from_decoder = run_decoder($use_triples, $PARAMETERS, $run, \@order_of_lambdas_from_decoder, $need_to_normalize);
@order_of_lambdas_from_decoder = run_decoder(\%used_triples, $PARAMETERS, $run, \@order_of_lambdas_from_decoder, $need_to_normalize);
$need_to_normalize = 0;
safesystem("gzip -f run*out") or die "Failed to gzip run*out";
}
@ -566,8 +539,8 @@ while(1) {
next if $visited{$name};
$visited{$name} = 1;
die "The decoder produced also some '$name' scores, but we do not know the ranges for them, no way to optimize them\n"
if !defined $use_triples->{$name};
foreach my $feature (@{$use_triples->{$name}}) {
if !defined $used_triples{$name};
foreach my $feature (@{$used_triples{$name}}) {
my ($val, $min, $max) = @$feature;
push @CURR, $val;
push @MIN, $min;
@ -624,7 +597,7 @@ while(1) {
my @newweights = split /\s+/, $bestpoint;
# update my cache of lambda values
$use_triples = store_new_lambda_values($use_triples, \@order_of_lambdas_from_decoder, \@newweights);
store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);
## additional stopping criterion: weights have not changed
my $shouldstop = 1;
@ -653,7 +626,7 @@ print "Training finished at ".`date`;
safesystem("cp init.opt run$run.init.opt") or die;
safesystem ("cp cmert.log run$run.cmert.log") or die;
create_config($___CONFIG, "./moses.ini", $use_triples, $run, $devbleu);
create_config($___CONFIG, "./moses.ini", \%used_triples, $run, $devbleu);
# just to be sure that we have the really last finished step marked
open F, "> finished_step.txt" or die "Can't mark finished step";
@ -693,7 +666,6 @@ sub store_new_lambda_values {
# print STDERR "Storing $i-th score as $name: $idx{$name}: $values->[$i]\n";
$triples->{$name}->[$idx{$name}]->[0] = $values->[$i];
}
return $triples;
}
sub dump_triples {
@ -820,7 +792,7 @@ sub create_config {
delete($P{$abbr});
delete($P{$ABBR2FULL{$abbr}});
# Then feed P with the current values
foreach my $feature (@{$use_triples->{$abbr}}) {
foreach my $feature (@{$used_triples{$abbr}}) {
my ($val, $min, $max) = @$feature;
my $name = defined $ABBR2FULL{$abbr} ? $ABBR2FULL{$abbr} : $abbr;
push @{$P{$name}}, $val;
@ -933,7 +905,6 @@ sub ensure_full_path {
sub scan_config {
my $ini = shift;
my $inishortname = $ini; $inishortname =~ s/^.*\///; # for error reporting
my $lambda_counts = shift;
# we get a pre-filled counts, because some lambdas are always needed (word penalty, for instance)
# as we walk though the ini file, we record how many extra lambdas do we need
# and finally, we report it
@ -978,12 +949,13 @@ sub scan_config {
my @flds = split / +/;
my $fn = $flds[$where_is_filename{$section}];
if (defined $fn && $fn !~ /^\s+$/) {
print "checking weight-count for $section\n";
# this is a filename! check it
if ($fn !~ /^\//) {
$error = 1;
print STDERR "$inishortname:$nr:Filename not absolute: $fn\n";
}
if (! -s $fn) {
if (! -s $fn && ! -s "$fn.gz") {
$error = 1;
print STDERR "$inishortname:$nr:File does not exist or empty: $fn\n";
}
@ -996,12 +968,20 @@ sub scan_config {
my $needlambdas = defined $where_is_lambda_count{$section} ? $flds[$where_is_lambda_count{$section}] : 1;
print STDERR "Config needs $needlambdas lambdas for $section (i.e. $shortname)\n" if $verbose;
$lambda_counts->{$shortname}+=$needlambdas;
if (!defined $___LAMBDA && (!defined $default_triples->{$shortname} || scalar(@{$default_triples->{$shortname}}) != $needlambdas)) {
print STDERR "$inishortname:$nr:Your model $shortname needs $needlambdas weights but we define the default ranges for "
.scalar(@{$default_triples->{$shortname}})." weights. Cannot use the default, you must supply lambdas by hand.\n";
if (!defined $___LAMBDA && (!defined $additional_triples->{$shortname} || scalar(@{$additional_triples->{$shortname}}) < $needlambdas)) {
print STDERR "$inishortname:$nr:Your model $shortname needs $needlambdas weights but we define the default ranges for only "
.scalar(@{$additional_triples->{$shortname}})." weights. Cannot use the default, you must supply lambdas by hand.\n";
$error = 1;
}
else {
# note: table may use less parameters than the maximum number
# of triples
for(my $lambda=0;$lambda<$needlambdas;$lambda++) {
my ($start, $min, $max)
= @{${$additional_triples->{$shortname}}[$lambda]};
push @{$used_triples{$shortname}}, [$start, $min, $max];
}
}
$defined_files{$shortname}++;
}
}
@ -1018,6 +998,6 @@ sub scan_config {
}
}
exit(1) if $error;
return ($lambda_counts, \%defined_files);
return (\%defined_files);
}

View File

@ -11,7 +11,7 @@ use Getopt::Long "GetOptions";
# -----------------------------------------------------
$ENV{"LC_ALL"} = "C";
my($_ROOT_DIR,$_CORPUS_DIR,$_GIZA_E2F,$_GIZA_F2E,$_MODEL_DIR,$_CORPUS,$_CORPUS_COMPRESSION,$_FIRST_STEP,$_LAST_STEP,$_F,$_E,$_MAX_PHRASE_LENGTH,$_LEXICAL_DIR,$_NO_LEXICAL_WEIGHTING,$_VERBOSE,$_ALIGNMENT,@_LM,$_EXTRACT_FILE,$_GIZA_OPTION,$_HELP,$_PARTS,$_DIRECTION,$_ONLY_PRINT_GIZA,$_REORDERING,$_REORDERING_SMOOTH,$_ALIGNMENT_FACTORS,$_TRANSLATION_FACTORS,$_REORDERING_FACTORS,$_GENERATION_FACTORS,$_DECODING_STEPS,$_PARALLEL, $SCRIPTS_ROOTDIR, $_FACTOR_DELIMITER);
my($_ROOT_DIR,$_CORPUS_DIR,$_GIZA_E2F,$_GIZA_F2E,$_MODEL_DIR,$_CORPUS,$_CORPUS_COMPRESSION,$_FIRST_STEP,$_LAST_STEP,$_F,$_E,$_MAX_PHRASE_LENGTH,$_LEXICAL_DIR,$_NO_LEXICAL_WEIGHTING,$_VERBOSE,$_ALIGNMENT,@_LM,$_EXTRACT_FILE,$_GIZA_OPTION,$_HELP,$_PARTS,$_DIRECTION,$_ONLY_PRINT_GIZA,$_REORDERING,$_REORDERING_SMOOTH,$_ALIGNMENT_FACTORS,$_TRANSLATION_FACTORS,$_REORDERING_FACTORS,$_GENERATION_FACTORS,$_DECODING_STEPS,$_PARALLEL, $SCRIPTS_ROOTDIR, $_FACTOR_DELIMITER,@_PHRASE_TABLE,@_REORDERING_TABLE,$_CONFIG);
my $debug = 0; # debug this script, do not delete any files in debug mode
@ -56,6 +56,9 @@ $_HELP = 1
'decoding-steps=s' => \$_DECODING_STEPS,
'scripts-root-dir=s' => \$SCRIPTS_ROOTDIR,
'factor-delimiter=s' => \$_FACTOR_DELIMITER,
'phrase-table=s' => \@_PHRASE_TABLE,
'config=s' => \$_CONFIG,
'reordering-table=s' => \@_REORDERING_TABLE,
);
if ($_HELP) {
@ -147,6 +150,8 @@ $___MODEL_DIR = $_MODEL_DIR if $_MODEL_DIR;
my $___EXTRACT_FILE = $___MODEL_DIR."/extract";
$___EXTRACT_FILE = $_EXTRACT_FILE if $_EXTRACT_FILE;
my $___CONFIG = $___ROOT_DIR."/model/moses.ini";
$___CONFIG = $_CONFIG if $_CONFIG;
my $___MAX_PHRASE_LENGTH = 7;
my $___LEXICAL_WEIGHTING = 1;
@ -167,12 +172,14 @@ if ($___LAST_STEP == 9) {
die "use --lm factor:order:filename to specify at least one language model"
if scalar @_LM == 0;
foreach my $lm (@_LM) {
my ($f, $order, $filename) = split /:/, $lm, 3;
my $type = 0; # default to srilm
my ($f, $order, $filename);
($f, $order, $filename, $type) = split /:/, $lm, 4;
die "Wrong format of --lm. Expected: --lm factor:order:filename"
if $f !~ /^[0-9]+$/ || $order !~ /^[0-9]+$/ || !defined $filename;
die "Language model file not found or empty: $filename"
if ! -s $filename;
push @___LM, [ $f, $order, $filename ];
push @___LM, [ $f, $order, $filename, $type ];
}
}
@ -196,16 +203,17 @@ $___REORDERING_SMOOTH = $_REORDERING_SMOOTH if $_REORDERING_SMOOTH;
my %REORDERING_MODEL;
my $REORDERING_LEXICAL = 0; # flag for building lexicalized reordering models
foreach my $r (split(/,/,$___REORDERING)) {
if (!( $r eq "orientation-f" ||
$r eq "orientation-fe" ||
$r eq "orientation-bidirectional-f" ||
$r eq "orientation-bidirectional-fe" ||
$r =~ s/orientation/msd/;
if (!( $r eq "msd-f" ||
$r eq "msd-fe" ||
$r eq "msd-bidirectional-f" ||
$r eq "msd-bidirectional-fe" ||
$r eq "monotonicity-f" ||
$r eq "monotonicity-fe" ||
$r eq "monotonicity-bidirectional-f" ||
$r eq "monotonicity-bidirectional-fe" ||
$r eq "distance")) {
print STDERR "unknwown reordering type: $r";
print STDERR "unknown reordering type: $r";
exit(1);
}
if ($r ne "distance") { $REORDERING_LEXICAL = 1; }
@ -225,11 +233,13 @@ $___ALIGNMENT_FACTORS = $_ALIGNMENT_FACTORS if defined($_ALIGNMENT_FACTORS);
die("format for alignment factors is \"0-0\" or \"0,1,2-0,1\", you provided $___ALIGNMENT_FACTORS\n") if $___ALIGNMENT_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*$/;
my $___TRANSLATION_FACTORS = undef;
$___TRANSLATION_FACTORS = "0-0" unless defined($_DECODING_STEPS); # single factor default
$___TRANSLATION_FACTORS = $_TRANSLATION_FACTORS if defined($_TRANSLATION_FACTORS);
die("format for translation factors is \"0-0\" or \"0-0+1-1\" or \"0-0+0,1-0,1\", you provided $___TRANSLATION_FACTORS\n")
if defined $___TRANSLATION_FACTORS && $___TRANSLATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;
my $___REORDERING_FACTORS = undef;
$___REORDERING_FACTORS = "0-0" if defined($_REORDERING) && ! defined($_DECODING_STEPS); # single factor default
$___REORDERING_FACTORS = $_REORDERING_FACTORS if defined($_REORDERING_FACTORS);
die("format for reordering factors is \"0-0\" or \"0-0+1-1\" or \"0-0+0,1-0,1\", you provided $___REORDERING_FACTORS\n")
if defined $___REORDERING_FACTORS && $___REORDERING_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;
@ -239,10 +249,10 @@ $___GENERATION_FACTORS = $_GENERATION_FACTORS if defined($_GENERATION_FACTORS);
die("format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n")
if defined $___GENERATION_FACTORS && $___GENERATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;
my $___DECODING_STEPS = $_DECODING_STEPS;
die("use --decoding-steps to specify decoding steps") if ( !defined $_DECODING_STEPS && $___LAST_STEP>=9 && $___FIRST_STEP<=9);
my $___DECODING_STEPS = "t0";
$___DECODING_STEPS = $_DECODING_STEPS if defined($_DECODING_STEPS);
die("format for decoding steps is \"t0,g0,t1,g1\", you provided $___DECODING_STEPS\n")
if defined $___DECODING_STEPS && $___DECODING_STEPS !~ /^[tg]\d+(,[tg]\d+)*$/;
if defined $_DECODING_STEPS && $_DECODING_STEPS !~ /^[tg]\d+(,[tg]\d+)*$/;
my ($factor,$factor_e,$factor_f);
@ -1029,14 +1039,14 @@ sub get_reordering {
print STDERR "(7.2) building tables @ ".`date`;
open(O,"$___EXTRACT_FILE.$factor.o.sorted")
or die "Can't read $___EXTRACT_FILE.$factor.o.sorted";
open(OF, "|gzip >$___MODEL_DIR/orientation-table.$factor.f.$___REORDERING_SMOOTH.gz")
if defined($REORDERING_MODEL{"orientation-f"});
open(OFE, "|gzip >$___MODEL_DIR/orientation-table.$factor.fe.$___REORDERING_SMOOTH.gz")
if defined($REORDERING_MODEL{"orientation-fe"});
open(OBF, "|gzip >$___MODEL_DIR/orientation-table.$factor.bi.f.$___REORDERING_SMOOTH.gz")
if defined($REORDERING_MODEL{"orientation-bidirectional-f"});
open(OBFE,"|gzip >$___MODEL_DIR/orientation-table.$factor.bi.fe.$___REORDERING_SMOOTH.gz")
if defined($REORDERING_MODEL{"orientation-bidirectional-fe"});
open(OF, "|gzip >$___MODEL_DIR/msd-table.$factor.f.$___REORDERING_SMOOTH.gz")
if defined($REORDERING_MODEL{"msd-f"});
open(OFE, "|gzip >$___MODEL_DIR/msd-table.$factor.fe.$___REORDERING_SMOOTH.gz")
if defined($REORDERING_MODEL{"msd-fe"});
open(OBF, "|gzip >$___MODEL_DIR/msd-table.$factor.bi.f.$___REORDERING_SMOOTH.gz")
if defined($REORDERING_MODEL{"msd-bidirectional-f"});
open(OBFE,"|gzip >$___MODEL_DIR/msd-table.$factor.bi.fe.$___REORDERING_SMOOTH.gz")
if defined($REORDERING_MODEL{"msd-bidirectional-fe"});
open(MF, "|gzip >$___MODEL_DIR/monotonicity-table.$factor.f.$___REORDERING_SMOOTH.gz")
if defined($REORDERING_MODEL{"monotonicity-f"});
open(MFE, "|gzip >$___MODEL_DIR/monotonicity-table.$factor.fe.$___REORDERING_SMOOTH.gz")
@ -1107,14 +1117,14 @@ sub get_reordering {
sub store_reordering_f {
my $total_previous_f = $mono_previous_f+$swap_previous_f+$other_previous_f;
my $total_following_f = $mono_following_f+$swap_following_f+$other_following_f;
if(defined($REORDERING_MODEL{"orientation-f"})) {
if(defined($REORDERING_MODEL{"msd-f"})) {
printf OF ("%s ||| %.5f %.5f %.5f\n",
$f_current,
$mono_previous_f/$total_previous_f,
$swap_previous_f/$total_previous_f,
$other_previous_f/$total_previous_f);
}
if(defined($REORDERING_MODEL{"orientation-bidirectional-f"})) {
if(defined($REORDERING_MODEL{"msd-bidirectional-f"})) {
printf OBF ("%s ||| %.5f %.5f %.5f %.5f %.5f %.5f\n",
$f_current,
$mono_previous_f/$total_previous_f,
@ -1144,14 +1154,14 @@ sub store_reordering_fe {
my $total_previous_fe = $mono_previous_fe+$swap_previous_fe+$other_previous_fe;
my $total_following_fe = $mono_following_fe+$swap_following_fe+$other_following_fe;
if(defined($REORDERING_MODEL{"orientation-fe"})) {
if(defined($REORDERING_MODEL{"msd-fe"})) {
printf OFE ("%s ||| %s ||| %.5f %.5f %.5f\n",
$f_current, $e_current,
$mono_previous_fe/$total_previous_fe,
$swap_previous_fe/$total_previous_fe,
$other_previous_fe/$total_previous_fe);
}
if(defined($REORDERING_MODEL{"orientation-bidirectional-fe"})) {
if(defined($REORDERING_MODEL{"msd-bidirectional-fe"})) {
printf OBFE ("%s ||| %s ||| %.5f %.5f %.5f %.5f %.5f %.5f\n",
$f_current, $e_current,
$mono_previous_fe/$total_previous_fe,
@ -1257,12 +1267,13 @@ sub create_ini {
&full_path(\$___MODEL_DIR);
&full_path(\$___VCB_E);
&full_path(\$___VCB_F);
open(INI,">$___MODEL_DIR/moses.ini") or die "Can't write $___MODEL_DIR/moses.ini";
`mkdir -p $___MODEL_DIR`;
open(INI,">$___CONFIG") or die("Can't write $___CONFIG");
print INI "#########################
### MOSES CONFIG FILE ###
#########################
\n";
if (defined $___TRANSLATION_FACTORS) {
print INI "# input factors\n";
print INI "[input-factors]\n";
@ -1278,7 +1289,6 @@ sub create_ini {
die "No translation steps defined, cannot prepare [input-factors] section\n";
}
my %stepsused;
print INI "\n# mapping steps
[mapping]\n";
@ -1292,11 +1302,14 @@ sub create_ini {
print INI "\n# translation tables: source-factors, target-factors, number of scores, file
[ttable-file]\n";
my $num_of_ttables = 0;
my @SPECIFIED_TABLE = @_PHRASE_TABLE;
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
$num_of_ttables++;
my $ff = $f;
$ff =~ s/\-/ /;
print INI "$ff 5 $___MODEL_DIR/phrase-table.$f.gz\n";
my $file = "$___MODEL_DIR/phrase-table.$f.gz";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
print INI "$ff 5 $file\n";
}
if ($num_of_ttables != $stepsused{"T"}) {
print STDERR "WARNING: Your [mapping-steps] require translation steps up to id $stepsused{T} but you defined translation steps 0..$num_of_ttables\n";
@ -1326,12 +1339,11 @@ sub create_ini {
print INI "\n# language models: type(srilm/irstlm), factors, order, file
[lmodel-file]\n";
foreach my $lm (@___LM) {
my ($f, $o, $fn) = @$lm;
my $type = 0; # default to srilm
my ($f, $o, $fn, $type) = @{$lm};
print INI "$type $f $o $fn\n";
}
print INI "\n\n# limit on how many phrase translations e for each phrase f are loaded
print INI "\n\n\# limit on how many phrase translations e for each phrase f are loaded
# 0 = all elements loaded
[ttable-limit]
20\n";
@ -1341,8 +1353,10 @@ print INI "\n\n# limit on how many phrase translations e for each phrase f are l
my $weight_d_count = 0;
if ($___REORDERING ne "distance") {
my $file = "# distortion (reordering) files\n[distortion-file]\n";
my $file = "# distortion (reordering) files\n\[distortion-file]\n";
my $factor_i = 0;
my @SPECIFIED_TABLE = @_REORDERING_TABLE;
foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
foreach my $r (keys %REORDERING_MODEL) {
next if $r eq "fe" || $r eq "f";
@ -1350,23 +1364,24 @@ print INI "\n\n# limit on how many phrase translations e for each phrase f are l
if ($r eq "distance") { $weight_d_count++; }
else {
my $type = $r;
$type =~ s/orientation/msd/;
$r =~ s/-bidirectional/.bi/;
$r =~ s/-f/.f/;
$r =~ s/orientation/orientation-table.$factor/;
$r =~ s/msd/msd-table.$factor/;
$r =~ s/monotonicity/monotonicity-table.$factor/;
my $w;
if ($r =~ /orient/) { $w = 3; } else { $w = 1; }
if ($r =~ /msd/) { $w = 3; } else { $w = 1; }
if ($r =~ /bi/) { $w *= 2; }
$weight_d_count += $w;
$file .= "$factor $type $w $___MODEL_DIR/$r.$___REORDERING_SMOOTH.gz\n";
my $table_file = "$___MODEL_DIR/$r.$___REORDERING_SMOOTH.gz";
$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$file .= "$factor $type $w $table_file\n";
}
}
$factor_i++;
}
print INI $file."\n";
}
print INI $file."\n";
}
else {
$weight_d_count = 1;