mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 22:45:50 +03:00
a large number of changes. besides little tweaks:
* training script now has proper default behaviour for single-factor models, * mert script has better handling of default lambda parameters that now works with lexicalized reordering models, and also with multiple models files (e.g. multiple language models) * parallel mert script is more robust when single jobs fail: detects it and resubmits the crashed (or killed) jobs * recaser added that builds on moses * filtering script added that also binarizes filtered model files (this will be eventually replaced when the lexicalized reordering model also uses the binary format) git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1210 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
e247f1da6f
commit
14839768c8
@ -19,9 +19,8 @@ use strict;
|
||||
#######################
|
||||
#Customizable parameters
|
||||
|
||||
#parameters for submiiting processes through SGE
|
||||
#NOTE: group name is ws06ossmt (with 2 's') and not ws06osmt (with 1 's')
|
||||
my $queueparameters="-l ws06ossmt=true -l mem_free=0.5G -hard";
|
||||
#parameters for submitting processes through Sun GridEngine
|
||||
my $queueparameters="-l mem_free=0.5G -hard";
|
||||
|
||||
# look for the correct pwdcmd
|
||||
my $pwdcmd = getPwdCmd();
|
||||
@ -53,6 +52,7 @@ my $nbestfile=undef;
|
||||
my $orinbestfile=undef;
|
||||
my $nbest=undef;
|
||||
my $nbestflag=0;
|
||||
my $robust=1; # undef; # resubmit crashed jobs
|
||||
my $orilogfile="";
|
||||
my $logflag="";
|
||||
my $qsubname="MOSES";
|
||||
@ -69,6 +69,7 @@ sub init(){
|
||||
'debug'=>\$dbg,
|
||||
'jobs=i'=>\$jobs,
|
||||
'decoder=s'=> \$mosescmd,
|
||||
'robust' => \$robust,
|
||||
'decoder-parameters=s'=> \$mosesparameters,
|
||||
'logfile=s'=> \$orilogfile,
|
||||
'i|inputfile|input-file=s'=> \$orifile,
|
||||
@ -284,8 +285,16 @@ preparing_script();
|
||||
#launching process through the queue
|
||||
my @sgepids =();
|
||||
|
||||
my $failure=0;
|
||||
foreach my $idx (@idxlist){
|
||||
# if robust switch is used, redo jobs that crashed
|
||||
my @idx_todo = ();
|
||||
foreach (@idxlist) { push @idx_todo,$_; }
|
||||
|
||||
my $looped_once = 0;
|
||||
while((!$robust && !$looped_once) || ($robust && scalar @idx_todo)) {
|
||||
$looped_once = 1;
|
||||
|
||||
my $failure=0;
|
||||
foreach my $idx (@idx_todo){
|
||||
print STDERR "qsub $queueparameters -b no -j yes -o $qsubout$idx -e $qsuberr$idx -N $qsubname$idx ${jobscript}${idx}.bash\n" if $dbg;
|
||||
|
||||
$cmd="qsub $queueparameters -b no -j yes -o $qsubout$idx -e $qsuberr$idx -N $qsubname$idx ${jobscript}${idx}.bash >& ${jobscript}${idx}.log";
|
||||
@ -302,16 +311,17 @@ foreach my $idx (@idxlist){
|
||||
close(IN);
|
||||
|
||||
push @sgepids, $id;
|
||||
}
|
||||
}
|
||||
|
||||
#waiting until all jobs have finished
|
||||
my $hj = "-hold_jid " . join(" -hold_jid ", @sgepids);
|
||||
#waiting until all jobs have finished
|
||||
my $hj = "-hold_jid " . join(" -hold_jid ", @sgepids);
|
||||
|
||||
if ($old_sge) {
|
||||
if ($old_sge) {
|
||||
# we need to implement our own waiting script
|
||||
safesystem("echo 'date' > sync_workaround_script.sh") or kill_all_and_quit();
|
||||
|
||||
my $pwd = `$pwdcmd`; chomp $pwd;
|
||||
|
||||
my $checkpointfile = "sync_workaround_checkpoint";
|
||||
|
||||
# delete previous checkpoint, if left from previous runs
|
||||
@ -344,18 +354,36 @@ if ($old_sge) {
|
||||
print STDERR "Extra wait ($nr) for possibly unfinished processes.\n";
|
||||
sleep 10;
|
||||
}
|
||||
} else {
|
||||
} else {
|
||||
# use the -sync option for qsub
|
||||
$cmd="qsub $queueparameters -sync y $hj -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls >& $qsubname.W.log";
|
||||
safesystem($cmd) or kill_all_and_quit();
|
||||
|
||||
$failure=&check_exit_status();
|
||||
}
|
||||
|
||||
kill_all_and_quit() if $failure && !$robust;
|
||||
|
||||
# check if some translations failed
|
||||
my @idx_still_todo = check_translation();
|
||||
if ($robust) {
|
||||
# if robust, redo crashed jobs
|
||||
if ((scalar @idx_still_todo) == (scalar @idxlist)) {
|
||||
# ... but not if all crashed
|
||||
print STDERR "everything crashed, not trying to resubmit jobs\n";
|
||||
kill_all_and_quit();
|
||||
}
|
||||
@idx_todo = @idx_still_todo;
|
||||
}
|
||||
else {
|
||||
if (scalar (@idx_still_todo)) {
|
||||
print STDERR "some jobs crashed: ".join(" ",@idx_still_todo)."\n";
|
||||
kill_all_and_quit();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
kill_all_and_quit() if $failure;
|
||||
|
||||
check_translation();
|
||||
|
||||
#concatenating translations and removing temporary files
|
||||
concatenate_1best();
|
||||
concatenate_logs() if $logflag;
|
||||
@ -509,7 +537,8 @@ sub check_translation(){
|
||||
#checking if all sentences were translated
|
||||
my $inputN;
|
||||
my $outputN;
|
||||
foreach my $idx (@idxlist){
|
||||
my @failed = ();
|
||||
foreach my $idx (@idx_todo){
|
||||
if ($inputtype==0){#text input
|
||||
chomp($inputN=`wc -l ${testfile}.$splitpfx$idx | cut -d' ' -f1`);
|
||||
}
|
||||
@ -522,10 +551,10 @@ sub check_translation(){
|
||||
print STDERR "Split ($idx) were not entirely translated\n";
|
||||
print STDERR "outputN=$outputN inputN=$inputN\n";
|
||||
print STDERR "outputfile=${testfile}.$splitpfx$idx.trans inputfile=${testfile}.$splitpfx$idx\n";
|
||||
return 0;
|
||||
push @failed,$idx;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
return @failed;
|
||||
}
|
||||
|
||||
sub remove_temporary_files(){
|
||||
|
78
scripts/recaser/recase.perl
Executable file
78
scripts/recaser/recase.perl
Executable file
@ -0,0 +1,78 @@
|
||||
#!/usr/bin/perl -w

# Recase lowercased text by running it through a moses recasing model, then
# apply a few surface rules: uppercase the first word of every line, the word
# after a period, and (optionally) most words of headline sentences.

use strict;
use Getopt::Long "GetOptions";

my ($SRC,$INFILE,$RECASE_MODEL);
my $MOSES = "moses";
die("recase.perl --in file --model ini-file > out")
    unless &GetOptions('in=s' => \$INFILE,
                       'headline=s' => \$SRC,
                       'moses=s' => \$MOSES,
                       'model=s' => \$RECASE_MODEL)
    && defined($INFILE)
    && defined($RECASE_MODEL);

# lowercase even in headline
# The entries are regular-expression fragments ("al-.+" and "el-.+" are meant
# to cover any word with that prefix), so they are compiled into one anchored
# pattern instead of being used as literal hash keys, which never matched
# those two entries.
my @ALWAYS_LOWER = ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with");
my $always_lower_alternatives = join("|", @ALWAYS_LOWER);
my $ALWAYS_LOWER_RE = qr/\A(?:$always_lower_alternatives)\z/;

# find out about the headlines
# @HEADLINE gets one flag per <seg> line of the source file: 1 if the segment
# appears after a <hl> tag and before the matching closing tag, 0 otherwise.
my @HEADLINE;
if (defined($SRC)) {
    open(my $src_fh, '<', $SRC) or die "Can't read headline source $SRC: $!";
    my $headline_flag = 0;
    while (my $src_line = <$src_fh>) {
        $headline_flag = 1 if $src_line =~ /<hl>/;
        $headline_flag = 0 if $src_line =~ /<.hl>/;
        next unless $src_line =~ /^<seg/;
        push @HEADLINE, $headline_flag;
    }
    close($src_fh);
}

my $sentence = 0;
my $infile = $INFILE;
$infile =~ s/[\.\/]/_/g;

# The command is passed as a single string on purpose: --moses may carry
# extra arguments, so it has to go through the shell as before.
open(my $model_fh, '-|', "$MOSES -f $RECASE_MODEL -i $INFILE -dl 1")
    or die "Can't run $MOSES: $!";
while (my $line = <$model_fh>) {
    chomp $line;
    $line =~ s/\s+$//;
    my @WORD = split(/ /, $line);

    # An empty output line has no words; skip the casing rules so that no
    # undefined first word is created (the blank line is still printed).
    if (@WORD) {
        # uppercase initial word
        uppercase(\$WORD[0]);

        # uppercase after period
        for my $i (1 .. $#WORD) {
            uppercase(\$WORD[$i]) if $WORD[$i-1] eq '.';
        }

        # uppercase headlines
        if (defined($SRC) && $HEADLINE[$sentence]) {
            foreach my $word (@WORD) {
                uppercase(\$word) unless $word =~ $ALWAYS_LOWER_RE;
            }
        }
    }

    # output
    print join(" ", @WORD), "\n";
    $sentence++;
}
# close() on a pipe reports a non-zero exit status of the child
close($model_fh) or warn "$MOSES did not finish cleanly (status $?)\n";

`rm -rf /tmp/filter.$infile`;
|
||||
|
||||
# Uppercase the first character of the string that the given scalar
# reference points to, modifying it in place.
#   $_[0]  reference to the word to change
# The first transliteration handles ASCII letters, the second the accented
# range spelled out in the tr/// lists below.
sub uppercase {
    my $word_ref = $_[0];
    substr(${$word_ref}, 0, 1) =~ tr/a-z/A-Z/;
    substr(${$word_ref}, 0, 1) =~ tr/à-þ/À-Þ/;
}
|
98
scripts/recaser/train-recaser.perl
Executable file
98
scripts/recaser/train-recaser.perl
Executable file
@ -0,0 +1,98 @@
|
||||
#!/usr/bin/perl -w

# Train a recasing model: build a language model on the cased corpus, create
# a trivially aligned lowercased/cased parallel corpus from it, and hand both
# to the phrase-model training script.

use strict;
use Getopt::Long "GetOptions";

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

# apply switches
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG);
my $NGRAM_COUNT = "ngram-count";
my $TRAIN_SCRIPT = "train-factored-phrase-model.perl";
my $MAX_LEN = 1;
my $FIRST_STEP = 1;
my $LAST_STEP = 11;
die("train-recaser.perl --dir recaser --corpus cased")
    unless &GetOptions('first-step=i' => \$FIRST_STEP,
                       'last-step=i' => \$LAST_STEP,
                       'corpus=s' => \$CORPUS,
                       'config=s' => \$CONFIG,
                       'dir=s' => \$DIR,
                       'ngram-count=s' => \$NGRAM_COUNT,
                       'train-script=s' => \$TRAIN_SCRIPT,
                       'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
                       'max-len=i' => \$MAX_LEN);

# check and set default to unset parameters
die("please specify working dir --dir") unless defined($DIR);
# The corpus is read by train_lm (step 2) and by prepare_data (step 3), so it
# is required whenever either of them is going to run; the conditions mirror
# the guards in the main loop below.
my $runs_train_lm     = $FIRST_STEP <= 2;
my $runs_prepare_data = $FIRST_STEP <= 3 && $LAST_STEP >= 3;
die("please specify --corpus")
    if !defined($CORPUS) && ($runs_train_lm || $runs_prepare_data);

# main loop
`mkdir -p $DIR`;
die("could not create working dir $DIR") if $? != 0;
&truecase()           if 0 && $FIRST_STEP == 1;   # disabled: truecase is a stub
&train_lm()           if $FIRST_STEP <= 2;
&prepare_data()       if $FIRST_STEP <= 3 && $LAST_STEP >= 3;
&train_recase_model() if $FIRST_STEP <= 10 && $LAST_STEP >= 3;
&cleanup()            if $LAST_STEP == 11;
|
||||
|
||||
### subs ###
|
||||
|
||||
# Placeholder for a truecasing step; nothing is implemented yet.
sub truecase {
    # to do
    return;
}
|
||||
|
||||
# Step 2: run the n-gram counting tool on the cased corpus and write the
# language model to $DIR/cased.srilm.gz. The command line and whatever the
# tool prints on stdout are echoed to STDERR.
sub train_lm {
    my $started_at = `date`;
    print STDERR "(2) Train language model on cased data @ " . $started_at;

    my $lm_command = "$NGRAM_COUNT -text $CORPUS -lm $DIR/cased.srilm.gz -interpolate -kndiscount";
    print STDERR "$lm_command\n";

    my @tool_output = `$lm_command`;
    print STDERR @tool_output;
}
|
||||
|
||||
# Step 3: turn the cased corpus into training data for the recasing model.
# Reads $CORPUS and writes three line-parallel files into $DIR:
#   aligned.cased       the cleaned-up original line
#   aligned.lowercased  the same line lowercased
#   aligned.a           a one-to-one word alignment ("0-0 1-1 ...")
# Dies if any of the files cannot be opened or written, because a silent
# failure here would train the model on empty data.
sub prepare_data {
    print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;
    open(my $corpus_fh, '<', $CORPUS)
        or die "Can't read corpus $CORPUS: $!";
    open(my $cased_fh, '>', "$DIR/aligned.cased")
        or die "Can't write $DIR/aligned.cased: $!";
    print "$DIR/aligned.lowercased\n";
    open(my $lowercased_fh, '>', "$DIR/aligned.lowercased")
        or die "Can't write $DIR/aligned.lowercased: $!";
    open(my $alignment_fh, '>', "$DIR/aligned.a")
        or die "Can't write $DIR/aligned.a: $!";

    while (my $line = <$corpus_fh>) {
        next if length($line)>2000;     # skip overly long lines
        $line =~ s/\x{0}//g;            # drop NUL characters
        $line =~ s/\|//g;               # '|' is the factor separator in moses
        $line =~ s/ +/ /g;              # collapse runs of spaces
        $line =~ s/^ //;                # no leading space
        $line =~ s/ [\r\n]*$/\n/;       # no trailing space before the line end
        next if $line =~ /^$/;

        print {$cased_fh} $line;
        print {$lowercased_fh} lc($line);

        # every token is aligned to itself: "0-0 1-1 2-2 ..."
        my @tokens = split(' ', $line);
        print {$alignment_fh} join("", map { "$_-$_ " } 0 .. $#tokens), "\n";
    }

    close($corpus_fh);
    # buffered write errors only show up when the handle is closed
    close($cased_fh)      or die "Can't finish writing $DIR/aligned.cased: $!";
    close($lowercased_fh) or die "Can't finish writing $DIR/aligned.lowercased: $!";
    close($alignment_fh)  or die "Can't finish writing $DIR/aligned.a: $!";
}
|
||||
|
||||
# Steps 4 and up: call the phrase-model training script on the aligned
# lowercased/cased data prepared in $DIR. The training script is never asked
# to start before its step 4, whatever --first-step says. The command line
# and the script's stdout are echoed to STDERR.
sub train_recase_model {
    my $start_step = $FIRST_STEP < 4 ? 4 : $FIRST_STEP;

    my $started_at = `date`;
    print STDERR "\n(4) Training recasing model @ " . $started_at;

    my $train_command = "$TRAIN_SCRIPT --root-dir $DIR --model-dir $DIR --first-step $start_step --alignment a --corpus $DIR/aligned --f lowercased --e cased --max-phrase-length $MAX_LEN --lm 0:3:$DIR/cased.srilm.gz:0";
    if ($SCRIPTS_ROOT_DIR) {
        $train_command .= " -scripts-root-dir $SCRIPTS_ROOT_DIR";
    }
    print STDERR "$train_command\n";

    my @script_output = `$train_command`;
    print STDERR @script_output;
}
|
||||
|
||||
# Step 11: remove the intermediate files (extract*, aligned*, lex*) from the
# working directory $DIR.
sub cleanup {
    my $started_at = `date`;
    print STDERR "\n(11) Cleaning up @ " . $started_at;

    foreach my $prefix ("extract", "aligned", "lex") {
        `rm -f $DIR/$prefix*`;
    }
}
|
224
scripts/training/filter-and-binarize-model-given-input.pl
Executable file
224
scripts/training/filter-and-binarize-model-given-input.pl
Executable file
@ -0,0 +1,224 @@
|
||||
#!/usr/bin/perl -w
# Given a moses.ini file and an input text prepare minimized translation
# tables and a new moses.ini, so that loading of tables is much faster.

# original code by Philipp Koehn
# changes by Ondrej Bojar

use strict;

my $MAX_LENGTH = 10;
  # consider phrases in input up to this length
  # in other words, all phrase-tables will be truncated at least to 10 words per
  # phrase

# Positional arguments, in this order. Because they are shifted in sequence,
# a missing earlier argument always leaves $input undefined as well.
my $binarizer = shift;
my $dir = shift;
my $config = shift;
my $input = shift;

if (!defined $dir || !defined $config || !defined $input) {
  print STDERR "usage: filter-and-binarize-model-given-input.pl binarizer targetdir moses.ini input.text\n";
  exit 1;
}

$dir = ensure_full_path($dir);

# buggy directory in place?
# A directory created by this script always contains an "info" file, which is
# written as the very last step.
if (-d $dir && ! -e "$dir/info") {
  print STDERR "The directory $dir exists but does not belong to me. Delete $dir!\n";
  exit(1);
}

# already filtered? check if it can be re-used
if (-d $dir) {
  # info holds the config name on the first line and the input name on the second
  my @INFO = `cat $dir/info`;
  chomp(@INFO);
  if($INFO[0] ne $config
     || ($INFO[1] ne $input &&
	 $INFO[1].".tagged" ne $input)) {
    print STDERR "WARNING: directory exists but does not match parameters:\n";
    print STDERR "  ($INFO[0] ne $config || $INFO[1] ne $input)\n";
    exit 1;
  }
  print STDERR "The filtered model was ready in $dir, not doing anything.\n";
  exit 0;
}
|
||||
|
||||
|
||||
# filter the translation and distortion tables
safesystem("mkdir -p $dir") or die "Can't mkdir $dir";

# get tables to be filtered (and modify config file)
# The config is copied line by line to $dir/moses.ini; the table lines that
# follow a [ttable-file] or distortion-file header are rewritten to point at
# the filtered copies in $dir. For every such table we record the original
# file, its number of weights, its source factors and its new name.
# A table list ends at the first line that does not look like a table entry.
# That line is then examined like any other line, so a section header that
# follows a table list without a blank line in between is still recognised,
# and reaching the end of the file inside a list is harmless.
my (@TABLE,@TABLE_WEIGHTS,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%BINARIZABLE);
open(my $ini_out_fh, '>', "$dir/moses.ini") or die "Can't write $dir/moses.ini";
open(my $ini_fh, '<', $config) or die "Can't read $config";
my $ini_section = "";   # "ttable", "distortion" or "" (anything else)
while (my $ini_line = <$ini_fh>) {
    if ($ini_section eq "ttable"
        && $ini_line =~ /^([\d\,\-]+) ([\d\-]+) (\d+) (\S+)$/) {
        my ($source_factor,$t,$weights,$file) = ($1,$2,$3,$4);

        push @TABLE, $file;
        push @TABLE_WEIGHTS,$weights;
        $BINARIZABLE{$#TABLE}++;   # only phrase tables are binarized

        my $new_name = "$dir/phrase-table.$source_factor-$t";
        print {$ini_out_fh} "$source_factor $t $weights $new_name\n";
        push @TABLE_NEW_NAME,$new_name;

        $CONSIDER_FACTORS{$source_factor} = 1;
        print STDERR "Considering factor $source_factor\n";
        push @TABLE_FACTORS, $source_factor;
        next;
    }
    if ($ini_section eq "distortion"
        && $ini_line =~ /^([\d\,\-]+) (\S+) (\d+) (\S+)$/) {
        my ($factors,$t,$weights,$file) = ($1,$2,$3,$4);
        # the source side is everything before the final "-<digits>"
        my $source_factor = $factors;
        $source_factor =~ s/\-\d+$//;

        push @TABLE,$file;
        push @TABLE_WEIGHTS,$weights;

        # keep the file name only and store the copy uncompressed
        $file =~ s/^.*\/+([^\/]+)/$1/g;
        my $new_name = "$dir/$file";
        $new_name =~ s/\.gz//;
        print {$ini_out_fh} "$factors $t $weights $new_name\n";
        push @TABLE_NEW_NAME,$new_name;

        $CONSIDER_FACTORS{$source_factor} = 1;
        print STDERR "Considering factor $source_factor\n";
        push @TABLE_FACTORS,$source_factor;
        next;
    }

    # every other line is copied unchanged and decides which section we are in
    print {$ini_out_fh} $ini_line;
    if ($ini_line =~ /ttable-file\]/) {
        $ini_section = "ttable";
    }
    elsif ($ini_line =~ /distortion-file/) {
        $ini_section = "distortion";
    }
    else {
        $ini_section = "";
    }
}
close($ini_fh);
close($ini_out_fh) or die "Can't finish writing $dir/moses.ini: $!";
|
||||
|
||||
|
||||
# get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
# %PHRASE_USED maps a factor specification (a key of %CONSIDER_FACTORS, e.g.
# "0" or "0,1") to the set of source phrases of the input, where every word
# is reduced to the listed factors joined by "|".
my %PHRASE_USED;
open(INPUT,$input) or die "Can't read $input";
while (defined(my $sentence = <INPUT>)) {
    chomp($sentence);
    my @tokens = split(/ +/,$sentence);
    foreach my $start (0 .. $#tokens) {
        # number of words that may follow the start word inside one phrase
        my $max_extra = $#tokens - $start;
        $max_extra = $MAX_LENGTH - 1 if $max_extra > $MAX_LENGTH - 1;
        foreach my $extra (0 .. $max_extra) {
            foreach my $factor_spec (keys %CONSIDER_FACTORS) {
                my @factor_ids = split(/,/, $factor_spec);
                my $phrase = "";
                foreach my $position ($start .. $start + $extra) {
                    my @word_factors = split(/\|/,$tokens[$position]);
                    foreach my $factor_id (@factor_ids) {
                        $phrase .= $word_factors[$factor_id]."|";
                    }
                    chop($phrase);      # remove the last "|"
                    $phrase .= " ";
                }
                chop($phrase);          # remove the last " "
                $PHRASE_USED{$factor_spec}{$phrase}++;
            }
        }
    }
}
close(INPUT);
|
||||
|
||||
# filter files
# Every table is copied to its new name, keeping only the entries whose
# source side (the text before the first " ||| ") occurs in the input.
# Tables marked in %BINARIZABLE are then sorted and fed to the binarizer.
foreach my $table_idx (0 .. $#TABLE) {
    my $kept = 0;
    my $seen = 0;
    my $source_table = $TABLE[$table_idx];
    my $factor_spec = $TABLE_FACTORS[$table_idx];
    my $filtered_table = $TABLE_NEW_NAME[$table_idx];
    print STDERR "filtering $source_table -> $filtered_table...\n";

    # a gzipped table is read through zcat, also when the config names the
    # file without the .gz suffix but only the compressed file exists
    my $openstring;
    if ($source_table =~ /\.gz$/) {
        $openstring = "zcat $source_table |";
    }
    elsif (-e "$source_table.gz") {
        $openstring = "zcat $source_table.gz |";
    }
    else {
        $openstring = "< $source_table";
    }

    open(FILE,$openstring) or die "Can't open '$openstring'";
    open(FILE_OUT,">$filtered_table") or die "Can't write $filtered_table";

    while (defined(my $entry = <FILE>)) {
        $seen++;
        my ($source_side) = split(/ \|\|\| /,$entry,2);
        $source_side =~ s/ $//;
        next unless defined($PHRASE_USED{$factor_spec}{$source_side});
        print FILE_OUT $entry;
        $kept++;
    }
    close(FILE);
    close(FILE_OUT);

    die "No phrases found in $source_table!" if $seen == 0;
    printf STDERR "$kept of $seen phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$kept/$seen),'%';

    next unless $BINARIZABLE{$table_idx};
    print STDERR "binarizing...";
    my $binarize_command = "cat $filtered_table | sort | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$table_idx] -out $filtered_table";
    print STDERR $binarize_command."\n";
    print STDERR `$binarize_command`;
}
|
||||
|
||||
# Record which config and input this directory was built for. The start-up
# check of this script reads the file back to decide whether the directory
# can be re-used, so failing to write it must not go unnoticed.
open(my $info_fh, '>', "$dir/info") or die "Can't write $dir/info: $!";
print {$info_fh} "$config\n$input\n";
close($info_fh) or die "Can't finish writing $dir/info: $!";


print "To run the decoder, please call:\nmoses -f $dir/moses.ini < $input\n";
|
||||
|
||||
# Run a command with system() and report what happened on STDERR.
#   @_       the command, passed on to system() unchanged
#   returns  true if the command exited with status 0, false otherwise
# The whole script exits with status 1 if the command could not be started
# or was killed by a signal.
sub safesystem {
    my $command_line = "@_";
    print STDERR "Executing: $command_line\n";
    system(@_);
    if ($? == -1) {
        print STDERR "Failed to execute: $command_line\n $!\n";
        exit(1);
    }
    if ($? & 127) {
        # the command text is passed as an argument, not as part of the
        # format, so a '%' inside the command cannot garble the message
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            $command_line, ($? & 127), ($? & 128) ? 'with' : 'without';
        exit(1);
    }
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
}
|
||||
# Turn a path into an absolute one.
#   $_[0]    the path
#   returns  the path itself if it already starts with "/"; otherwise the
#            current directory (as printed by pawd, or pwd if pawd prints
#            nothing) plus the path, with "/./", repeated slashes and
#            "dir/.." pairs removed and without a trailing slash
sub ensure_full_path {
    my ($path) = @_;
    if ($path =~ /^\//) {
        return $path;
    }

    my $cwd = `pawd 2>/dev/null` || `pwd`;
    chomp $cwd;

    my $full = join("/", $cwd, $path);
    $full =~ s/[\r\n]//g;
    $full =~ s/\/\.\//\//g;
    $full =~ s/\/+/\//g;

    # resolve "dir/../" pairs; the counter bounds the number of passes
    my $passes = 0;
    while ($full =~ /\/\.\.\// && $passes++ < 10) {
        $full =~ s/\/+/\//g;
        $full =~ s/\/[^\/]+\/\.\.\//\//g;
    }
    $full =~ s/\/[^\/]+\/\.\.$//;
    $full =~ s/\/+$//;
    return $full;
}
|
@ -141,7 +141,9 @@ for(my $i=0;$i<=$#TABLE;$i++) {
|
||||
print STDERR "filtering $file -> $new_file...\n";
|
||||
|
||||
my $openstring;
|
||||
if ($file =~ /\.gz$/) {
|
||||
if ($file !~ /\.gz$/ && -e "$file.gz") {
|
||||
$openstring = "zcat $file.gz |";
|
||||
} elsif ($file =~ /\.gz$/) {
|
||||
$openstring = "zcat $file |";
|
||||
} else {
|
||||
$openstring = "< $file";
|
||||
|
@ -9,6 +9,8 @@
|
||||
|
||||
# Revision history
|
||||
|
||||
# 13 Feb 2007 Better handling of default values for lambda, now works with multiple
|
||||
# models and lexicalized reordering
|
||||
# 11 Oct 2006 Handle different input types through parameter --inputype=[0|1]
|
||||
# (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi)
|
||||
# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table)
|
||||
@ -32,25 +34,38 @@
|
||||
# 13 Oct 2004 Use alternative decoders (DWC)
|
||||
# Original version by Philipp Koehn
|
||||
|
||||
|
||||
# for each _d_istortion, _l_anguage _m_odel, _t_ranslation _m_odel and _w_ord penalty, there is a list
|
||||
# of [ default value, lower bound, upper bound ]-triples. In most cases, only one triple is used,
|
||||
# but the translation model has currently 5 features
|
||||
|
||||
# defaults for initial values and ranges are:
|
||||
|
||||
my $default_triples = {
|
||||
# for each _d_istortion, _l_anguage _m_odel, _t_ranslation _m_odel and _w_ord penalty, there is a list
|
||||
# of [ default value, lower bound, upper bound ]-triples. In most cases, only one triple is used,
|
||||
# but the translation model has currently 5 features
|
||||
"d" => [ [ 1.0, 0.0, 2.0 ] ],
|
||||
"lm" => [ [ 1.0, 0.0, 2.0 ] ],
|
||||
"tm" => [
|
||||
[ 0.3, 0.0, 0.5 ],
|
||||
[ 0.2, 0.0, 0.5 ],
|
||||
[ 0.3, 0.0, 0.5 ],
|
||||
[ 0.2, 0.0, 0.5 ],
|
||||
[ 0.0, -1.0, 1.0 ],
|
||||
],
|
||||
"g" => [
|
||||
[ 1.0, 0.0, 2.0 ],
|
||||
[ 1.0, 0.0, 2.0 ],
|
||||
],
|
||||
"w" => [ [ 0.0, -1.0, 1.0 ] ],
|
||||
# these two basic models exist even if not specified, they are
|
||||
# not associated with any model file
|
||||
"d" => [ [ 1.0, 0.0, 2.0 ] ], # distance-based distortion
|
||||
"w" => [ [ 0.0, -1.0, 1.0 ] ], # word penalty
|
||||
};
|
||||
|
||||
my $additional_triples = {
|
||||
# if more lambda parameters for the weights are needed
|
||||
# (due to additional tables) use the following values for them
|
||||
"d" => [ [ 1.0, 0.0, 2.0 ], # lexicalized reordering model
|
||||
[ 1.0, 0.0, 2.0 ],
|
||||
[ 1.0, 0.0, 2.0 ],
|
||||
[ 1.0, 0.0, 2.0 ],
|
||||
[ 1.0, 0.0, 2.0 ],
|
||||
[ 1.0, 0.0, 2.0 ],
|
||||
[ 1.0, 0.0, 2.0 ] ],
|
||||
"lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model
|
||||
"g" => [ [ 1.0, 0.0, 2.0 ], # generation model
|
||||
[ 1.0, 0.0, 2.0 ] ],
|
||||
"tm" => [ [ 0.3, 0.0, 0.5 ], # translation model
|
||||
[ 0.2, 0.0, 0.5 ],
|
||||
[ 0.3, 0.0, 0.5 ],
|
||||
[ 0.2, 0.0, 0.5 ],
|
||||
[ 0.0,-1.0, 1.0 ] ], # ... last weight is phrase penalty
|
||||
};
|
||||
|
||||
# moses.ini file uses FULL names for lambdas, while this training script internally (and on the command line)
|
||||
@ -66,13 +81,10 @@ my $TABLECONFIG_ABBR_MAP = "ttable-file=tm lmodel-file=lm distortion-file=d gene
|
||||
my %TABLECONFIG2ABBR = map {split(/=/,$_,2)} split /\s+/, $TABLECONFIG_ABBR_MAP;
|
||||
|
||||
# There are weights that do not correspond to any input file, they just increase the total number of lambdas we optimize
|
||||
my $extra_lambdas_for_model = {
|
||||
"w" => 1, # word penalty
|
||||
"d" => 1, # basic distortion
|
||||
};
|
||||
|
||||
|
||||
|
||||
#my $extra_lambdas_for_model = {
|
||||
# "w" => 1, # word penalty
|
||||
# "d" => 1, # basic distortion
|
||||
#};
|
||||
|
||||
my $minimum_required_change_in_weights = 0.00001;
|
||||
# stop if no lambda changes more than this
|
||||
@ -218,7 +230,7 @@ if ($___INPUTTYPE == 1)
|
||||
%FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP;
|
||||
|
||||
push @{$default_triples -> {"I"}}, [ 1.0, 0.0, 2.0 ];
|
||||
$extra_lambdas_for_model -> {"I"} = 1; #Confusion network posterior
|
||||
#$extra_lambdas_for_model -> {"I"} = 1; #Confusion network posterior
|
||||
}
|
||||
|
||||
# Check validity of input parameters and set defaults if needed
|
||||
@ -230,9 +242,6 @@ if (!defined $SCRIPTS_ROOTDIR) {
|
||||
|
||||
print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";
|
||||
|
||||
|
||||
|
||||
|
||||
# path of script for filtering phrase tables and running the decoder
|
||||
$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd;
|
||||
|
||||
@ -250,8 +259,8 @@ $pythonpath = "$cmertdir/python" if !defined $pythonpath;
|
||||
|
||||
$ENV{PYTHONPATH} = $pythonpath; # other scripts need to know
|
||||
|
||||
|
||||
die "Not executable: $filtercmd" if ! -x $filtercmd;
|
||||
my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd);
|
||||
die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd;
|
||||
die "Not executable: $cmertcmd" if ! -x $cmertcmd;
|
||||
die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd;
|
||||
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
|
||||
@ -298,13 +307,13 @@ $___CONFIG = $config_abs;
|
||||
|
||||
# check validity of moses.ini and collect number of models and lambdas per model
|
||||
# need to make a copy of $extra_lambdas_for_model, scan_config spoils it
|
||||
my %copy_of_extra_lambdas_for_model = %$extra_lambdas_for_model;
|
||||
my ($lambdas_per_model, $models_used) = scan_config($___CONFIG, \%copy_of_extra_lambdas_for_model);
|
||||
#my %copy_of_extra_lambdas_for_model = %$extra_lambdas_for_model;
|
||||
my %used_triples = %{$default_triples};
|
||||
my ($models_used) = scan_config($___CONFIG);
|
||||
|
||||
|
||||
# Parse the lambda config string and convert it to a nice structure in the same format as $default_triples
|
||||
my $use_triples = undef;
|
||||
# Parse the lambda config string and convert it to a nice structure in the same format as $used_triples
|
||||
if (defined $___LAMBDA) {
|
||||
my %specified_triples;
|
||||
# interpreting lambdas from command line
|
||||
foreach (split(/\s+/,$___LAMBDA)) {
|
||||
my ($name,$values) = split(/:/);
|
||||
@ -314,43 +323,25 @@ if (defined $___LAMBDA) {
|
||||
my $start = $1;
|
||||
my $min = $2;
|
||||
my $max = $3;
|
||||
push @{$use_triples->{$name}}, [$start, $min, $max];
|
||||
push @{$specified_triples{$name}}, [$start, $min, $max];
|
||||
}
|
||||
else {
|
||||
die "Malformed feature range definition: $name => $startminmax\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
# no lambdas supplied, use the default ones, but do not forget to repeat them accordingly
|
||||
# first for or inherent models
|
||||
foreach my $name (keys %$extra_lambdas_for_model) {
|
||||
foreach (1..$extra_lambdas_for_model->{$name}) {
|
||||
die "No default weights defined for -$name"
|
||||
if !defined $default_triples->{$name};
|
||||
# XXX here was a deadly bug: we need a deep copy of the default values
|
||||
my @copy = ();
|
||||
foreach my $triple (@{$default_triples->{$name}}) {
|
||||
my @copy_triple = @$triple;
|
||||
push @copy, [ @copy_triple ];
|
||||
}
|
||||
push @{$use_triples->{$name}}, @copy;
|
||||
}
|
||||
# sanity checks for specified lambda triples
|
||||
foreach my $name (keys %used_triples) {
|
||||
die "No lambdas specified for '$name', but ".($used_triples{$name})." needed.\n"
|
||||
unless defined($specified_triples{$name});
|
||||
die "Number of lambdas specified for '$name' (".($specified_triples{$name}).") does not match number needed (".($used_triples{$name}).")\n"
|
||||
if scalar $used_triples{$name} != scalar $specified_triples{$name};
|
||||
}
|
||||
# and then for all models used
|
||||
foreach my $name (keys %$models_used) {
|
||||
foreach (1..$models_used->{$name}) {
|
||||
die "No default weights defined for -$name"
|
||||
if !defined $default_triples->{$name};
|
||||
# XXX here was a deadly bug: we need a deep copy of the default values
|
||||
my @copy = ();
|
||||
foreach my $triple (@{$default_triples->{$name}}) {
|
||||
my @copy_triple = @$triple;
|
||||
push @copy, [ @copy_triple ];
|
||||
}
|
||||
push @{$use_triples->{$name}}, @copy;
|
||||
}
|
||||
foreach my $name (keys %specified_triples) {
|
||||
die "Lambdas specified for '$name' ".($specified_triples{$name}).", but none needed.\n"
|
||||
unless defined($used_triples{$name});
|
||||
}
|
||||
%used_triples = %specified_triples;
|
||||
}
|
||||
|
||||
# moses should use our config
|
||||
@ -363,24 +354,6 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
|
||||
die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files.";
|
||||
}
|
||||
|
||||
# walk through all lambdas the user wishes to optimize and check
|
||||
# if the number of lambdas matches
|
||||
foreach my $name (keys %$use_triples) {
|
||||
my $expected_lambdas = $lambdas_per_model->{$name};
|
||||
$expected_lambdas = 0 if !defined $expected_lambdas;
|
||||
my $got_lambdas = defined $use_triples->{$name} ? scalar @{$use_triples->{$name}} : 0;
|
||||
if ($got_lambdas != $expected_lambdas) {
|
||||
if ($allow_unknown_lambdas && $expected_lambdas == 0) {
|
||||
print STDERR "Allowing to optimize $name, although I have no idea what it is.\n";
|
||||
} else {
|
||||
print STDERR "Wrong number of lambdas for $name. Expected (given the config file): $expected_lambdas, got: $got_lambdas.
|
||||
Use --allow-unknown-lambdas to optimize lambdas that you are just introducing
|
||||
and I cannot validate against the models mentioned in moses.ini.\n";
|
||||
exit 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# as weights are normalized in the next steps (by cmert)
|
||||
# normalize initial LAMBDAs, too
|
||||
my $need_to_normalize = 1;
|
||||
@ -399,6 +372,7 @@ my @order_of_lambdas_from_decoder = ();
|
||||
my $cwd = `pawd 2>/dev/null`;
|
||||
if(!$cwd){$cwd = `pwd`;}
|
||||
chomp($cwd);
|
||||
|
||||
safesystem("mkdir -p $___WORKING_DIR") or die "Can't mkdir $___WORKING_DIR";
|
||||
|
||||
{
|
||||
@ -440,12 +414,11 @@ if ($continue) {
|
||||
close IN;
|
||||
my @newweights = split /\s+/, $newweights;
|
||||
|
||||
# dump_triples($use_triples);
|
||||
$use_triples = store_new_lambda_values($use_triples, \@order_of_lambdas_from_decoder, \@newweights);
|
||||
# dump_triples($use_triples);
|
||||
#dump_triples(\%used_triples);
|
||||
store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);
|
||||
#dump_triples(\%used_triples);
|
||||
}
|
||||
|
||||
|
||||
if ($___FILTER_PHRASE_TABLE){
|
||||
# filter the phrase tables with respect to input, use --decoder-flags
|
||||
print "filtering the phrase tables... ".`date`;
|
||||
@ -480,13 +453,13 @@ while(1) {
|
||||
print "run $run start at ".`date`;
|
||||
|
||||
# In case something dies later, we might wish to have a copy
|
||||
create_config($___CONFIG, "./run$run.moses.ini", $use_triples, $run, (defined$devbleu?$devbleu:"--not-estimated--"));
|
||||
create_config($___CONFIG, "./run$run.moses.ini", \%used_triples, $run, (defined$devbleu?$devbleu:"--not-estimated--"));
|
||||
|
||||
|
||||
# skip if the user wanted
|
||||
if (!$skip_decoder) {
|
||||
print "($run) run decoder to produce n-best lists\n";
|
||||
@order_of_lambdas_from_decoder = run_decoder($use_triples, $PARAMETERS, $run, \@order_of_lambdas_from_decoder, $need_to_normalize);
|
||||
@order_of_lambdas_from_decoder = run_decoder(\%used_triples, $PARAMETERS, $run, \@order_of_lambdas_from_decoder, $need_to_normalize);
|
||||
$need_to_normalize = 0;
|
||||
safesystem("gzip -f run*out") or die "Failed to gzip run*out";
|
||||
}
|
||||
@ -566,8 +539,8 @@ while(1) {
|
||||
next if $visited{$name};
|
||||
$visited{$name} = 1;
|
||||
die "The decoder produced also some '$name' scores, but we do not know the ranges for them, no way to optimize them\n"
|
||||
if !defined $use_triples->{$name};
|
||||
foreach my $feature (@{$use_triples->{$name}}) {
|
||||
if !defined $used_triples{$name};
|
||||
foreach my $feature (@{$used_triples{$name}}) {
|
||||
my ($val, $min, $max) = @$feature;
|
||||
push @CURR, $val;
|
||||
push @MIN, $min;
|
||||
@ -624,7 +597,7 @@ while(1) {
|
||||
my @newweights = split /\s+/, $bestpoint;
|
||||
|
||||
# update my cache of lambda values
|
||||
$use_triples = store_new_lambda_values($use_triples, \@order_of_lambdas_from_decoder, \@newweights);
|
||||
store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);
|
||||
|
||||
## additional stopping criterion: weights have not changed
|
||||
my $shouldstop = 1;
|
||||
@ -653,7 +626,7 @@ print "Training finished at ".`date`;
|
||||
safesystem("cp init.opt run$run.init.opt") or die;
|
||||
safesystem ("cp cmert.log run$run.cmert.log") or die;
|
||||
|
||||
create_config($___CONFIG, "./moses.ini", $use_triples, $run, $devbleu);
|
||||
create_config($___CONFIG, "./moses.ini", \%used_triples, $run, $devbleu);
|
||||
|
||||
# just to be sure that we have the really last finished step marked
|
||||
open F, "> finished_step.txt" or die "Can't mark finished step";
|
||||
@ -693,7 +666,6 @@ sub store_new_lambda_values {
|
||||
# print STDERR "Storing $i-th score as $name: $idx{$name}: $values->[$i]\n";
|
||||
$triples->{$name}->[$idx{$name}]->[0] = $values->[$i];
|
||||
}
|
||||
return $triples;
|
||||
}
|
||||
|
||||
sub dump_triples {
|
||||
@ -820,7 +792,7 @@ sub create_config {
|
||||
delete($P{$abbr});
|
||||
delete($P{$ABBR2FULL{$abbr}});
|
||||
# Then feed P with the current values
|
||||
foreach my $feature (@{$use_triples->{$abbr}}) {
|
||||
foreach my $feature (@{$used_triples{$abbr}}) {
|
||||
my ($val, $min, $max) = @$feature;
|
||||
my $name = defined $ABBR2FULL{$abbr} ? $ABBR2FULL{$abbr} : $abbr;
|
||||
push @{$P{$name}}, $val;
|
||||
@ -933,7 +905,6 @@ sub ensure_full_path {
|
||||
sub scan_config {
|
||||
my $ini = shift;
|
||||
my $inishortname = $ini; $inishortname =~ s/^.*\///; # for error reporting
|
||||
my $lambda_counts = shift;
|
||||
# we get a pre-filled counts, because some lambdas are always needed (word penalty, for instance)
|
||||
# as we walk through the ini file, we record how many extra lambdas we need
|
||||
# and finally, we report it
|
||||
@ -978,12 +949,13 @@ sub scan_config {
|
||||
my @flds = split / +/;
|
||||
my $fn = $flds[$where_is_filename{$section}];
|
||||
if (defined $fn && $fn !~ /^\s+$/) {
|
||||
print "checking weight-count for $section\n";
|
||||
# this is a filename! check it
|
||||
if ($fn !~ /^\//) {
|
||||
$error = 1;
|
||||
print STDERR "$inishortname:$nr:Filename not absolute: $fn\n";
|
||||
}
|
||||
if (! -s $fn) {
|
||||
if (! -s $fn && ! -s "$fn.gz") {
|
||||
$error = 1;
|
||||
print STDERR "$inishortname:$nr:File does not exist or empty: $fn\n";
|
||||
}
|
||||
@ -996,12 +968,20 @@ sub scan_config {
|
||||
my $needlambdas = defined $where_is_lambda_count{$section} ? $flds[$where_is_lambda_count{$section}] : 1;
|
||||
|
||||
print STDERR "Config needs $needlambdas lambdas for $section (i.e. $shortname)\n" if $verbose;
|
||||
$lambda_counts->{$shortname}+=$needlambdas;
|
||||
if (!defined $___LAMBDA && (!defined $default_triples->{$shortname} || scalar(@{$default_triples->{$shortname}}) != $needlambdas)) {
|
||||
print STDERR "$inishortname:$nr:Your model $shortname needs $needlambdas weights but we define the default ranges for "
|
||||
.scalar(@{$default_triples->{$shortname}})." weights. Cannot use the default, you must supply lambdas by hand.\n";
|
||||
if (!defined $___LAMBDA && (!defined $additional_triples->{$shortname} || scalar(@{$additional_triples->{$shortname}}) < $needlambdas)) {
|
||||
print STDERR "$inishortname:$nr:Your model $shortname needs $needlambdas weights but we define the default ranges for only "
|
||||
.scalar(@{$additional_triples->{$shortname}})." weights. Cannot use the default, you must supply lambdas by hand.\n";
|
||||
$error = 1;
|
||||
}
|
||||
else {
|
||||
# note: table may use less parameters than the maximum number
|
||||
# of triples
|
||||
for(my $lambda=0;$lambda<$needlambdas;$lambda++) {
|
||||
my ($start, $min, $max)
|
||||
= @{${$additional_triples->{$shortname}}[$lambda]};
|
||||
push @{$used_triples{$shortname}}, [$start, $min, $max];
|
||||
}
|
||||
}
|
||||
$defined_files{$shortname}++;
|
||||
}
|
||||
}
|
||||
@ -1018,6 +998,6 @@ sub scan_config {
|
||||
}
|
||||
}
|
||||
exit(1) if $error;
|
||||
return ($lambda_counts, \%defined_files);
|
||||
return (\%defined_files);
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,7 @@ use Getopt::Long "GetOptions";
|
||||
# -----------------------------------------------------
|
||||
$ENV{"LC_ALL"} = "C";
|
||||
|
||||
my($_ROOT_DIR,$_CORPUS_DIR,$_GIZA_E2F,$_GIZA_F2E,$_MODEL_DIR,$_CORPUS,$_CORPUS_COMPRESSION,$_FIRST_STEP,$_LAST_STEP,$_F,$_E,$_MAX_PHRASE_LENGTH,$_LEXICAL_DIR,$_NO_LEXICAL_WEIGHTING,$_VERBOSE,$_ALIGNMENT,@_LM,$_EXTRACT_FILE,$_GIZA_OPTION,$_HELP,$_PARTS,$_DIRECTION,$_ONLY_PRINT_GIZA,$_REORDERING,$_REORDERING_SMOOTH,$_ALIGNMENT_FACTORS,$_TRANSLATION_FACTORS,$_REORDERING_FACTORS,$_GENERATION_FACTORS,$_DECODING_STEPS,$_PARALLEL, $SCRIPTS_ROOTDIR, $_FACTOR_DELIMITER);
|
||||
my($_ROOT_DIR,$_CORPUS_DIR,$_GIZA_E2F,$_GIZA_F2E,$_MODEL_DIR,$_CORPUS,$_CORPUS_COMPRESSION,$_FIRST_STEP,$_LAST_STEP,$_F,$_E,$_MAX_PHRASE_LENGTH,$_LEXICAL_DIR,$_NO_LEXICAL_WEIGHTING,$_VERBOSE,$_ALIGNMENT,@_LM,$_EXTRACT_FILE,$_GIZA_OPTION,$_HELP,$_PARTS,$_DIRECTION,$_ONLY_PRINT_GIZA,$_REORDERING,$_REORDERING_SMOOTH,$_ALIGNMENT_FACTORS,$_TRANSLATION_FACTORS,$_REORDERING_FACTORS,$_GENERATION_FACTORS,$_DECODING_STEPS,$_PARALLEL, $SCRIPTS_ROOTDIR, $_FACTOR_DELIMITER,@_PHRASE_TABLE,@_REORDERING_TABLE,$_CONFIG);
|
||||
|
||||
my $debug = 0; # debug this script, do not delete any files in debug mode
|
||||
|
||||
@ -56,6 +56,9 @@ $_HELP = 1
|
||||
'decoding-steps=s' => \$_DECODING_STEPS,
|
||||
'scripts-root-dir=s' => \$SCRIPTS_ROOTDIR,
|
||||
'factor-delimiter=s' => \$_FACTOR_DELIMITER,
|
||||
'phrase-table=s' => \@_PHRASE_TABLE,
|
||||
'config=s' => \$_CONFIG,
|
||||
'reordering-table=s' => \@_REORDERING_TABLE,
|
||||
);
|
||||
|
||||
if ($_HELP) {
|
||||
@ -147,6 +150,8 @@ $___MODEL_DIR = $_MODEL_DIR if $_MODEL_DIR;
|
||||
my $___EXTRACT_FILE = $___MODEL_DIR."/extract";
|
||||
$___EXTRACT_FILE = $_EXTRACT_FILE if $_EXTRACT_FILE;
|
||||
|
||||
my $___CONFIG = $___ROOT_DIR."/model/moses.ini";
|
||||
$___CONFIG = $_CONFIG if $_CONFIG;
|
||||
|
||||
my $___MAX_PHRASE_LENGTH = 7;
|
||||
my $___LEXICAL_WEIGHTING = 1;
|
||||
@ -167,12 +172,14 @@ if ($___LAST_STEP == 9) {
|
||||
die "use --lm factor:order:filename to specify at least one language model"
|
||||
if scalar @_LM == 0;
|
||||
foreach my $lm (@_LM) {
|
||||
my ($f, $order, $filename) = split /:/, $lm, 3;
|
||||
my $type = 0; # default to srilm
|
||||
my ($f, $order, $filename);
|
||||
($f, $order, $filename, $type) = split /:/, $lm, 4;
|
||||
die "Wrong format of --lm. Expected: --lm factor:order:filename"
|
||||
if $f !~ /^[0-9]+$/ || $order !~ /^[0-9]+$/ || !defined $filename;
|
||||
die "Language model file not found or empty: $filename"
|
||||
if ! -s $filename;
|
||||
push @___LM, [ $f, $order, $filename ];
|
||||
push @___LM, [ $f, $order, $filename, $type ];
|
||||
}
|
||||
}
|
||||
|
||||
@ -196,16 +203,17 @@ $___REORDERING_SMOOTH = $_REORDERING_SMOOTH if $_REORDERING_SMOOTH;
|
||||
my %REORDERING_MODEL;
|
||||
my $REORDERING_LEXICAL = 0; # flag for building lexicalized reordering models
|
||||
foreach my $r (split(/,/,$___REORDERING)) {
|
||||
if (!( $r eq "orientation-f" ||
|
||||
$r eq "orientation-fe" ||
|
||||
$r eq "orientation-bidirectional-f" ||
|
||||
$r eq "orientation-bidirectional-fe" ||
|
||||
$r =~ s/orientation/msd/;
|
||||
if (!( $r eq "msd-f" ||
|
||||
$r eq "msd-fe" ||
|
||||
$r eq "msd-bidirectional-f" ||
|
||||
$r eq "msd-bidirectional-fe" ||
|
||||
$r eq "monotonicity-f" ||
|
||||
$r eq "monotonicity-fe" ||
|
||||
$r eq "monotonicity-bidirectional-f" ||
|
||||
$r eq "monotonicity-bidirectional-fe" ||
|
||||
$r eq "distance")) {
|
||||
print STDERR "unknwown reordering type: $r";
|
||||
print STDERR "unknown reordering type: $r";
|
||||
exit(1);
|
||||
}
|
||||
if ($r ne "distance") { $REORDERING_LEXICAL = 1; }
|
||||
@ -225,11 +233,13 @@ $___ALIGNMENT_FACTORS = $_ALIGNMENT_FACTORS if defined($_ALIGNMENT_FACTORS);
|
||||
die("format for alignment factors is \"0-0\" or \"0,1,2-0,1\", you provided $___ALIGNMENT_FACTORS\n") if $___ALIGNMENT_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*$/;
|
||||
|
||||
my $___TRANSLATION_FACTORS = undef;
|
||||
$___TRANSLATION_FACTORS = "0-0" unless defined($_DECODING_STEPS); # single factor default
|
||||
$___TRANSLATION_FACTORS = $_TRANSLATION_FACTORS if defined($_TRANSLATION_FACTORS);
|
||||
die("format for translation factors is \"0-0\" or \"0-0+1-1\" or \"0-0+0,1-0,1\", you provided $___TRANSLATION_FACTORS\n")
|
||||
if defined $___TRANSLATION_FACTORS && $___TRANSLATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;
|
||||
|
||||
my $___REORDERING_FACTORS = undef;
|
||||
$___REORDERING_FACTORS = "0-0" if defined($_REORDERING) && ! defined($_DECODING_STEPS); # single factor default
|
||||
$___REORDERING_FACTORS = $_REORDERING_FACTORS if defined($_REORDERING_FACTORS);
|
||||
die("format for reordering factors is \"0-0\" or \"0-0+1-1\" or \"0-0+0,1-0,1\", you provided $___REORDERING_FACTORS\n")
|
||||
if defined $___REORDERING_FACTORS && $___REORDERING_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;
|
||||
@ -239,10 +249,10 @@ $___GENERATION_FACTORS = $_GENERATION_FACTORS if defined($_GENERATION_FACTORS);
|
||||
die("format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n")
|
||||
if defined $___GENERATION_FACTORS && $___GENERATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;
|
||||
|
||||
my $___DECODING_STEPS = $_DECODING_STEPS;
|
||||
die("use --decoding-steps to specify decoding steps") if ( !defined $_DECODING_STEPS && $___LAST_STEP>=9 && $___FIRST_STEP<=9);
|
||||
my $___DECODING_STEPS = "t0";
|
||||
$___DECODING_STEPS = $_DECODING_STEPS if defined($_DECODING_STEPS);
|
||||
die("format for decoding steps is \"t0,g0,t1,g1\", you provided $___DECODING_STEPS\n")
|
||||
if defined $___DECODING_STEPS && $___DECODING_STEPS !~ /^[tg]\d+(,[tg]\d+)*$/;
|
||||
if defined $_DECODING_STEPS && $_DECODING_STEPS !~ /^[tg]\d+(,[tg]\d+)*$/;
|
||||
|
||||
my ($factor,$factor_e,$factor_f);
|
||||
|
||||
@ -1029,14 +1039,14 @@ sub get_reordering {
|
||||
print STDERR "(7.2) building tables @ ".`date`;
|
||||
open(O,"$___EXTRACT_FILE.$factor.o.sorted")
|
||||
or die "Can't read $___EXTRACT_FILE.$factor.o.sorted";
|
||||
open(OF, "|gzip >$___MODEL_DIR/orientation-table.$factor.f.$___REORDERING_SMOOTH.gz")
|
||||
if defined($REORDERING_MODEL{"orientation-f"});
|
||||
open(OFE, "|gzip >$___MODEL_DIR/orientation-table.$factor.fe.$___REORDERING_SMOOTH.gz")
|
||||
if defined($REORDERING_MODEL{"orientation-fe"});
|
||||
open(OBF, "|gzip >$___MODEL_DIR/orientation-table.$factor.bi.f.$___REORDERING_SMOOTH.gz")
|
||||
if defined($REORDERING_MODEL{"orientation-bidirectional-f"});
|
||||
open(OBFE,"|gzip >$___MODEL_DIR/orientation-table.$factor.bi.fe.$___REORDERING_SMOOTH.gz")
|
||||
if defined($REORDERING_MODEL{"orientation-bidirectional-fe"});
|
||||
open(OF, "|gzip >$___MODEL_DIR/msd-table.$factor.f.$___REORDERING_SMOOTH.gz")
|
||||
if defined($REORDERING_MODEL{"msd-f"});
|
||||
open(OFE, "|gzip >$___MODEL_DIR/msd-table.$factor.fe.$___REORDERING_SMOOTH.gz")
|
||||
if defined($REORDERING_MODEL{"msd-fe"});
|
||||
open(OBF, "|gzip >$___MODEL_DIR/msd-table.$factor.bi.f.$___REORDERING_SMOOTH.gz")
|
||||
if defined($REORDERING_MODEL{"msd-bidirectional-f"});
|
||||
open(OBFE,"|gzip >$___MODEL_DIR/msd-table.$factor.bi.fe.$___REORDERING_SMOOTH.gz")
|
||||
if defined($REORDERING_MODEL{"msd-bidirectional-fe"});
|
||||
open(MF, "|gzip >$___MODEL_DIR/monotonicity-table.$factor.f.$___REORDERING_SMOOTH.gz")
|
||||
if defined($REORDERING_MODEL{"monotonicity-f"});
|
||||
open(MFE, "|gzip >$___MODEL_DIR/monotonicity-table.$factor.fe.$___REORDERING_SMOOTH.gz")
|
||||
@ -1107,14 +1117,14 @@ sub get_reordering {
|
||||
sub store_reordering_f {
|
||||
my $total_previous_f = $mono_previous_f+$swap_previous_f+$other_previous_f;
|
||||
my $total_following_f = $mono_following_f+$swap_following_f+$other_following_f;
|
||||
if(defined($REORDERING_MODEL{"orientation-f"})) {
|
||||
if(defined($REORDERING_MODEL{"msd-f"})) {
|
||||
printf OF ("%s ||| %.5f %.5f %.5f\n",
|
||||
$f_current,
|
||||
$mono_previous_f/$total_previous_f,
|
||||
$swap_previous_f/$total_previous_f,
|
||||
$other_previous_f/$total_previous_f);
|
||||
}
|
||||
if(defined($REORDERING_MODEL{"orientation-bidirectional-f"})) {
|
||||
if(defined($REORDERING_MODEL{"msd-bidirectional-f"})) {
|
||||
printf OBF ("%s ||| %.5f %.5f %.5f %.5f %.5f %.5f\n",
|
||||
$f_current,
|
||||
$mono_previous_f/$total_previous_f,
|
||||
@ -1144,14 +1154,14 @@ sub store_reordering_fe {
|
||||
my $total_previous_fe = $mono_previous_fe+$swap_previous_fe+$other_previous_fe;
|
||||
my $total_following_fe = $mono_following_fe+$swap_following_fe+$other_following_fe;
|
||||
|
||||
if(defined($REORDERING_MODEL{"orientation-fe"})) {
|
||||
if(defined($REORDERING_MODEL{"msd-fe"})) {
|
||||
printf OFE ("%s ||| %s ||| %.5f %.5f %.5f\n",
|
||||
$f_current, $e_current,
|
||||
$mono_previous_fe/$total_previous_fe,
|
||||
$swap_previous_fe/$total_previous_fe,
|
||||
$other_previous_fe/$total_previous_fe);
|
||||
}
|
||||
if(defined($REORDERING_MODEL{"orientation-bidirectional-fe"})) {
|
||||
if(defined($REORDERING_MODEL{"msd-bidirectional-fe"})) {
|
||||
printf OBFE ("%s ||| %s ||| %.5f %.5f %.5f %.5f %.5f %.5f\n",
|
||||
$f_current, $e_current,
|
||||
$mono_previous_fe/$total_previous_fe,
|
||||
@ -1257,12 +1267,13 @@ sub create_ini {
|
||||
&full_path(\$___MODEL_DIR);
|
||||
&full_path(\$___VCB_E);
|
||||
&full_path(\$___VCB_F);
|
||||
open(INI,">$___MODEL_DIR/moses.ini") or die "Can't write $___MODEL_DIR/moses.ini";
|
||||
`mkdir -p $___MODEL_DIR`;
|
||||
open(INI,">$___CONFIG") or die("Can't write $___CONFIG");
|
||||
print INI "#########################
|
||||
### MOSES CONFIG FILE ###
|
||||
#########################
|
||||
\n";
|
||||
|
||||
|
||||
if (defined $___TRANSLATION_FACTORS) {
|
||||
print INI "# input factors\n";
|
||||
print INI "[input-factors]\n";
|
||||
@ -1278,7 +1289,6 @@ sub create_ini {
|
||||
die "No translation steps defined, cannot prepare [input-factors] section\n";
|
||||
}
|
||||
|
||||
|
||||
my %stepsused;
|
||||
print INI "\n# mapping steps
|
||||
[mapping]\n";
|
||||
@ -1292,11 +1302,14 @@ sub create_ini {
|
||||
print INI "\n# translation tables: source-factors, target-factors, number of scores, file
|
||||
[ttable-file]\n";
|
||||
my $num_of_ttables = 0;
|
||||
my @SPECIFIED_TABLE = @_PHRASE_TABLE;
|
||||
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
|
||||
$num_of_ttables++;
|
||||
my $ff = $f;
|
||||
$ff =~ s/\-/ /;
|
||||
print INI "$ff 5 $___MODEL_DIR/phrase-table.$f.gz\n";
|
||||
my $file = "$___MODEL_DIR/phrase-table.$f.gz";
|
||||
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
|
||||
print INI "$ff 5 $file\n";
|
||||
}
|
||||
if ($num_of_ttables != $stepsused{"T"}) {
|
||||
print STDERR "WARNING: Your [mapping-steps] require translation steps up to id $stepsused{T} but you defined translation steps 0..$num_of_ttables\n";
|
||||
@ -1326,12 +1339,11 @@ sub create_ini {
|
||||
print INI "\n# language models: type(srilm/irstlm), factors, order, file
|
||||
[lmodel-file]\n";
|
||||
foreach my $lm (@___LM) {
|
||||
my ($f, $o, $fn) = @$lm;
|
||||
my $type = 0; # default to srilm
|
||||
my ($f, $o, $fn, $type) = @{$lm};
|
||||
print INI "$type $f $o $fn\n";
|
||||
}
|
||||
|
||||
print INI "\n\n# limit on how many phrase translations e for each phrase f are loaded
|
||||
print INI "\n\n\# limit on how many phrase translations e for each phrase f are loaded
|
||||
# 0 = all elements loaded
|
||||
[ttable-limit]
|
||||
20\n";
|
||||
@ -1341,8 +1353,10 @@ print INI "\n\n# limit on how many phrase translations e for each phrase f are l
|
||||
|
||||
my $weight_d_count = 0;
|
||||
if ($___REORDERING ne "distance") {
|
||||
my $file = "# distortion (reordering) files\n[distortion-file]\n";
|
||||
my $file = "# distortion (reordering) files\n\[distortion-file]\n";
|
||||
my $factor_i = 0;
|
||||
|
||||
my @SPECIFIED_TABLE = @_REORDERING_TABLE;
|
||||
foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
|
||||
foreach my $r (keys %REORDERING_MODEL) {
|
||||
next if $r eq "fe" || $r eq "f";
|
||||
@ -1350,23 +1364,24 @@ print INI "\n\n# limit on how many phrase translations e for each phrase f are l
|
||||
if ($r eq "distance") { $weight_d_count++; }
|
||||
else {
|
||||
my $type = $r;
|
||||
$type =~ s/orientation/msd/;
|
||||
|
||||
$r =~ s/-bidirectional/.bi/;
|
||||
$r =~ s/-f/.f/;
|
||||
$r =~ s/orientation/orientation-table.$factor/;
|
||||
$r =~ s/msd/msd-table.$factor/;
|
||||
$r =~ s/monotonicity/monotonicity-table.$factor/;
|
||||
|
||||
my $w;
|
||||
if ($r =~ /orient/) { $w = 3; } else { $w = 1; }
|
||||
if ($r =~ /msd/) { $w = 3; } else { $w = 1; }
|
||||
if ($r =~ /bi/) { $w *= 2; }
|
||||
$weight_d_count += $w;
|
||||
$file .= "$factor $type $w $___MODEL_DIR/$r.$___REORDERING_SMOOTH.gz\n";
|
||||
|
||||
my $table_file = "$___MODEL_DIR/$r.$___REORDERING_SMOOTH.gz";
|
||||
$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
|
||||
$file .= "$factor $type $w $table_file\n";
|
||||
}
|
||||
}
|
||||
$factor_i++;
|
||||
}
|
||||
print INI $file."\n";
|
||||
}
|
||||
print INI $file."\n";
|
||||
}
|
||||
else {
|
||||
$weight_d_count = 1;
|
||||
|
Loading…
Reference in New Issue
Block a user