diff --git a/contrib/mert-sge-nosync/README b/contrib/mert-sge-nosync/README new file mode 100644 index 000000000..85b95ec31 --- /dev/null +++ b/contrib/mert-sge-nosync/README @@ -0,0 +1,29 @@ +MERT-sge-nosync +Raymond Ng, University of Sheffield. +Apr, 2014. + +The parallel MERT tuning scripts in moses cannot run in the SGE "no-sync" mode (where job submission is done in one go, after which the user can go offline, leaving SGE to manage the whole process). The scripts provided here are for parallel MERT in SGE no-sync mode. You will need to have SSH support in perl (Step 2), and set up public ssh-keys between the running hosts and the submission hosts (Step 3). + +1. Untar the scripts in ${MOSES}/scripts/ + +2. Download and install OpenSSH packages for perl: +Net-OpenSSH-Compat (http://search.cpan.org/CPAN/authors/id/S/SA/SALVA/Net-OpenSSH-Compat-0.06.tar.gz) +$ cd Net-OpenSSH-Compat-0.06 +$ perl Makefile.PL +(You may have dependency issues and need to install IO-Tty and Net-OpenSSH in advance) +IO-Tty-1.10 (http://search.cpan.org/CPAN/authors/id/T/TO/TODDR/IO-Tty-1.10.tar.gz) +Net-OpenSSH-0.60 (http://search.cpan.org/CPAN/authors/id/S/SA/SALVA/Net-OpenSSH-0.60.tar.gz) + +3. Set up public ssh-keys for accessing the submithost (machine from which qsub is executed) from the running hosts (machines which actually run the scripts) +http://www.linuxproblem.org/art_9.html + +4. 
Run parallel MERT by +nohup nice ${MOSES}/scripts/training/mert-moses-sge-nosync.pl ${TM_DEVTEXT_SOURCE} \ +--threads 20 --jobs 20 \ +--queue-flags='-q normal.q -P project' \ +--submithost='squeal' \ +${TM_DEVTEXT_TARGET} ${MOSES}/dist/bin/moses ${WORKINGDIR}/train/model/moses.ini \ +--mertdir ${MOSES}/dist/bin/ >& ${WORKINGDIR}/mert.out + + +(tested on the moses version built in Apr 2012) diff --git a/contrib/mert-sge-nosync/generic/moses-parallel-sge-nosync.pl b/contrib/mert-sge-nosync/generic/moses-parallel-sge-nosync.pl new file mode 100755 index 000000000..dc9d69571 --- /dev/null +++ b/contrib/mert-sge-nosync/generic/moses-parallel-sge-nosync.pl @@ -0,0 +1,1388 @@ +#! /usr/bin/perl + +# $Id$ +####################### +# Revision history +# +# 02 Aug 2006 added strict requirement +# 01 Aug 2006 fix bug about inputfile parameter +# fix bug about suffix index generation +# 31 Jul 2006 added parameter for reading queue parameters +# 29 Jul 2006 added code to handle confusion networks +# 28 Jul 2006 added a better policy for removing jobs from the queue in case of killing signal (CTRL-C) +# added the parameter -qsub-prefix which sets the prefix for the name of submitted jobs +# 27 Jul 2006 added safesystem() function and other checks to handle process failures +# added checks for existence of decoder and configuration file +# 26 Jul 2006 fix a bug related to the use of absolute path for srcfile and nbestfile + +use strict; +use Net::OpenSSH::Compat::Perl; + + +####################### +#Customizable parameters + +#parameters for submitting processes through Sun GridEngine +my $submithost = undef; +my $queueparameters=""; +my $batch_and_join = undef; +my $processid="$$"; + +# look for the correct pwdcmd +my $pwdcmd = getPwdCmd(); + +my $workingdir = `$pwdcmd`; chomp $workingdir; +my $tmpdir="$workingdir/tmp$$"; +my $splitpfx="split$$"; + + +$SIG{'INT'} = \&kill_all_and_quit; # catch exception for CTRL-C + +####################### +#Default parameters +my 
$jobscript="$workingdir/job$$";
my $qsubout="$workingdir/out.job$$";
my $qsuberr="$workingdir/err.job$$";

my $mosesparameters="";
my $feed_moses_via_stdin = 0;
      # a workaround, for a reason, the default "-input-file X" blocks
      # my moses, while "< X" works fine.
my $cfgfile=""; #configuration file

my $version=undef;
my $help=0;
my $dbg=0;
my $jobs=4;
my $mosescmd="$ENV{MOSESBIN}/moses"; #decoder in use
my $inputlist=undef;
my $inputfile=undef;
my $inputtype=0;
my @nbestlist=();
my $nbestlist=undef;
my $nbestfile=undef;
my $oldnbestfile=undef;
my $oldnbest=undef;
my $nbestflag=0;
my @wordgraphlist=();
my $wordgraphlist=undef;
my $wordgraphfile=undef;
my $wordgraphflag=0;
my $robust=5; # resubmit crashed jobs robust-time
my $alifile=undef;
my $outnbest=undef;
my $logfile="";
my $logflag="";
my $searchgraphlist="";
my $searchgraphfile="";
my $searchgraphflag=0;
my $qsubname="MOSES";
my $qsubwrapper=undef;
my $qsubwrapper_exit=undef;
my $old_sge = 0; # assume old Sun Grid Engine (<6.0) where qsub does not
                 # implement -sync and -b
my $___LATTICE_SAMPLES = 0;
my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder
my $___N_BEST_LIST_SIZE = 100;
my $___RANGES = undef;
my $___WORKING_DIR = undef;
my $SCRIPTS_ROOTDIR = undef;
my $postdecodecmd = undef;
my $postdecodeargs = undef;


my $run = 0;
my $jobid = -1;
my $prevjid = undef;
my $need_to_normalize = 1;

#######################
# Command line options processing
#
# Fills the file-level configuration variables above from @ARGV.
# Because of the 'pass_through' configuration, options that are not listed
# here are left in @ARGV instead of being rejected.
sub init(){
  use Getopt::Long qw(:config pass_through no_ignore_case permute);
  GetOptions('version'=>\$version,
             'help'=>\$help,
             'debug'=>\$dbg,
             'jobs=i'=>\$jobs,
             'decoder=s'=> \$mosescmd,
             'robust=i' => \$robust,
             'script-rootdir=s' => \$SCRIPTS_ROOTDIR,
             'lattice-samples=i' => \$___LATTICE_SAMPLES,
             'decoder-flags=s' => \$___DECODER_FLAGS,
             'feed-decoder-via-stdin'=> \$feed_moses_via_stdin,
             'logfile=s'=> \$logfile,
             'i|inputfile|input-file=s'=> \$inputlist,
             'n-best-list-size=s'=> \$___N_BEST_LIST_SIZE,
             'n-best-file=s'=> \$oldnbestfile,
             'n-best-size=i'=> \$oldnbest,
             'output-search-graph|osg=s'=> \$searchgraphlist,
             'output-word-graph|owg=s'=> \$wordgraphlist,
             'alignment-output-file=s'=> \$alifile,
             'submithost=s'=> \$submithost,
             'queue-parameters=s'=> \$queueparameters,
             'inputtype=i'=> \$inputtype,
             'config|f=s'=>\$cfgfile,
             'ranges=s@'=> \$___RANGES,
             'old-sge' => \$old_sge,
             'run=i' => \$run,
             # negatable: the default is 1, so a plain flag could never
             # switch normalization off; --no-need-to-normalize now can.
             'need-to-normalize!' => \$need_to_normalize,
             'working-dir=s' => \$___WORKING_DIR,
             'qsubwrapper=s' => \$qsubwrapper,
             'qsubwrapper-exit=s' => \$qsubwrapper_exit
            ) or exit(1);

#            'decoder-parameters=s'=> \$mosesparameters,
#            'n-best-list=s'=> \$nbestlist,
#            'qsub-prefix=s'=> \$qsubname,
}

# Second initialisation step, run by the main script after it has set
# $nbestlist: derives the n-best / search-graph / word-graph / log settings,
# the base name of the input file, and appends the -config / -inputtype
# arguments to $mosesparameters.
sub init_secondpart() {
  require File::Basename;

  getNbestParameters();

  getSearchGraphParameters();

  getWordGraphParameters();

  getLogParameters();

  # $submithost = "squeal";

  print STDERR "submithost is $submithost\n";

#print_parameters();
#print STDERR "nbestflag:$nbestflag\n";
#print STDERR "searchgraphflag:$searchgraphflag\n";
print STDERR "wordgraphflag:$wordgraphflag\n";
#print STDERR "inputlist:$inputlist\n";

  # computed in-process rather than through `basename $inputlist`, so paths
  # containing spaces or shell metacharacters are handled correctly
  $inputfile = File::Basename::basename($inputlist) if defined($inputlist);

  # $mosesparameters.="@ARGV -config $cfgfile -inputtype $inputtype";
  # $mosesparameters = "@ARGV -config $cfgfile -inputtype $inputtype ";
  # $mosesparameters .= "@ARGV -config $cfgfile -inputtype $inputtype ";
  $mosesparameters .= " -config $cfgfile -inputtype $inputtype ";

}


#######################
##print version
# Prints the script version to STDERR and exits with status 1.
sub version(){
#    print STDERR "version 1.0 (15-07-2006)\n";
#    print STDERR "version 1.1 (17-07-2006)\n";
#    print STDERR "version 1.2 (18-07-2006)\n";
#    print STDERR "version 1.3 (21-07-2006)\n";
#    print STDERR "version 1.4 (26-07-2006)\n";
#   print STDERR "version 1.5 (27-07-2006)\n";
#    print STDERR "version 1.6 (28-07-2006)\n";
#    print STDERR "version 1.7 (29-07-2006)\n";
#    print STDERR "version 1.8 (31-07-2006)\n";
#    print STDERR "version 1.9 (01-08-2006)\n";
#    print STDERR "version 1.10 (02-08-2006)\n";
#	print STDERR "version 1.11 (10-10-2006)\n";
#	print STDERR "version 1.12 (27-12-2006)\n";
#	print STDERR "version 1.13 (29-12-2006)\n";
  print STDERR "version 1.13b (01-04-2014)\n";
  exit(1);
}

#usage
# Prints the option summary to STDERR and exits with status 1.
sub usage(){
  print STDERR "moses-parallel.pl [parallel-options] [moses-options]\n";
  print STDERR "Options marked (*) are required.\n";
  print STDERR "Parallel options:\n";
  print STDERR "* -decoder Moses decoder to use\n";
  print STDERR "* -i|inputfile|input-file the input text to translate\n";
  print STDERR "* -jobs number of required jobs\n";
  print STDERR " -logfile file where storing log files of all jobs\n";
  print STDERR " -queue-parameters specific requirements for queue\n";
  print STDERR " -old-sge Assume Sun Grid Engine < 6.0\n";
  print STDERR " -debug debug\n";
  print STDERR " -version print version of the script\n";
  print STDERR " -help this help\n";
  print STDERR "Moses options:\n";
  print STDERR " -inputtype <0|1|2> 0 for text, 1 for confusion networks, 2 for lattices\n";
  print STDERR " -output-search-graph (osg) : Output connected hypotheses of search into specified filename\n";
  # the short alias accepted by init() for this option is 'owg', not 'osg'
  print STDERR " -output-word-graph (owg) ' <0|1>': Output stack info as word graph. Takes filename, 0=only hypos in stack, 1=stack + nbest hypos\n";
  print STDERR " IMPORTANT NOTE: use single quote to group parameters of -output-word-graph\n";
  print STDERR " This is different from standard moses\n";
  print STDERR " -lattice-samples : how many lattice samples (Chatterjee & Cancedda, emnlp 2010) (added option in moses-parallel-sge-nosync)\n";
  print STDERR " -n-best-list-size : size of nbest lists (added option in moses-parallel-sge-nosync)\n";
  print STDERR " NOTE: -n-best-file-n-best-size are passed to the decoder as \"-n-best-list \"\n";
  print STDERR " -decoder-flags : (added option is moses-parallel-sge-nosync)\n";
  print STDERR " -ranges : (added option is moses-parallel-sge-nosync)\n";
  print STDERR " -run : (add option in moses-parallel-sge-nosync)\n";
  print STDERR "* -config (f) configuration file\n";
  print STDERR "All other options are passed to Moses\n";
  print STDERR " (This way to pass parameters is maintained for back compatibility\n";
  print STDERR " but preferably use -decoder-parameters)\n";
  exit(1);
}





#printparameters
# Dumps the effective configuration to STDERR.
sub print_parameters(){
  print STDERR "Inputfile: $inputlist\n";
  print STDERR "Configuration file: $cfgfile\n";
  print STDERR "Decoder in use: $mosescmd\n";
  print STDERR "Number of jobs:$jobs\n";
  print STDERR "Nbest list: $nbestlist\n" if ($nbestflag);
  print STDERR "Output Search Graph: $searchgraphlist\n" if ($searchgraphflag);
  print STDERR "Output Word Graph: $wordgraphlist\n" if ($wordgraphflag);
  print STDERR "LogFile:$logfile\n" if ($logflag);
  print STDERR "Qsub name: $qsubname\n";
  print STDERR "Queue parameters: $queueparameters\n";
  print STDERR "Inputtype: text\n" if $inputtype == 0;
  print STDERR "Inputtype: confusion network\n" if $inputtype == 1;
  print STDERR "Inputtype: lattices\n" if $inputtype == 2;

  print STDERR "parameters directly passed to Moses: $mosesparameters\n";
}

#get parameters for log file
sub getLogParameters(){
  if ($logfile){
$logflag=1; } +} + +#get parameters for nbest computation (possibly from configuration file) +sub getNbestParameters(){ + if (!$nbestlist){ + open (CFG, "$cfgfile"); + while (chomp($_=)){ + if (/^\[n-best-list\]/){ + my $tmp; + while (chomp($tmp=)){ + last if $tmp eq "" || $tmp=~/^\[/; + $nbestlist .= "$tmp "; + } + last; + } + } + close(CFG); + } + + if ($nbestlist){ + if ($oldnbestfile){ + print STDERR "There is a conflict between NEW parameter -n-best-list and OBSOLETE parameter -n-best-file\n"; + print STDERR "Please use only -nbest-list ' [distinct]\n"; + exit; + } + } + else{ + if ($oldnbestfile){ + print STDERR "You are using the OBSOLETE parameter -n-best-file\n"; + print STDERR "Next time please use only -n-best-list ' [distinct]\n"; + $nbestlist="$oldnbestfile"; + if ($oldnbest){ $nbestlist.=" $oldnbest"; } + else { $nbestlist.=" 1"; } + } + } + + if ($nbestlist){ + my @tmp=split(/[ \t]+/,$nbestlist); + @nbestlist = @tmp; + + if ($nbestlist[0] eq '-'){ $nbestfile="nbest"; } + else{ chomp($nbestfile=`basename $nbestlist[0]`); } + $nbestflag=1; + } + print STDERR "getNbest\n"; + print STDERR "nbestflag = $nbestflag\n"; + print STDERR "nbestfile = $nbestfile\n"; + + +} + +#get parameters for search graph computation (possibly from configuration file) +sub getSearchGraphParameters(){ + if (!$searchgraphlist){ + open (CFG, "$cfgfile"); + while (chomp($_=)){ + if (/^\[output-search-graph\]/ || /^\[osg\]/){ + my $tmp; + while (chomp($tmp=)){ + last if $tmp eq "" || $tmp=~/^\[/; + $searchgraphlist = "$tmp"; + } + last; + } + } + close(CFG); + } + if ($searchgraphlist){ + if ($searchgraphlist eq '-'){ $searchgraphfile="searchgraph"; } + else{ chomp($searchgraphfile=`basename $searchgraphlist`); } + $searchgraphflag=1; + } +} + +#get parameters for word graph computation (possibly from configuration file) +sub getWordGraphParameters(){ + if (!$wordgraphlist){ + open (CFG, "$cfgfile"); + while (chomp($_=)){ + if (/^\[output-word-graph\]/ || /^\[owg\]/){ + my $tmp; + 
while (chomp($tmp=)){ + last if $tmp eq "" || $tmp=~/^\[/; + $wordgraphlist .= "$tmp "; + } + last; + } + } + close(CFG); + } + if ($wordgraphlist){ + my @tmp=split(/[ \t]+/,$wordgraphlist); + @wordgraphlist = @tmp; + + if ($wordgraphlist[0] eq '-'){ $wordgraphfile="wordgraph"; } + else{ chomp($wordgraphfile=`basename $wordgraphlist[0]`); } + $wordgraphflag=1; + } +} + +sub sanity_check_order_of_lambdas { + my $featlist = shift; + my $filename_or_stream = shift; + + my @expected_lambdas = @{$featlist->{"names"}}; + my @got = get_order_of_scores_from_nbestlist($filename_or_stream); + die "Mismatched lambdas. Decoder returned @got, we expected @expected_lambdas" + if "@got" ne "@expected_lambdas"; +} + + + + +####################### +#Script starts here + +init(); + +print "I have started parallel moses!!"; + +# moses.ini file uses FULL names for lambdas, while this training script +# internally (and on the command line) uses ABBR names. +my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w + g=weight-generation lex=weight-lex I=weight-i); +my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP; +my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP; + + + + +version() if $version; +usage() if $help; + +####################################### +# incorporate run_decoder() here + +### moved to below +# my $qsubname = mert$run; +# $mosesparameters = "$___DECODER_FLAGS $decoder_config"; +# $nbestlist = "$filename $___N_BEST_LIST_SIZE"; + + +my $featlist = get_featlist_from_moses("./run$run.moses.ini"); +$featlist = insert_ranges_to_featlist($featlist, $___RANGES); + + +## sub run_decoder { +# my ($featlist, $run, $need_to_normalize) = @_; +my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out"; +my $filename = sprintf($filename_template, $run); +my $lsamp_filename = undef; +if ($___LATTICE_SAMPLES) { + my $lsamp_filename_template = "run%d.lsamp$___LATTICE_SAMPLES.out"; + $lsamp_filename = sprintf($lsamp_filename_template, $run); +} 

# user-supplied parameters
print STDERR "params = $___DECODER_FLAGS\n";


# parameters to set all model weights (to override moses.ini)
my @vals = @{$featlist->{"values"}};
if ($need_to_normalize) {
  print STDERR "Normalizing lambdas: @vals\n";
  my $totlambda=0;
  $totlambda += abs($_) foreach @vals;
  if ($totlambda != 0) {
    $_ /= $totlambda foreach @vals;
  }
  else {
    # all weights are zero: dividing would abort the script
    print STDERR "All lambdas are zero, skipping normalization\n";
  }
}

# moses now does not seem accept "-tm X -tm Y" but needs "-tm X Y"
my %model_weights;
for my $i (0 .. $#{$featlist->{"names"}}) {
  my $name = $featlist->{"names"}->[$i];
  $model_weights{$name} = "-$name" if !defined $model_weights{$name};
  $model_weights{$name} .= sprintf " %.6f", $vals[$i];
}
my $decoder_config = join(" ", values %model_weights);
$decoder_config .= " -weight-file run$run.sparse-weights" if -e "run$run.sparse-weights";
print STDERR "DECODER_CFG = $decoder_config\n";
print STDERR "decoder_config = $decoder_config\n";

####### moved here??? ###########
$qsubname = "dec$run";
# $mosesparameters = "$___DECODER_FLAGS $decoder_config";
$mosesparameters = "$___DECODER_FLAGS $decoder_config ";

print STDERR "moses parameter with ___DECODER_FLAGS $___DECODER_FLAGS decoder_config $decoder_config and $qsubname\n";
print STDERR "$mosesparameters\n";

$nbestlist = "$filename $___N_BEST_LIST_SIZE";

print STDERR "to output -n-best-list $nbestlist\n";

init_secondpart();
#################################


# run the decoder
my $decoder_cmd;
my $lsamp_cmd = "";
if ($___LATTICE_SAMPLES) {
  $lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES ";
}


if (!defined $inputlist || !defined $mosescmd || ! defined $cfgfile) {
  print STDERR "Please specify -input-file, -decoder and -config\n";
  usage();
}

#checking if inputfile exists
if (! -e ${inputlist} ){
  print STDERR "Inputfile ($inputlist) does not exists\n";
  usage();
}

#checking if decoder exists
if (! -e $mosescmd) {
  print STDERR "Decoder ($mosescmd) does not exists\n";
  usage();
}

#checking if configfile exists
if (! -e $cfgfile) {
  print STDERR "Configuration file ($cfgfile) does not exists\n";
  usage();
}


print_parameters(); # so that people know
exit(1) if $dbg; # debug mode: just print and do not run


#splitting test file in several parts
#$decimal="-d"; #split does not accept this options (on MAC OS)
my $decimal="";

my $cmd;
my $sentenceN;
my $splitN;

my @idxlist=();
my $idxliststr="";

if ($inputtype==0){ #text input
  #getting the number of input sentences (one sentence per line)
  chomp($sentenceN=`wc -l ${inputlist} | awk '{print \$1}' `);

  #Reducing the number of jobs if less sentences to translate
  if ($jobs>$sentenceN){ $jobs=$sentenceN; }
  # an empty input (or -jobs < 1) would otherwise end in "Illegal modulus zero"
  die "Nothing to split: $inputlist has no sentences or -jobs is below 1\n" if $jobs < 1;

  #Computing the number of sentences for each files
  if ($sentenceN % $jobs == 0){ $splitN=int($sentenceN / $jobs); }
  else{ $splitN=int($sentenceN /$jobs) + 1; }

  if ($dbg){
    print STDERR "There are $sentenceN sentences to translate\n";
    print STDERR "There are at most $splitN sentences per job\n";
  }

  $cmd="split $decimal -a 2 -l $splitN $inputlist ${inputfile}.$splitpfx-";
  safesystem("$cmd") or die;
}
elsif ($inputtype==1){ #confusion network input
  my $tmpfile="/tmp/cnsplit$$";
  $cmd="cat $inputlist | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' > $tmpfile";
  safesystem("$cmd") or die;

  #getting the number of input CNs
  chomp($sentenceN=`wc -l $tmpfile | awk '{print \$1}' `);

  #Reducing the number of jobs if less CNs to translate
  if ($jobs>$sentenceN){ $jobs=$sentenceN; }
  die "Nothing to split: $inputlist has no confusion networks or -jobs is below 1\n" if $jobs < 1;

  #Computing the number of CNs for each files
  if ($sentenceN % $jobs == 0){ $splitN=int($sentenceN / $jobs); }
  else{ $splitN=int($sentenceN /$jobs) + 1; }

  if ($dbg){
    print STDERR "There are $sentenceN confusion networks to translate\n";
    print STDERR "There are at most $splitN sentences per job\n";
  }

  $cmd="split $decimal -a 2 -l $splitN $tmpfile $tmpfile-";
  safesystem("$cmd") or die;

  # suffixes ("-aa", "-ab", ...) of the temporary pieces; the file-level
  # @idxlist is filled after this if/elsif chain from the final split files
  my @cn_parts=();
  chomp(@cn_parts=`ls $tmpfile-*`);
  s/.+(\-\S+)$/$1/ foreach @cn_parts;

  foreach my $idx (@cn_parts){
    $cmd="perl -pe 's/ _CNendline_ /\\n/g;s/ _CNendline_/\\n/g;'";
    safesystem("cat $tmpfile$idx | $cmd > ${inputfile}.$splitpfx$idx ; \\rm -f $tmpfile$idx;");
  }
}
elsif ($inputtype==2){ #lattice input
  #getting the number of input lattices (one lattice per line)
  chomp($sentenceN=`wc -l ${inputlist} | awk '{print \$1}' `);

  #Reducing the number of jobs if less lattices to translate
  if ($jobs>$sentenceN){ $jobs=$sentenceN; }
  die "Nothing to split: $inputlist has no lattices or -jobs is below 1\n" if $jobs < 1;

  #Computing the number of sentences for each files
  if ($sentenceN % $jobs == 0){ $splitN=int($sentenceN / $jobs); }
  else{ $splitN=int($sentenceN /$jobs) + 1; }

  if ($dbg){
    print STDERR "There are $sentenceN lattices to translate\n";
    print STDERR "There are at most $splitN lattices per job\n";
  }

  $cmd="split $decimal -a 2 -l $splitN $inputlist ${inputfile}.$splitpfx-";
  safesystem("$cmd") or die;
}
else{ #unknown input type
  die "INPUTTYPE:$inputtype is unknown!\n";
}

# keep only the "-xx" suffix of every split file name
chomp(@idxlist=`ls ${inputfile}.$splitpfx-*`);
s/.+(\-\S+)$/$1/ foreach @idxlist;

safesystem("mkdir -p $tmpdir") or die;

preparing_script();



#launching process through the queue
my @sgepids =();
my $splitdecodejid="";

my @idx_todo = ();
foreach (@idxlist) { push @idx_todo,$_; }

# loop up to --robust times
my $max_robust = $robust;
my $robust_idx;
while ($robust && scalar @idx_todo) {
  $robust--;

  if ($old_sge) {
    # old SGE understands -b no as the default and does not understand 'yes'
    $batch_and_join = "-j y";
  } else {
    $batch_and_join = "-b yes -j yes"; # -b yes to submit bash script
  }

  foreach my $idx (@idx_todo){

    ##### Replace the direct qsub command with submit_or_exec_thu_host() #############

    my $split_decoder_cmd = "${jobscript}${idx}.bash";
    &submit_or_exec_thu_host($submithost,$run,$idx,$split_decoder_cmd,$batch_and_join,"decode$run$idx.out","decode$run$idx.err","decode$run$idx.id");
    chomp($jobid=`tail -n 1 decode$run$idx.id`);
    print STDERR "JOBID for decoding sub-task decode$run$idx is $jobid\n";

    ###################################################################################

    ##### # get jobid #################################################################
    push @sgepids, $jobid;
    $splitdecodejid .= " $jobid";
    ###################################################################################
  }

  ## clear temp file for this robust trial
  foreach my $idx (@idx_todo){
    &exit_submit_thu_host($submithost,$run,$idx,"","decode$run$idx.out","decode$run$idx.err","decode$run$idx.id","decode$run$idx.id.pid",$splitdecodejid);
  }

  ## WAIT JOB for robust iteration
  $cmd = "date";
  $robust_idx = $max_robust - $robust;
  &submit_or_exec_thu_host($submithost,$run,"R${robust_idx}.W",$cmd,"-sync y","decode${run}R${robust_idx}.W.out","decode${run}R${robust_idx}.W.err","decode${run}R${robust_idx}.W.id",$splitdecodejid);
  # no need to harvest jobid as this is in sync mode

  # clear up tmp files
  &exit_submit_thu_host($submithost,$run,"R${robust_idx}.W","","decode${run}R${robust_idx}.W.out","decode${run}R${robust_idx}.W.err","decode${run}R${robust_idx}.W.id","decode${run}R${robust_idx}.W.id.pid",$splitdecodejid);



  # check if some translations failed
  my @idx_still_todo = check_translation();
  if ($robust) {
    # if robust, redo crashed jobs
    ##RESUBMIT_ANYWAY## if ((scalar @idx_still_todo) == (scalar @idxlist)) {
    ##RESUBMIT_ANYWAY##   # ... but not if all crashed
    ##RESUBMIT_ANYWAY##   print STDERR "everything crashed, not trying to resubmit jobs\n";
    ##RESUBMIT_ANYWAY##   $robust = 0;
    ##RESUBMIT_ANYWAY##   kill_all_and_quit();
    ##RESUBMIT_ANYWAY## }
    @idx_todo = @idx_still_todo;
  }
  else {
    if (scalar (@idx_still_todo)) {
      print STDERR "some jobs crashed: ".join(" ",@idx_still_todo)."\n";
      # kill_all_and_quit();
    }

  }

}



$idxliststr = join(" ",@idxlist);


$postdecodecmd = "$SCRIPTS_ROOTDIR/training/sge-nosync/moses-parallel-postdecode-sge-nosync.pl" if !defined $postdecodecmd;

$postdecodeargs = "" if !defined $postdecodeargs;
$postdecodeargs = "$postdecodeargs --process-id $$ --idxliststr \"$idxliststr\"";
$postdecodeargs = "$postdecodeargs --nbestfile $nbestfile" if (defined $nbestfile);
$postdecodeargs = "$postdecodeargs --outnbest $outnbest" if (defined $outnbest);
$postdecodeargs = "$postdecodeargs --input-file $inputfile" if ($inputfile);

# $cmd is already declared above; reuse it instead of redeclaring it
$cmd = "$postdecodecmd $postdecodeargs";
&submit_or_exec_thu_host($submithost,$run,".CONCAT",$cmd,"","run$run.out","run$run.err","postdecode$run.id","$splitdecodejid");


chomp($jobid=`tail -n 1 postdecode$run.id`);
$prevjid = $jobid;

## clear up tmp
&exit_submit_thu_host($submithost,$run,".CONCAT","","run$run.out","run$run.err","postdecode$run.id","postdecode$run.id.pid",$prevjid);



##### OVERALL WAIT JOB ##########
  my $syncscript = "${jobscript}.waitall.sh";

  my $syncfh;
  if (!open($syncfh, '>', $syncscript)) {
    # without the wait script the final job cannot be submitted
    print STDERR "Cannot write $syncscript: $!\n";
    kill_all_and_quit();
  }
  my $scriptheader="\#\!/bin/bash\n\#\$ -S /bin/sh\n# Both lines are needed to invoke base\n#the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n";
  $scriptheader .="uname -a\n\n";
  $scriptheader .="cd $___WORKING_DIR\n\n";
  print $syncfh $scriptheader;
  print $syncfh "'date'";
  close($syncfh);

  # safesystem("echo 'date' > $syncscript") or kill_all_and_quit();
  chmod(oct(755),"$syncscript");

  # $cmd="qsub $queueparameters -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls > $qsubname.W.log";
$batch_and_join = "-j y"; + + &submit_or_exec_thu_host($submithost,$run,".W",$syncscript,$batch_and_join,"decode$run.W.out","decode$run.W.err","decode$run.W.id",$prevjid); + chomp($jobid=`tail -n 1 decode$run.W.id`); + print STDERR "JOBID for wait-job for all decoding sub-task is $jobid\n"; + $prevjid = $jobid; + + ## clear-up tmp + &exit_submit_thu_host($submithost,$run,".W",$batch_and_join,"decode$run.W.out","decode$run.W.err","decode$run.W.id","decode$run.W.id.pid",$prevjid); + +exit(); + + + + + +### ending scripts in run_decoder() ############## +sanity_check_order_of_lambdas($featlist, $filename); +return ($filename, $lsamp_filename); +################################################## + + + + +#script creation +sub preparing_script(){ + my $currStartTranslationId = 0; + + foreach my $idx (@idxlist){ + my $scriptheader=""; + $scriptheader.="\#\! /bin/bash\n\n"; + # !!! this is useless. qsub ignores the first line of the script. + # Pass '-S /bin/bash' to qsub instead. + $scriptheader.="uname -a\n\n"; + $scriptheader.="ulimit -c 0\n\n"; # avoid coredumps + $scriptheader.="cd $workingdir\n\n"; + + open (OUT, "> ${jobscript}${idx}.bash"); + print OUT $scriptheader; + my $inputmethod = $feed_moses_via_stdin ? 
"<" : "-input-file"; + + my $tmpnbestlist=""; + if ($nbestflag){ + $tmpnbestlist="$tmpdir/$nbestfile.$splitpfx$idx $nbestlist[1]"; + $tmpnbestlist = "$tmpnbestlist $nbestlist[2]" if scalar(@nbestlist)==3; + $tmpnbestlist = "-n-best-list $tmpnbestlist"; + } + + $outnbest=$nbestlist[0]; + if ($nbestlist[0] eq '-'){ $outnbest="nbest$$"; } + + print STDERR "n-best-list $tmpnbestlist\n"; + + + my $tmpalioutfile = ""; + if (defined $alifile){ + $tmpalioutfile="-alignment-output-file $tmpdir/$alifile.$splitpfx$idx"; + } + + my $tmpsearchgraphlist=""; + if ($searchgraphflag){ + $tmpsearchgraphlist="-output-search-graph $tmpdir/$searchgraphfile.$splitpfx$idx"; + } + + my $tmpwordgraphlist=""; + if ($wordgraphflag){ + $tmpwordgraphlist="-output-word-graph $tmpdir/$wordgraphfile.$splitpfx$idx $wordgraphlist[1]"; + } + + my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId"; + + print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; + print OUT "echo exit status \$\?\n\n"; + + if (defined $alifile){ + print OUT "\\mv -f $tmpdir/${alifile}.$splitpfx$idx .\n\n"; + print OUT "echo exit status \$\?\n\n"; + } + if ($nbestflag){ + print OUT "\\mv -f $tmpdir/${nbestfile}.$splitpfx$idx .\n\n"; + print OUT "echo exit status \$\?\n\n"; + } + if ($searchgraphflag){ + print OUT "\\mv -f $tmpdir/${searchgraphfile}.$splitpfx$idx .\n\n"; + print OUT "echo exit status \$\?\n\n"; + } + + if ($wordgraphflag){ + print OUT "\\mv -f $tmpdir/${wordgraphfile}.$splitpfx$idx .\n\n"; + print OUT "echo exit status \$\?\n\n"; + } + + print OUT "\\mv -f $tmpdir/${inputfile}.$splitpfx$idx.trans .\n\n"; + print OUT "echo exit status \$\?\n\n"; + close(OUT); + + #setting permissions of each script + chmod(oct(755),"${jobscript}${idx}.bash"); + + $currStartTranslationId += $splitN; + } +} + +sub concatenate_wordgraph(){ + 
my $oldcode=""; + my $newcode=-1; + my %inplength = (); + my $offset = 0; + + my $outwordgraph=$wordgraphlist[0]; + if ($wordgraphlist[0] eq '-'){ $outwordgraph="wordgraph$$"; } + + open (OUT, "> $outwordgraph"); + foreach my $idx (@idxlist){ + +#computing the length of each input file + my @in=(); + open (IN, "${inputfile}.${splitpfx}${idx}.trans"); + @in=; + close(IN); + $inplength{$idx} = scalar(@in); + + open (IN, "${wordgraphfile}.${splitpfx}${idx}"); + while (){ + + my $code=""; + if (/^UTTERANCE=/){ + ($code)=($_=~/^UTTERANCE=(\d+)/); + + print STDERR "code:$code offset:$offset\n"; + $code += $offset; + if ($code ne $oldcode){ + +# if there is a jump between two consecutive codes +# it means that an input sentence is not translated +# fill this hole with a "fictitious" list of wordgraphs +# comprising just one "_EMPTYSEARCHGRAPH_ + while ($code - $oldcode > 1){ + $oldcode++; + print OUT "UTTERANCE=$oldcode\n"; + print STDERR " to OUT -> code:$oldcode\n"; + print OUT "_EMPTYWORDGRAPH_\n"; + } + } + + $oldcode=$code; + print OUT "UTTERANCE=$oldcode\n"; + next; + } + print OUT "$_"; + } + close(IN); + $offset += $inplength{$idx}; + + while ($offset - $oldcode > 1){ + $oldcode++; + print OUT "UTTERANCE=$oldcode\n"; + print OUT "_EMPTYWORDGRAPH_\n"; + } + } + close(OUT); +} + + +sub concatenate_searchgraph(){ + my $oldcode=""; + my $newcode=-1; + my %inplength = (); + my $offset = 0; + + my $outsearchgraph=$searchgraphlist; + if ($searchgraphlist eq '-'){ $outsearchgraph="searchgraph$$"; } + + open (OUT, "> $outsearchgraph"); + foreach my $idx (@idxlist){ + +#computing the length of each input file + my @in=(); + open (IN, "${inputfile}.${splitpfx}${idx}.trans"); + @in=; + close(IN); + $inplength{$idx} = scalar(@in); + + open (IN, "${searchgraphfile}.${splitpfx}${idx}"); + while (){ + my ($code,@extra)=split(/[ \t]+/,$_); + $code += $offset; + if ($code ne $oldcode){ + +# if there is a jump between two consecutive codes +# it means that an input sentence is not 
translated +# fill this hole with a "fictitious" list of searchgraphs +# comprising just one "_EMPTYSEARCHGRAPH_ + while ($code - $oldcode > 1){ + $oldcode++; + print OUT "$oldcode _EMPTYSEARCHGRAPH_\n"; + } + } + $oldcode=$code; + print OUT join(" ",($oldcode,@extra)); + } + close(IN); + $offset += $inplength{$idx}; + + while ($offset - $oldcode > 1){ + $oldcode++; + print OUT "$oldcode _EMPTYSEARCHGRAPH_\n"; + } + } + close(OUT); +} + +sub concatenate_nbest(){ + my $oldcode=""; + my $newcode=-1; + my %inplength = (); + my $offset = 0; + +# get the list of feature and set a fictitious string with zero scores + open (IN, "${nbestfile}.${splitpfx}$idxlist[0]"); + my $str = ; + chomp($str); + close(IN); + my ($code,$trans,$featurescores,$globalscore)=split(/\|\|\|/,$str); + + my $emptytrans = " "; + my $emptyglobalscore = " 0.0"; + my $emptyfeaturescores = $featurescores; + $emptyfeaturescores =~ s/[-0-9\.]+/0/g; + + my $outnbest=$nbestlist[0]; + if ($nbestlist[0] eq '-'){ $outnbest="nbest$$"; } + + open (OUT, "> $outnbest"); + foreach my $idx (@idxlist){ + +#computing the length of each input file + my @in=(); + open (IN, "${inputfile}.${splitpfx}${idx}.trans"); + @in=; + close(IN); + $inplength{$idx} = scalar(@in); + + open (IN, "${nbestfile}.${splitpfx}${idx}"); + while (){ + my ($code,@extra)=split(/\|\|\|/,$_); + $code += $offset; + if ($code ne $oldcode){ + +# if there is a jump between two consecutive codes +# it means that an input sentence is not translated +# fill this hole with a "fictitious" list of translation +# comprising just one "emtpy translation" with zero scores + while ($code - $oldcode > 1){ + $oldcode++; + print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n"; + } + } + $oldcode=$code; + print OUT join("\|\|\|",($oldcode,@extra)); + } + close(IN); + $offset += $inplength{$idx}; + + while ($offset - $oldcode > 1){ + $oldcode++; + print OUT 
sub check_translation(){
  # Compare, for every split listed in @idx_todo, the number of input
  # sentences with the number of lines in the matching ".trans" file.
  # Returns the list of split indices whose counts differ.
  # Dies when $inputtype is not 0 (text), 1 (confusion network) or 2 (lattice).
  my @failed = ();

  foreach my $idx (@idx_todo){
    my $split = "${inputfile}.$splitpfx$idx";
    my $inputN;

    if ($inputtype==0 || $inputtype==2){
      # text and lattice input: one sentence per line
      $inputN = `wc -l $split | cut -d' ' -f1`;
    }
    elsif ($inputtype==1){
      # confusion network input: networks are separated by an empty line
      $inputN = `cat $split | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' | wc -l | cut -d' ' -f1 `;
    }
    else{
      die "INPUTTYPE:$inputtype is unknown!\n";
    }
    chomp($inputN);

    my $outputN = `wc -l $split.trans | cut -d' ' -f1`;
    chomp($outputN);

    next if $inputN == $outputN;

    print STDERR "Split ($idx) were not entirely translated\n";
    print STDERR "outputN=$outputN inputN=$inputN\n";
    print STDERR "outputfile=$split.trans inputfile=$split\n";
    push @failed, $idx;
  }
  return @failed;
}
sub remove_temporary_files(){
  # Delete the per-split working files of every index in @idxlist (input
  # split, translation, optional alignment / n-best / search graph / word
  # graph splits, job script, logs, qsub stdout/stderr), try to remove
  # $tmpdir, and finally delete the "$$"-suffixed files used when a list
  # was requested on '-'.
  foreach my $idx (@idxlist){
    my @doomed = (
      "${inputfile}.${splitpfx}${idx}.trans",
      "${inputfile}.${splitpfx}${idx}",
    );
    push @doomed, "${alifile}.${splitpfx}${idx}"         if defined $alifile;
    push @doomed, "${nbestfile}.${splitpfx}${idx}"       if $nbestflag;
    push @doomed, "${searchgraphfile}.${splitpfx}${idx}" if $searchgraphflag;
    push @doomed, "${wordgraphfile}.${splitpfx}${idx}"   if $wordgraphflag;
    push @doomed,
      "${jobscript}${idx}.bash",
      "${jobscript}${idx}.log",
      "$qsubname.W.log",
      "$qsubout$idx",
      "$qsuberr$idx";

    unlink($_) foreach @doomed;

    # rmdir only succeeds once the directory is empty
    rmdir("$tmpdir");
  }

  unlink("${nbestfile}$$")       if ($nbestflag && $nbestlist[0] eq '-');
  unlink("${searchgraphfile}$$") if ($searchgraphflag && $searchgraphlist eq '-');
  unlink("${wordgraphfile}$$")   if ($wordgraphflag && $wordgraphlist eq '-');
}
# look for the correct pwdcmd (pwd by default, pawd if it exists)
# I assume that pwd always exists
sub getPwdCmd(){
  # Returns the command used to print the working directory: the path that
  # `which pawd` reports when that path exists, otherwise "pwd".
  my $pwdcmd = "pwd";

  # \$1 is escaped on purpose: inside backticks Perl would otherwise
  # interpolate its own $1 (last regex capture) before the shell ever
  # sees the awk program.
  my $pawd = `which pawd | head -1 | awk '{print \$1}'`;
  chomp($pawd);

  if ($pawd && -e $pawd){ $pwdcmd = $pawd; }
  return $pwdcmd;
}
sub insert_ranges_to_featlist {
  # Attach a [min, max] range to every feature of $featlist.
  #
  # Arguments:
  #   $featlist - hash ref with a "names" array ref (one entry per feature)
  #   $ranges   - array ref of --range strings such as "tm:-3..3,0..1"
  #               (may be undef); a name applies to all following pairs of
  #               the same string until another name is given
  # Returns $featlist with "mins" and "maxs" array refs filled in. Features
  # without a supplied range get 0.0 .. 1.0. Ranges given for the same name
  # are consumed in order, one per occurrence of that name.
  # Dies on an unknown name (not a key of %ABBR2FULL), a malformed bound or
  # a pair without a name.
  my $featlist = shift;
  my $ranges = shift;

  $ranges = [] if !defined $ranges;

  # first collect the ranges from options
  my $niceranges;
  foreach my $range (@$ranges) {
    my $name = undef;
    foreach my $namedpair (split /,/, $range) {
      if ($namedpair =~ /^(.*?):/) {
        $name = $1;
        $namedpair =~ s/^.*?://;
        die "Unrecognized name '$name' in --range=$range"
          if !defined $ABBR2FULL{$name};
      }
      my ($min, $max) = split /\.\./, $namedpair;
      die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/;
      # validate the upper bound itself (it used to re-test $min)
      die "Bad max '" . (defined $max ? $max : '') . "' in --range=$range"
        if !defined $max || $max !~ /^-?[0-9.]+$/;
      die "No name given in --range=$range" if !defined $name;
      push @{$niceranges->{$name}}, [$min, $max];
    }
  }

  # now populate featlist
  my $nfeatures = scalar(@{$featlist->{"names"}});
  for (my $i = 0; $i < $nfeatures; $i++) {
    my $name = $featlist->{"names"}->[$i];
    my $min = 0.0;
    my $max = 1.0;
    if (defined $niceranges->{$name}) {
      my $minmax = shift @{$niceranges->{$name}};
      ($min, $max) = @$minmax if defined $minmax;
    }
    $featlist->{"mins"}->[$i] = $min;
    $featlist->{"maxs"}->[$i] = $max;
  }
  return $featlist;
}
sub exit_submit_thu_host {
  # Write a small shell script "exitjob<pid>.sh" that runs $qsubwrapper_exit
  # for an earlier job, then submit that script with qsub on the submit host
  # through ssh.
  #
  # Positional arguments:
  #   8 args: submithost, run, idx, batch_and_join, stdout, stderr,
  #           jidfile, pidfile
  #   9 args: the same plus prevjid, a whitespace-separated list of job ids
  #           the exit job is held on (a single -1 means "no hold")
  #   6 args: still accepted, but without a pidfile nothing can be drafted;
  #           a message is printed and the sub returns.
  # Uses the file-level $___WORKING_DIR, $qsubwrapper, $qsubwrapper_exit and
  # $queueparameters.

  my $argvlen = @_;
  my ($submithost, $run, $idx, $batch_and_join, $stdout, $stderr,
      $jidfile, $pidfile, $prevjid);

  if ($argvlen == 6){
    ($submithost,$run,$idx,$batch_and_join,$stdout,$stderr) = @_;
  } elsif ($argvlen == 8){
    ($submithost,$run,$idx,$batch_and_join,$stdout,$stderr,$jidfile,$pidfile) = @_;
  } elsif ($argvlen == 9){
    ($submithost,$run,$idx,$batch_and_join,$stdout,$stderr,$jidfile,$pidfile,$prevjid) = @_;
  }

  # Without a pidfile the `tail` below would be run with no file name and
  # read from standard input instead.
  unless (defined $pidfile){
    print STDERR "exit_submit_thu_host: no pidfile given ($argvlen arguments), nothing submitted\n";
    return;
  }

  # parse prevjid ########################
  my @prevjidarray = ();
  if (defined $prevjid){
    $prevjid =~ s/^\s+|\s+$//g;
    @prevjidarray = split(/\s+/,$prevjid);
  }
  ########################################

  # read pid from file, and draft exit script ##################
  my $pid = `tail -n 1 $pidfile`;
  chomp($pid);

  my $exitscript = "exitjob$pid.sh";
  open(my $out, '>', $exitscript) or die "Can't write $exitscript: $!";

  my $scriptheader="\#\!/bin/bash\n\#\$ -S /bin/sh\n# Both lines are needed to invoke base\n#the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n";
  $scriptheader .="uname -a\n\n";
  $scriptheader .="cd $___WORKING_DIR\n\n";

  print $out $scriptheader;

  print $out "if $qsubwrapper_exit -stdout=$stdout -stderr=$stderr -jidfile=$jidfile -pidfile=$pidfile > exitjob$pid.out 2> exitjob$pid.err ; then\n"
           . "    echo 'succeeded'\n"
           . "else\n"
           . "    echo failed with exit status \$\?\n"
           . "    die=1\n"
           . "fi\n";
  print $out "\n\n";

  close($out) or die "Can't close $exitscript: $!";
  # setting permissions of the script
  chmod(oct(755),$exitscript);
  ##############################################################

  # log in submit host #########################################
  chomp(my $my_username = `whoami`);
  my $ssh = Net::OpenSSH::Compat::Perl->new($submithost, debug=>0);

  print STDERR "submithost = $submithost\n";
  print STDERR "my username = $my_username\n";
  print STDERR "qusbwrapper at = $qsubwrapper\n";

  $ssh->login();
  ##############################################################

  my $qsubcmd;
  if ($argvlen == 9){
    my $hj = "";
    if (@prevjidarray == 1 && $prevjid != -1){
      $hj = "-hold_jid $prevjid";
    } elsif (@prevjidarray > 1){
      $hj = "-hold_jid " . join(" -hold_jid ", @prevjidarray);
    }
    $qsubcmd="qsub $queueparameters -o /dev/null -e /dev/null $hj exitjob$pid.sh > exitjob$pid.log 2>&1";
  } else {
    $qsubcmd="qsub $queueparameters -o /dev/null -e /dev/null exitjob$pid.sh > exitjob$pid.log 2>&1";
  }

  # announce the command before running it, so the log shows what was
  # attempted even if the remote call never returns
  print STDERR "Executing $qsubcmd in $___WORKING_DIR\n";
  $ssh->cmd("cd $___WORKING_DIR && $qsubcmd");
}
#######################
##print version
sub version(){
  # Write the script version to STDERR and terminate with exit status 1.
  # (previous banner: "version 1.0 (29-07-2006)")
  my $banner = "version 1.1 (31-07-2006)\n";
  print STDERR $banner;
  exit(1);
}
#printparameters
sub print_parameters(){
  # Debug helper: report on STDERR the files given for stdout, stderr and
  # the job id, the qsub name, the queue parameters and the pass-through
  # arguments, then terminate with exit status 1.
  # print STDERR "command: $cmd\n";
  if (defined($cmdout)){ print STDERR "file for stdout: $cmdout\n"; }
  else { print STDERR "file for stdout is not defined, stdout is discarded\n"; }

  # this line reports the stderr file (it used to be labelled "stdout")
  if (defined($cmderr)){ print STDERR "file for stderr: $cmderr\n"; }
  else { print STDERR "file for stderr is not defined, stderr is discarded\n"; }

  if (defined($jidfile)){ print STDERR "file for submit job id: $jidfile\n"; }
  else { print STDERR "file for submit job id is not defined, jidfile is discarded\n"; }

  print STDERR "Qsub name: $qsubname\n";
  print STDERR "Queue parameters: $queueparameters\n";
  print STDERR "parameters directly passed to cmd: $parameters\n";
  exit(1);
}
"" : "-sync y"; +#### never run in syn mode +###my $maysync = ""; +### +###my $qsubcmd = ""; +#### create the qsubcmd to submit to the queue with the parameter "-b yes" +####my $qsubcmd="qsub $queueparameters $maysync -V -o $qsubout -e $qsuberr -N $qsubname -b yes $jobscript > $jobscript.log 2>&1"; +### +#### add -b yes if not yet defined, otherwise leave empty +###$queueparameters .= " -b yes " if (index($queueparameters," -b ")==-1); +### +### +###if (defined $prevjid && $prevjid!=-1 && $prevjidarraysize == 1) { +### $qsubcmd="qsub $queueparameters $maysync -V -hold_jid $prevjid -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1"; +###} elsif (defined $prevjid && $prevjidarraysize > 1) { +### my $hj = "-hold_jid " . join(" -hold_jid ", @prevjidarray); +### # print STDERR "hj is $hj\n"; +### $qsubcmd="qsub $queueparameters $maysync -V $hj -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1"; +###} else { +### $qsubcmd="qsub $queueparameters $maysync -V -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1"; +###} +### +###print "submitting $qsubcmd\n"; +### +####run the qsubcmd +### +###safesystem($qsubcmd) or die; +### +####getting id of submitted job############# +###my $res; +###open (IN,"$jobscript.log") or die "Can't read main job id: $jobscript.log"; +###chomp($res=); +###my @arrayStr = split(/\s+/,$res); +###my $id=$arrayStr[2]; +###die "Failed to get job id from $jobscript.log, got: $res" +### if $id !~ /^[0-9]+$/; +###close(IN); +############################################ +###print STDERR " res:$res\n"; +###print STDERR " id:$id\n"; +### +###open (JIDOUT,">$jidfile") or die "Can't open jid file to write"; +###print JIDOUT "$id\n"; +###close(JIDOUT); +### +###open (JOBNUMOUT,">$jidfile.job") or die "Can't open id.job file to write"; +###print JOBNUMOUT "$$\n"; +###close(JOBNUMOUT); +### +###if ($old_sge) { +### # need to workaround -sync, add another job that will wait for the main one +### # prepare a 
sub check_exit_status(){
  # Scan the job's stdout file ($qsubout) for the marker line the job
  # script prints when the wrapped command fails.
  # Returns 1 when "failed with exit status" is found, 0 otherwise.
  # Dies when $qsubout cannot be opened.
  my $failure=0;

  print STDERR "check_exit_status of submitted job $jid from file $qsubout\n";
  open(my $in, '<', $qsubout) or die "Can't read $qsubout";
  # read the file line by line; the loop needs an explicit readline
  while (my $line = <$in>){
    $failure=1 if ($line =~ /failed with exit status/);
  }
  close($in);
  return $failure;
}
sub safesystem {
  # Run a command through system() and log it on STDERR.
  # Returns true when the command exits with status 0, false otherwise.
  # Terminates the whole script (exit 1) when the command cannot be
  # started or is killed by a signal.
  print STDERR "Executing: @_\n";
  system(@_);
  my $status = $?;

  if ($status == -1) {
    print STDERR "Failed to execute: @_\n $!\n";
    exit(1);
  }

  my $signal = $status & 127;
  if ($signal) {
    my $core = ($status & 128) ? 'with' : 'without';
    printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
      $signal, $core;
    exit(1);
  }

  my $exitcode = $status >> 8;
  if ($exitcode) {
    print STDERR "Exit code: $exitcode\n";
  }
  return ! $exitcode;
}
sub init(){
  # Parse the command line into the file-level option variables, keep the
  # unrecognised arguments (pass_through) as the parameters handed to the
  # command, split the -prevjid value into a list of job ids, and handle
  # the -version / -help / -debug switches.
  use Getopt::Long qw(:config pass_through);

  my @optspec = (
    'version'            => \$version,
    'help'               => \$help,
    'debug'              => \$dbg,
    'qsub-prefix=s'      => \$qsubname,
    'command=s'          => \$cmd,
    'stdout=s'           => \$cmdout,
    'stderr=s'           => \$cmderr,
    'jidfile=s'          => \$jidfile,
    'prevjid=s'          => \$prevjid,
    'queue-parameters=s' => \$queueparameters,
    'old-sge'            => \$old_sge,
  );
  unless (GetOptions(@optspec)) {
    exit(1);
  }

  # whatever GetOptions did not consume is passed on to the command
  $parameters = "@ARGV";

  # trim the job id list, then split it on whitespace
  $prevjid =~ s/^\s+//;
  $prevjid =~ s/\s+$//;
  @prevjidarray = split(/\s+/, $prevjid);
  $prevjidarraysize = @prevjidarray;

  if ($version) { version(); }
  if ($help)    { usage(); }
  if ($dbg)     { print_parameters(); }
}
#printparameters
sub print_parameters(){
  # Debug helper: report on STDERR the command, the files given for stdout,
  # stderr and the job id, the qsub name, the queue parameters and the
  # pass-through arguments, then terminate with exit status 1.
  print STDERR "command: $cmd\n";
  if (defined($cmdout)){ print STDERR "file for stdout: $cmdout\n"; }
  else { print STDERR "file for stdout is not defined, stdout is discarded\n"; }

  # this line reports the stderr file (it used to be labelled "stdout")
  if (defined($cmderr)){ print STDERR "file for stderr: $cmderr\n"; }
  else { print STDERR "file for stderr is not defined, stderr is discarded\n"; }

  if (defined($jidfile)){ print STDERR "file for submit job id: $jidfile\n"; }
  else { print STDERR "file for submit job id is not defined, jidfile is discarded\n"; }

  print STDERR "Qsub name: $qsubname\n";
  print STDERR "Queue parameters: $queueparameters\n";
  print STDERR "parameters directly passed to cmd: $parameters\n";
  exit(1);
}

#script creation
sub preparing_script(){
  # Write the job script $jobscript. The script changes to $workingdir, runs
  # "$cmd $parameters" with stdout/stderr captured under $tmpdir, prints
  # "succeeded" or "failed with exit status N", moves the captured output to
  # $cmdout / $cmderr when those are defined (deletes it otherwise), and
  # exits with status 1 when the command failed.
  # Dies when the script cannot be written.
  my $scriptheader="\#\!/bin/bash\n# the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n";
  $scriptheader.="uname -a\n\n";

  $scriptheader.="cd $workingdir\n\n";

  open(my $out, '>', $jobscript) or die "Can't write job script $jobscript: $!";
  print $out $scriptheader;

  print $out "if $cmd $parameters > $tmpdir/cmdout$uid 2> $tmpdir/cmderr$uid ; then\n"
           . "    echo 'succeeded'\n"
           . "else\n"
           . "    echo failed with exit status \$\?\n"
           . "    die=1\n"
           . "fi\n";

  if (defined $cmdout){
    print $out "mv -f $tmpdir/cmdout$uid $cmdout || echo failed to preserve the log: $tmpdir/cmdout$uid\n\n";
  }
  else{
    print $out "rm -f $tmpdir/cmdout$uid\n\n";
  }

  if (defined $cmderr){
    print $out "mv -f $tmpdir/cmderr$uid $cmderr || echo failed to preserve the log: $tmpdir/cmderr$uid\n\n";
  }
  else{
    print $out "rm -f $tmpdir/cmderr$uid\n\n";
  }

  # Both sides carry the "x" guard: the previous test compared "x$die"
  # with "1", which can never match, so a failed command still left the
  # job with exit status 0.
  print $out "if [ \"x\$die\" = \"x1\" ]; then exit 1; fi\n";
  close($out) or die "Can't close job script $jobscript: $!";

  #setting permissions of the script
  chmod(oct(755),$jobscript);
}
join(" -hold_jid ", @prevjidarray); + # print STDERR "hj is $hj\n"; + $qsubcmd="qsub $queueparameters $maysync -V $hj -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1"; +} else { + $qsubcmd="qsub $queueparameters $maysync -V -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1"; +} + +print "submitting $qsubcmd\n"; + +#run the qsubcmd + +safesystem($qsubcmd) or die; + +#getting id of submitted job############# +my $res; +open (IN,"$jobscript.log") or die "Can't read main job id: $jobscript.log"; +chomp($res=); +my @arrayStr = split(/\s+/,$res); +my $id=$arrayStr[2]; +die "Failed to get job id from $jobscript.log, got: $res" + if $id !~ /^[0-9]+$/; +close(IN); +######################################### +print STDERR " res:$res\n"; +print STDERR " id:$id\n"; + +open (JIDOUT,">$jidfile") or die "Can't open jid file to write"; +print JIDOUT "$id\n"; +close(JIDOUT); + +open (PIDOUT,">$jidfile.pid") or die "Can't open id.pid file to write"; +print PIDOUT "$uid\n"; +close(PIDOUT); + +if ($old_sge) { + # need to workaround -sync, add another job that will wait for the main one + # prepare a fake waiting script + my $syncscript = "$jobscript.sync_workaround_script.sh"; + safesystem("echo 'date' > $syncscript") or die; + + my $checkpointfile = "$jobscript.sync_workaround_checkpoint"; + + # ensure checkpoint does not exist + safesystem("\\rm -f $checkpointfile") or die; + + # start the 'hold' job, i.e. 
sub kill_all_and_quit(){
  # Installed as the INT handler: ask the grid engine to delete the
  # submitted job ($id) with qdel, report on STDERR, and terminate with
  # exit status 1.
  my $qdel = "qdel $id";

  print STDERR "kill_all_and_quit\n", "$qdel\n";
  safesystem($qdel);

  print STDERR "The submitted jobs died not correctly\n",
               "Send qdel signal to the submitted jobs\n";

  exit(1);
}
# look for the correct pwdcmd (pwd by default, pawd if it exists)
# I assume that pwd always exists
sub getPwdCmd(){
  # Returns the command used to print the working directory: the path that
  # `which pawd` reports when that path exists, otherwise "pwd".
  my $pwdcmd = "pwd";

  # \$1 is escaped on purpose: inside backticks Perl would otherwise
  # interpolate its own $1 (last regex capture) before the shell ever
  # sees the awk program.
  my $pawd = `which pawd | head -1 | awk '{print \$1}'`;
  chomp($pawd);

  if ($pawd && -e $pawd){ $pwdcmd = $pawd; }
  return $pwdcmd;
}
+# 13 Feb 2007 Better handling of default values for lambda, now works with multiple +# models and lexicalized reordering +# 11 Oct 2006 Handle different input types through parameter --inputype=[0|1] +# (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi) +# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table) +# useful if binary phrase tables are used (Nicola Bertoldi) +# 28 Aug 2006 Use either closest or average or shortest (default) reference +# length as effective reference length +# Use either normalization or not (default) of texts (Nicola Bertoldi) +# 31 Jul 2006 move gzip run*.out to avoid failure wit restartings +# adding default paths +# 29 Jul 2006 run-filter, score-nbest and mert run on the queue (Nicola; Ondrej had to type it in again) +# 28 Jul 2006 attempt at foolproof usage, strong checking of input validity, merged the parallel and nonparallel version (Ondrej Bojar) +# 27 Jul 2006 adding the safesystem() function to handle with process failure +# 22 Jul 2006 fixed a bug about handling relative path of configuration file (Nicola Bertoldi) +# 21 Jul 2006 adapted for Moses-in-parallel (Nicola Bertoldi) +# 18 Jul 2006 adapted for Moses and cleaned up (PK) +# 21 Jan 2005 unified various versions, thorough cleanup (DWC) +# now indexing accumulated n-best list solely by feature vectors +# 14 Dec 2004 reimplemented find_threshold_points in C (NMD) +# 25 Oct 2004 Use either average or shortest (default) reference +# length as effective reference length (DWC) +# 13 Oct 2004 Use alternative decoders (DWC) +# Original version by Philipp Koehn + +use strict; +use Net::OpenSSH::Compat::Perl; +use FindBin qw($Bin); +use File::Basename; +use File::Path; +use File::Spec; +use Cwd; + +my $SCRIPTS_ROOTDIR = $Bin; +$SCRIPTS_ROOTDIR =~ s/\/training$//; +$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"}); + +## We preserve this bit of comments to keep the traditional weight ranges. 
+# "w" => [ [ 0.0, -1.0, 1.0 ] ], # word penalty +# "d" => [ [ 1.0, 0.0, 2.0 ] ], # lexicalized reordering model +# "lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model +# "g" => [ [ 1.0, 0.0, 2.0 ], # generation model +# [ 1.0, 0.0, 2.0 ] ], +# "tm" => [ [ 0.3, 0.0, 0.5 ], # translation model +# [ 0.2, 0.0, 0.5 ], +# [ 0.3, 0.0, 0.5 ], +# [ 0.2, 0.0, 0.5 ], +# [ 0.0,-1.0, 1.0 ] ], # ... last weight is phrase penalty +# "lex"=> [ [ 0.1, 0.0, 0.2 ] ], # global lexical model +# "I" => [ [ 0.0,-1.0, 1.0 ] ], # input lattice scores + + + +# moses.ini file uses FULL names for lambdas, while this training script +# internally (and on the command line) uses ABBR names. +my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w + g=weight-generation lex=weight-lex I=weight-i); +my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP; +my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP; + +my $minimum_required_change_in_weights = 0.00001; + # stop if no lambda changes more than this + +my $verbose = 0; +my $usage = 0; # request for --help + +# We assume that if you don't specify working directory, +# we set the default is set to `pwd`/mert-work + +my $___FIRSTJOBWAITID = undef; # wait for a job in the grid before first job starts + +my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work"); +my $___DEV_F = undef; # required, input text to decode +my $___DEV_E = undef; # required, basename of files with references +my $___DECODER = undef; # required, pathname to the decoder executable +my $___CONFIG = undef; # required, pathname to startup ini file +my $___N_BEST_LIST_SIZE = 100; +my $___LATTICE_SAMPLES = 0; +my $queue_flags = "-hard"; # extra parameters for parallelizer + # the -l ws0ssmt was relevant only to JHU 2006 workshop +my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial) +my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder +my $qsubprefix = ""; +my $continue = 0; # should we try 
to continue from the last saved step? +my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert) +my $___FILTER_PHRASE_TABLE = 1; # filter phrase table +my $___PREDICTABLE_SEEDS = 0; +my $___START_WITH_HISTORIC_BESTS = 0; # use best settings from all previous iterations as starting points [Foster&Kuhn,2009] +my $___RANDOM_DIRECTIONS = 0; # search in random directions only +my $___NUM_RANDOM_DIRECTIONS = 0; # number of random directions, also works with default optimizer [Cer&al.,2008] +my $___PAIRWISE_RANKED_OPTIMIZER = 0; # use Hopkins&May[2011] +my $___PRO_STARTING_POINT = 0; # get a starting point from pairwise ranked optimizer +my $___RANDOM_RESTARTS = 20; +my $___HISTORIC_INTERPOLATION = 0; # interpolate optimize weights with previous iteration's weights [Hopkins&May,2011,5.4.3] +my $__THREADS = 0; + +# Parameter for effective reference length when computing BLEU score +# Default is to use shortest reference +# Use "--shortest" to use shortest reference length +# Use "--average" to use average reference length +# Use "--closest" to use closest reference length +# Only one between --shortest, --average and --closest can be set +# If more than one choice the defualt (--shortest) is used +my $___SHORTEST = 0; +my $___AVERAGE = 0; +my $___CLOSEST = 0; + +# Use "--nocase" to compute case-insensitive scores +my $___NOCASE = 0; + +# Use "--nonorm" to non normalize translation before computing scores +my $___NONORM = 0; + +# set 0 if input type is text, set 1 if input type is confusion network +my $___INPUTTYPE = 0; + + +my $mertdir = undef; # path to new mert directory +my $mertargs = undef; # args to pass through to mert & extractor +my $mertmertargs = undef; # args to pass through to mert only +my $extractorargs = undef; # args to pass through to extractor only +my $filtercmd = undef; # path to filter-model-given-input.pl +my $submithost = undef; +my $filterfile = undef; +my $qsubwrapper = undef; +my 
$qsubwrapper_exit = undef; +my $moses_parallel_cmd = undef; +my $poll_decoder_cmd = undef; +my $zipextcmd = undef; +my $zipextargs = undef; +my $processresultcmd = undef; +my $processresultargs = undef; +my $old_sge = 0; # assume sge<6.0 +my $___CONFIG_ORIG = undef; # pathname to startup ini file before filtering +my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on + # if undef work on all features + # (others are fixed to the starting values) +my $___RANGES = undef; +my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loading data (default =-1) + # -1 means all previous, i.e. from iteration 1 + # 0 means no previous data, i.e. from actual iteration + # 1 means 1 previous data , i.e. from the actual iteration and from the previous one + # and so on +my $maximum_iterations = 25; +my $cmd = undef; + +##################### +my $processfeatlistcmd = undef; +my $processfeatlistargs = undef; +my $createconfigcmd = undef; +my $createconfigargs = undef; +my $decoderargs = undef; +##################### + +use Getopt::Long; +GetOptions( + "prevjid=i" => \$___FIRSTJOBWAITID, + "working-dir=s" => \$___WORKING_DIR, + "input=s" => \$___DEV_F, + "inputtype=i" => \$___INPUTTYPE, + "refs=s" => \$___DEV_E, + "decoder=s" => \$___DECODER, + "config=s" => \$___CONFIG, + "nbest=i" => \$___N_BEST_LIST_SIZE, + "lattice-samples=i" => \$___LATTICE_SAMPLES, + "submithost=s" => \$submithost, + "queue-flags=s" => \$queue_flags, + "jobs=i" => \$___JOBS, + "decoder-flags=s" => \$___DECODER_FLAGS, + "continue" => \$continue, + "skip-decoder" => \$skip_decoder, + "shortest" => \$___SHORTEST, + "average" => \$___AVERAGE, + "closest" => \$___CLOSEST, + "nocase" => \$___NOCASE, + "nonorm" => \$___NONORM, + "help" => \$usage, + "verbose" => \$verbose, + "mertdir=s" => \$mertdir, + "mertargs=s" => \$mertargs, + "extractorargs=s" => \$extractorargs, + "mertmertargs=s" => \$mertmertargs, + "rootdir=s" => \$SCRIPTS_ROOTDIR, + 
"filtercmd=s" => \$filtercmd, # allow to override the default location + "filterfile=s" => \$filterfile, # input to filtering script (useful for lattices/confnets) + "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location + "mosesparallelcmd=s" => \$moses_parallel_cmd, # allow to override the default location + "old-sge" => \$old_sge, #passed to moses-parallel + "filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # (dis)allow of phrase tables + "predictable-seeds" => \$___PREDICTABLE_SEEDS, # make random restarts deterministic + "historic-bests" => \$___START_WITH_HISTORIC_BESTS, # use best settings from all previous iterations as starting points + "random-directions" => \$___RANDOM_DIRECTIONS, # search only in random directions + "number-of-random-directions=i" => \$___NUM_RANDOM_DIRECTIONS, # number of random directions + "random-restarts=i" => \$___RANDOM_RESTARTS, # number of random restarts + "activate-features=s" => \$___ACTIVATE_FEATURES, #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values) + "range=s@" => \$___RANGES, + "prev-aggregate-nbestlist=i" => \$prev_aggregate_nbl_size, #number of previous step to consider when loading data (default =-1, i.e. 
all previous) + "maximum-iterations=i" => \$maximum_iterations, + "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER, + "pro-starting-point" => \$___PRO_STARTING_POINT, + "historic-interpolation=f" => \$___HISTORIC_INTERPOLATION, + "threads=i" => \$__THREADS +) or exit(1); + +# the 4 required parameters can be supplied on the command line directly +# or using the --options +if (scalar @ARGV == 4) { + # required parameters: input_file references_basename decoder_executable + $___DEV_F = shift; + $___DEV_E = shift; + $___DECODER = shift; + $___CONFIG = shift; +} + +if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER || !defined $___CONFIG) { + print STDERR "usage: $0 input-text references decoder-executable decoder.ini +Options: + --prevjid=i ... previous job SGE ID to wait before first job starts + --working-dir=mert-dir ... where all the files are created + --nbest=100 ... how big nbestlist to generate + --lattice-samples ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010) + --jobs=N ... set this to anything to run moses in parallel + --mosesparallelcmd=STR ... use a different script instead of moses-parallel + --submithost=STRING ... submithost from where qsub operates + --queue-flags=STRING ... anything you with to pass to qsub, eg. + '-l ws06osssmt=true'. The default is: '-hard' + To reset the parameters, please use + --queue-flags=' ' + (i.e. a space between the quotes). + --decoder-flags=STRING ... extra parameters for the decoder + --continue ... continue from the last successful iteration + --skip-decoder ... skip the decoder run for the first time, + assuming that we got interrupted during + optimization + --shortest --average --closest + ... Use shortest/average/closest reference length + as effective reference length (mutually exclusive) + --nocase ... Do not preserve case information; i.e. + case-insensitive evaluation (default is false). + --nonorm ... Do not use text normalization (flag is not active, + i.e. 
text is NOT normalized) + --filtercmd=STRING ... path to filter-model-given-input.pl + --filterfile=STRING ... path to alternative to input-text for filtering + model. useful for lattice decoding + --rootdir=STRING ... where do helpers reside (if not given explicitly) + --mertdir=STRING ... path to new mert implementation + --mertargs=STRING ... extra args for both extractor and mert + --extractorargs=STRING ... extra args for extractor only + --mertmertargs=STRING ... extra args for mert only + --scorenbestcmd=STRING ... path to score-nbest.py + --old-sge ... passed to parallelizers, assume Grid Engine < 6.0 + --inputtype=[0|1|2] ... Handle different input types: (0 for text, + 1 for confusion network, 2 for lattices, + default is 0) + --no-filter-phrase-table ... disallow filtering of phrase tables + (useful if binary phrase tables are available) + --random-restarts=INT ... number of random restarts (default: 20) + --predictable-seeds ... provide predictable seeds to mert so that random + restarts are the same on every run + --range=tm:0..1,-1..1 ... specify min and max value for some features + --range can be repeated as needed. + The order of the various --range specifications + is important only within a feature name. + E.g.: + --range=tm:0..1,-1..1 --range=tm:0..2 + is identical to: + --range=tm:0..1,-1..1,0..2 + but not to: + --range=tm:0..2 --range=tm:0..1,-1..1 + --activate-features=STRING ... comma-separated list of features to optimize, + others are fixed to the starting values + default: optimize all features + example: tm_0,tm_4,d_0 + --prev-aggregate-nbestlist=INT ... number of previous step to consider when + loading data (default = $prev_aggregate_nbl_size) + -1 means all previous, i.e. from iteration 1 + 0 means no previous data, i.e. only the + current iteration + N means this and N previous iterations + + --maximum-iterations=ITERS ... Maximum number of iterations. Default: $maximum_iterations + --random-directions ... 
search only in random directions + --number-of-random-directions=int ... number of random directions + (also works with regular optimizer, default: 0) + --pairwise-ranked ... Use PRO for optimisation (Hopkins and May, emnlp 2011) + --pro-starting-point ... Use PRO to get a starting point for MERT + --threads=NUMBER ... Use multi-threaded mert (must be compiled in). + --historic-interpolation ... Interpolate optimized weights with prior iterations' weight + (parameter sets factor [0;1] given to current weights) +"; + exit 1; +} + + +# Check validity of input parameters and set defaults if needed + +print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n"; + +# path of script for filtering phrase tables and running the decoder +$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd; + +if ( ! -x $filtercmd && ! $___FILTER_PHRASE_TABLE) { + print STDERR "Filtering command not found: $filtercmd.\n"; + print STDERR "Use --filtercmd=PATH to specify a valid one or --no-filter-phrase-table\n"; + exit 1; +} + +$qsubwrapper = "$SCRIPTS_ROOTDIR/generic/qsub-wrapper-sge-nosync.pl" if !defined $qsubwrapper; + +$qsubwrapper_exit = "$SCRIPTS_ROOTDIR/generic/qsub-wrapper-exit-sge-nosync.pl" if !defined $qsubwrapper_exit; + +$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel-sge-nosync.pl" + if !defined $moses_parallel_cmd; + +if (!defined $mertdir) { + $mertdir = "$SCRIPTS_ROOTDIR/../mert"; + print STDERR "Assuming --mertdir=$mertdir\n"; +} + +my $mert_extract_cmd = "$mertdir/extractor"; +my $mert_mert_cmd = "$mertdir/mert"; +my $mert_pro_cmd = "$mertdir/pro"; + +die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd; +die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd; +die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd; + +my $pro_optimizer = "$mertdir/megam_i686.opt"; # or set to your installation +if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! 
-x $pro_optimizer) { + print "did not find $pro_optimizer, installing it in $mertdir\n"; + `cd $mertdir; wget http://www.cs.utah.edu/~hal/megam/megam_i686.opt.gz;`; + `gunzip $pro_optimizer.gz`; + `chmod +x $pro_optimizer`; + die("ERROR: Installation of megam_i686.opt failed! Install by hand from http://www.cs.utah.edu/~hal/megam/") unless -x $pro_optimizer; +} + +$mertargs = "" if !defined $mertargs; + +my $scconfig = undef; +if ($mertargs =~ /\-\-scconfig\s+(.+?)(\s|$)/){ + $scconfig=$1; + $scconfig =~ s/\,/ /g; + $mertargs =~ s/\-\-scconfig\s+(.+?)(\s|$)//; +} + +# handling reference lengh strategy +if (($___CLOSEST + $___AVERAGE + $___SHORTEST) > 1){ + die "You can specify just ONE reference length strategy (closest or shortest or average) not both\n"; +} + +if ($___SHORTEST){ + $scconfig .= " reflen:shortest"; +}elsif ($___AVERAGE){ + $scconfig .= " reflen:average"; +}elsif ($___CLOSEST){ + $scconfig .= " reflen:closest"; +} + +# handling case-insensitive flag +if ($___NOCASE) { + $scconfig .= " case:false"; +}else{ + $scconfig .= " case:true"; +} +$scconfig =~ s/^\s+//; +$scconfig =~ s/\s+$//; +$scconfig =~ s/\s+/,/g; + +$scconfig = "--scconfig $scconfig" if ($scconfig); + +my $mert_extract_args=$mertargs; +$mert_extract_args .=" $scconfig"; +$mert_extract_args .=" $extractorargs"; + +$mertmertargs = "" if !defined $mertmertargs; + +my $mert_mert_args="$mertargs $mertmertargs"; +$mert_mert_args =~ s/\-+(binary|b)\b//; +$mert_mert_args .=" $scconfig"; +if ($___ACTIVATE_FEATURES){ $mert_mert_args .=" -o \"$___ACTIVATE_FEATURES\""; } + +my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd); +die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd; +die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd; +die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper; +die "Not executable: $___DECODER" if ! 
-x $___DECODER; + +my $input_abs = ensure_full_path($___DEV_F); +die "File not found: $___DEV_F (interpreted as $input_abs)." + if ! -e $input_abs; +$___DEV_F = $input_abs; + +# Option to pass to qsubwrapper and moses-parallel +my $pass_old_sge = $old_sge ? "-old-sge" : ""; + +my $decoder_abs = ensure_full_path($___DECODER); +die "File not executable: $___DECODER (interpreted as $decoder_abs)." + if ! -x $decoder_abs; +$___DECODER = $decoder_abs; + +my $ref_abs = ensure_full_path($___DEV_E); +# check if English dev set (reference translations) exist and store a list of all references +my @references; +if (-e $ref_abs) { + push @references, $ref_abs; +} +else { + # if multiple file, get a full list of the files + my $part = 0; + if (! -e $ref_abs."0" && -e $ref_abs.".ref0") { + $ref_abs .= ".ref"; + } + while (-e $ref_abs.$part) { + push @references, $ref_abs.$part; + $part++; + } + die("Reference translations not found: $___DEV_E (interpreted as $ref_abs)") unless $part; +} + +my $config_abs = ensure_full_path($___CONFIG); +die "File not found: $___CONFIG (interpreted as $config_abs)." + if ! 
-e $config_abs; +$___CONFIG = $config_abs; + +# moses should use our config +if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) / +|| $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) / +|| $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(generation-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) / +) { + die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files."; +} + +# as weights are normalized in the next steps (by cmert) +# normalize initial LAMBDAs, too +my $need_to_normalize = 1; + +#store current directory and create the working directory (if needed) +my $cwd = `pawd 2>/dev/null`; +if(!$cwd){$cwd = `pwd`;} +chomp($cwd); + +mkpath($___WORKING_DIR); + +{ +# open local scope + +#chdir to the working directory +chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR"; + +# fixed file names +my $mert_outfile = "mert.out"; +my $mert_logfile = "mert.log"; +my $weights_in_file = "init.opt"; +my $weights_out_file = "weights.txt"; + +# set start run +my $start_run = 1; ## START FROM run>1 is not supported +my $bestpoint = undef; +my $devbleu = undef; +my $sparse_weights_file = undef; +my $jobid = -1; + +my $prev_feature_file = undef; +my $prev_score_file = undef; +my $prev_init_file = undef; + + +######################### +# set jobid to trace different jobs +my $prevjid = undef; + + +################################################## +# STEP 1: FILTER PHRASE TABLE #################### +################################################## + +if ($___FILTER_PHRASE_TABLE) { + my $outdir = "filtered"; + if (-e "$outdir/moses.ini") { + print STDERR "Assuming the tables are already filtered, reusing $outdir/moses.ini\n"; + } + else { + # filter the phrase tables with respect to input, use --decoder-flags + 
print STDERR "filtering the phrase tables... ".`date`;
+    my $___FILTER_F = $___DEV_F;
+    $___FILTER_F = $filterfile if (defined $filterfile);
+    $cmd = "$filtercmd ./$outdir $___CONFIG $___FILTER_F";
+    if (!defined $___FIRSTJOBWAITID) {
+      # no --prevjid given: submit the filtering job without a job to wait for
+      $qsubprefix = "filph";
+      &submit_or_exec_thu_host($submithost,"","",$cmd,"","filterphrases.out","filterphrases.err","filterphrases.id");
+    } else {
+      # --prevjid given: pass it as the last argument so filtering is chained after that job
+      $qsubprefix = "filph";
+      &submit_or_exec_thu_host($submithost,"","",$cmd,"","filterphrases.out","filterphrases.err","filterphrases.id",$___FIRSTJOBWAITID);
+    }
+    chomp($jobid=`tail -n 1 filterphrases.id`);   # last line of the .id file is taken as the job id
+    $prevjid = $jobid;
+    print STDERR "JOBID for filterphrases is $prevjid\n";
+    ## clear up tmp
+    &exit_submit_thu_host($submithost,"","","","filterphrases.out","filterphrases.err","filterphrases.id","filterphrases.id.pid",$prevjid);
+  }
+
+  # make a backup copy of startup ini filepath
+  $___CONFIG_ORIG = $___CONFIG;
+  # the decoder should now use the filtered model
+  $___CONFIG = "$outdir/moses.ini";
+}
+else{
+  # do not filter phrase tables (useful if binary phrase tables are available)
+  # use the original configuration file
+  $___CONFIG_ORIG = $___CONFIG;
+}
+
+#################################################
+######## STEP 2: CHECK moses.ini ################
+#################################################
+
+
+# path of the script submitted for this step (presumably validates moses.ini and collects the feature list -- confirm) ----
+$processfeatlistcmd="$SCRIPTS_ROOTDIR/training/sge-nosync/process-featlist-sge-nosync.pl" if !defined $processfeatlistcmd;
+
+$processfeatlistargs = "" if !defined $processfeatlistargs;
+$processfeatlistargs .= join("", map { " --range $_" } @$___RANGES) if (defined $___RANGES);   # "range=s@" stores an array ref: emit one --range per entry
+$processfeatlistargs = "$processfeatlistargs --decoder-flags $___DECODER_FLAGS" if ($___DECODER_FLAGS ne "");   # NOTE(review): flags are passed unquoted -- confirm the receiving script copes with multi-word values
+
+$cmd =
"$processfeatlistcmd $___DECODER $___CONFIG --inputtype $___INPUTTYPE $processfeatlistargs"; + +if (defined $prevjid) { + $qsubprefix = "proclist"; + &submit_or_exec_thu_host($submithost,"","",$cmd,"","processfeatlist.out","processfeatlist.err","processfeatlist.id",$prevjid); +} else { + $qsubprefix = "proclist"; + &submit_or_exec_thu_host($submithost,"","",$cmd,"","processfeatlist.out","processfeatlist.err","processfeatlist.id"); +} +chomp($jobid=`tail -n 1 processfeatlist.id`); +$prevjid = $jobid; +#---------------------------------------------------------------------------------- +## clear up tmp +&exit_submit_thu_host($submithost,"","","","processfeatlist.out","processfeatlist.err","processfeatlist.id","processfeatlist.id.pid",$prevjid); + +# we run moses to check validity of moses.ini and to obtain all the feature +# names +##COPIED# my $featlist = get_featlist_from_moses($___CONFIG); +##COPIED#$featlist = insert_ranges_to_featlist($featlist, $___RANGES); +##COPIED# +##COPIED## Mark which features are disabled: +##COPIED#if (defined $___ACTIVATE_FEATURES) { +##COPIED# my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES; +##COPIED# my %cnt; +##COPIED# for(my $i=0; $i{"names"}}); $i++) { +##COPIED# my $name = $featlist->{"names"}->[$i]; +##COPIED# $cnt{$name} = 0 if !defined $cnt{$name}; +##COPIED# $featlist->{"enabled"}->[$i] = $enabled{$name."_".$cnt{$name}}; +##COPIED# $cnt{$name}++; +##COPIED# } +##COPIED#} else { +##COPIED# # all enabled +##COPIED# for(my $i=0; $i{"names"}}); $i++) { +##COPIED# $featlist->{"enabled"}->[$i] = 1; +##COPIED# } +##COPIED#} +##COPIED# +##COPIED#print STDERR "MERT starting values and ranges for random generation:\n"; +##COPIED#for(my $i=0; $i{"names"}}); $i++) { +##COPIED# my $name = $featlist->{"names"}->[$i]; +##COPIED# my $val = $featlist->{"values"}->[$i]; +##COPIED# my $min = $featlist->{"mins"}->[$i]; +##COPIED# my $max = $featlist->{"maxs"}->[$i]; +##COPIED# my $enabled = $featlist->{"enabled"}->[$i]; 
+##COPIED# printf STDERR " %5s = %7.3f", $name, $val; +##COPIED# if ($enabled) { +##COPIED# printf STDERR " (%5.2f .. %5.2f)\n", $min, $max; +##COPIED# } else { +##COPIED# print STDERR " --- inactive, not optimized ---\n"; +##COPIED# } +##COPIED#} + +############################################################## +# if continue from last section ###### +###################################### +##!#if ($continue) { +##!# # getting the last finished step +##!# print STDERR "Trying to continue an interrupted optimization.\n"; +##!# open IN, "finished_step.txt" or die "Failed to find the step number, failed to read finished_step.txt"; +##!# my $step = ; +##!# chomp $step; +##!# close IN; +##!# +##!# print STDERR "Last finished step is $step\n"; +##!# +##!# # getting the first needed step +##!# my $firststep; +##!# if ($prev_aggregate_nbl_size==-1){ +##!# $firststep=1; +##!# } +##!# else{ +##!# $firststep=$step-$prev_aggregate_nbl_size+1; +##!# $firststep=($firststep>0)?$firststep:1; +##!# } +##!# +##!##checking if all needed data are available +##!## get $prev_feature_file, $prev_score_file, $prev_init_file +##!# if ($firststep<=$step){ +##!# print STDERR "First previous needed data index is $firststep\n"; +##!# print STDERR "Checking whether all needed data (from step $firststep to step $step) are available\n"; +##!# +##!# for (my $prevstep=$firststep; $prevstep<=$step;$prevstep++){ +##!# print STDERR "Checking whether data of step $prevstep are available\n"; +##!# if (! -e "run$prevstep.features.dat"){ +##!# die "Can't start from step $step, because run$prevstep.features.dat was not found!"; +##!# }else{ +##!# if (defined $prev_feature_file){ +##!# $prev_feature_file = "${prev_feature_file},run$prevstep.features.dat"; +##!# } +##!# else{ +##!# $prev_feature_file = "run$prevstep.features.dat"; +##!# } +##!# } +##!# if (! 
-e "run$prevstep.scores.dat"){ +##!# die "Can't start from step $step, because run$prevstep.scores.dat was not found!"; +##!# }else{ +##!# if (defined $prev_score_file){ +##!# $prev_score_file = "${prev_score_file},run$prevstep.scores.dat"; +##!# } +##!# else{ +##!# $prev_score_file = "run$prevstep.scores.dat"; +##!# } +##!# } +##!# if (! -e "run$prevstep.${weights_in_file}"){ +##!# die "Can't start from step $step, because run$prevstep.${weights_in_file} was not found!"; +##!# }else{ +##!# if (defined $prev_init_file){ +##!# $prev_init_file = "${prev_init_file},run$prevstep.${weights_in_file}"; +##!# } +##!# else{ +##!# $prev_init_file = "run$prevstep.${weights_in_file}"; +##!# } +##!# } +##!# } +##!# if (! -e "run$step.weights.txt"){ +##!# die "Can't start from step $step, because run$step.weights.txt was not found!"; +##!# } +##!# if (! -e "run$step.$mert_logfile"){ +##!# die "Can't start from step $step, because run$step.$mert_logfile was not found!"; +##!# } +##!# if (! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz"){ +##!# die "Can't start from step $step, because run$step.best$___N_BEST_LIST_SIZE.out.gz was not found!"; +##!# } +##!# print STDERR "All needed data are available\n"; +##!# +##!# print STDERR "Loading information from last step ($step)\n"; +##!# my %dummy; # sparse features +##!# ($bestpoint,$devbleu) = &get_weights_from_mert("run$step.$mert_outfile","run$step.$mert_logfile",scalar @{$featlist->{"names"}},\%dummy); +##!# die "Failed to parse mert.log, missed Best point there." 
+##!# if !defined $bestpoint || !defined $devbleu;
+##!# print "($step) BEST at $step $bestpoint => $devbleu at ".`date`;
+##!# my @newweights = split /\s+/, $bestpoint;
+##!#
+##!# # Sanity check: order of lambdas must match
+##!# sanity_check_order_of_lambdas($featlist,
+##!# "gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");
+##!#
+##!# # update my cache of lambda values
+##!# $featlist->{"values"} = \@newweights;
+##!# }
+##!# else{
+##!# print STDERR "No previous data are needed\n";
+##!# }
+##!#
+##!# $start_run = $step +1;
+##!# }
+#####################################
+## If continue from last section ####
+#####################################
+
+
+print STDERR "I am about to start main loop!!!\n";
+
+### load featlist when needed
+# my $featlist = get_featlist_from_moses($___CONFIG);
+# my $featlist = undef;
+###### MERT MAIN LOOP
+
+
+
+my $run=$start_run-1;   # incremented at the top of every loop pass, so the first pass is $start_run
+
+my $oldallsorted = undef;
+my $allsorted = undef;
+
+my $nbest_file=undef;
+my $lsamp_file=undef; #Lattice samples
+my $orig_nbest_file=undef; # replaced if lattice sampling
+
+
+
+# ------------------------------------------------------------------------
+# ------ get_featlist_and_create_config (only for run 1) -----------------
+# ------------------------------------------------------------------------
+$createconfigcmd="$SCRIPTS_ROOTDIR/training/sge-nosync/create-config-sge-nosync.pl" if !defined $createconfigcmd;
+
+$createconfigargs = "" if !defined $createconfigargs;
+$createconfigargs .= join("", map { " --range $_" } @$___RANGES) if (defined $___RANGES);   # "range=s@" stores an array ref: emit one --range per entry
+$createconfigargs = "$createconfigargs --decoder-flags $___DECODER_FLAGS" if ($___DECODER_FLAGS ne "");   # NOTE(review): flags are passed unquoted -- confirm the receiving script copes with multi-word values
+$createconfigargs = "$createconfigargs --devbleu $devbleu" if (defined $devbleu);
+$createconfigargs = "$createconfigargs --sparse_weights_file $sparse_weights_file" if (defined $sparse_weights_file);
+$createconfigargs = "$createconfigargs --working-dir $___WORKING_DIR" if (defined $___WORKING_DIR);
+
+my $cmd = "$createconfigcmd
$___DEV_F $___DECODER $___CONFIG first --inputtype $___INPUTTYPE $createconfigargs"; + +if (defined $prevjid) { + $qsubprefix = "firstcfg"; + &submit_or_exec_thu_host($submithost,"","",$cmd,"","createconfigfirstrun.out","createconfigfirstrun.err","createconfigfirstrun.id",$prevjid); +} else { + $qsubprefix = "firstcfg"; + &submit_or_exec_thu_host($submithost,"","",$cmd,"","createconfigfirstrun.out","createconfigfirstrun.err","createconfigfirstrun.id"); +} +chomp($jobid=`tail -n 1 createconfigfirstrun.id`); +$prevjid = $jobid; +## clear up tmp +&exit_submit_thu_host($submithost,"","","","createconfigfirstrun.out","createconfigfirstrun.err","createconfigfirstrun.id","createconfigfirstrun.id.pid",$prevjid); + +##COPIED# create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"),$sparse_weights_file); +# ------------------------------------------------------------------------- +# ------------------------------------------------------------------------- +print "create config for first run is done!\n"; + + +while(1) { + $run++; + if ($maximum_iterations && $run > $maximum_iterations) { + print "Maximum number of iterations exceeded - stopping\n"; + last; + } + # run beamdecoder with option to output nbestlists + # the end result should be (1) @NBEST_LIST, a list of lists; (2) @SCORE, a list of lists of lists + + print "run $run start at ".`date`; + + + + + ################################################### + # step 3: decode + ################################################### + +# path of script for running the decoder ------------ +$moses_parallel_cmd="$SCRIPTS_ROOTDIR/generic/moses-parallel-sge-nosync.pl" if !defined $moses_parallel_cmd; +my $decoder_cmd; + +if (defined $___JOBS && $___JOBS > 0 ) { + # not support -lattice-samples + # $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -queue-parameters \"$queue_flags\" -decoder-flags $___DECODER_FLAGS -lattice-samples 
$___LATTICE_SAMPLES -n-best-list-size $___N_BEST_LIST_SIZE -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER -run $run -need-to-normalize $need_to_normalize -working-dir $___WORKING_DIR -qsubwrapper $qsubwrapper > run$run.out"; + + $decoderargs = ""; + ### DO NOT pass $___DECODER_FLAGS!! load the latest decoding parameter from inside the script + # $decoderargs = "-decoder-flags $___DECODER_FLAGS " if (!$___DECODER_FLAGS eq ""); + $decoderargs = "-script-rootdir $SCRIPTS_ROOTDIR " if (!$SCRIPTS_ROOTDIR eq ""); + $decoderargs = "$decoderargs -lattice-samples $___LATTICE_SAMPLES " if ($___LATTICE_SAMPLES != 0); + + $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -submithost \"$submithost\" -queue-parameters \"$queue_flags\" $decoderargs -n-best-list-size $___N_BEST_LIST_SIZE -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER -run $run -need-to-normalize $need_to_normalize -working-dir $___WORKING_DIR -qsubwrapper $qsubwrapper -qsubwrapper-exit $qsubwrapper_exit > run$run.out"; +} else { + print STDERR "Execute without going through grid is not supported!\n"; + exit(1); +} + +if (defined $prevjid) { + $qsubprefix="decode$run"; + &submit_or_exec_thu_host($submithost,"","",$decoder_cmd,"","decode$run.out","decode$run.err","decode$run.id",$prevjid); +} else { + $qsubprefix="decode$run"; + &submit_or_exec_thu_host($submithost,"","",$decoder_cmd,"","decode$run.out","decode$run.err","decode$run.id"); +} +## may have to change to decode$run.last.id to retrieve later job id. +$need_to_normalize = 0; +chomp($jobid=`tail -n 1 decode$run.id`); +$prevjid = $jobid; + +# safesystem($decode_cmd) or die "The decoder died. 
CONFIG WAS $decoder_config \n"; + + ################################################### + # step 4: define wait job + ################################################### + $poll_decoder_cmd="$SCRIPTS_ROOTDIR/training/sge-nosync/poll-decoder.pl -poll-target decode$run.W.out"; + + if (defined $prevjid) { + $qsubprefix = "poll$run"; + &submit_or_exec_thu_host($submithost,"","",$poll_decoder_cmd,"","decode$run.POLL.out","decode$run.POLL.err","decode$run.POLL.id",$prevjid); + } else { + die "Step 4 (poll_decoder): Cannot find previous process for sequential submit\n"; + } + + chomp($jobid=`tail -n 1 decode$run.POLL.id`); + $prevjid = $jobid; + + ################################################## + # clear-up (step 3 and step 4) + ################################################## + # job for moses-parallel-sge-nosync.pl + &exit_submit_thu_host($submithost,"","","","decode$run.out","decode$run.err","decode$run.id","decode$run.id.pid",$prevjid); + ## waitall.sh job (cannot submit here because pid not established) + ##&exit_submit("decode$run.W.out","decode$run.W.err","decode$run.W.id","decode$run.W.id.pid",$prevjid); + # polling job + &exit_submit_thu_host($submithost,"","","","decode$run.POLL.out","decode$run.POLL.err","decode$run.POLL.id","decode$run.POLL.id.pid",$prevjid); + + + + ################################################### + # step 5: zip and extract + ################################################### + $zipextcmd="$SCRIPTS_ROOTDIR/training/sge-nosync/zipextract-decoder-result.pl" if !defined $zipextcmd; + $zipextargs=" -run $run"; + $zipextargs.=" -mertdir $mertdir" if defined $mertdir; + $zipextargs.=" $___DEV_E ./run$run.moses.ini"; + + $cmd = "$zipextcmd $zipextargs"; + + if (defined $prevjid){ + $qsubprefix="zip$run"; + &submit_or_exec_thu_host($submithost,"","",$cmd,"","zipext$run.out","zipext$run.err","zipext$run.id",$prevjid); + } else { + die "Step 5 (zip extract): Cannot find previous process for sequential submit\n"; + } + chomp($jobid=`tail -n 1 
zipext$run.id`); + $prevjid = $jobid; + + ## clear-up tmp ### + &exit_submit_thu_host($submithost,"","","","zipext$run.out","zipext$run.err","zipext$run.id","zipext$run.id.pid",$prevjid); + + #################################################### + # step 6: process results (mert) + #################################################### + $processresultcmd="$SCRIPTS_ROOTDIR/training/sge-nosync/process-moses-result-sge-nosync.pl" if !defined $processresultcmd; + $processresultargs=" -run $run -submithost $submithost -queue-flags \"$queue_flags\""; + $processresultargs.=" -inputtype $___INPUTTYPE"; + $processresultargs.=" -mertdir $mertdir" if defined $mertdir; +# $processresultargs.=" $___DEV_F $___DECODER ./run$run.moses.ini"; + $processresultargs.=" $___DEV_F $___DECODER $___CONFIG"; + + $cmd = "$processresultcmd $processresultargs"; + + if (defined $prevjid){ + $qsubprefix="MERT$run"; + &submit_or_exec_thu_host($submithost,"","",$cmd,"","processmoses$run.out","processmoses$run.err","processmoses$run.id",$prevjid); + } else { + die "Step 6 (process results - (mert)): Cannot find previous process for sequential submit\n"; + } + chomp($jobid=`tail -n 1 processmoses$run.id`); + $prevjid = $jobid; + + ## clear-up tmp ## + &exit_submit_thu_host($submithost,"","","","processmoses$run.out","processmoses$run.err","processmoses$run.id","processmoses$run.id.pid",$prevjid); + + ### sleep to prevent batch select in short time + sleep(1); + +##!# else { +##!# $nbest_file="run$run.best$___N_BEST_LIST_SIZE.out.gz"; +##!# print "skipped decoder run $run\n"; +##!# $skip_decoder = 0; +##!# $need_to_normalize = 0; +##!# } +} + + +################################################# +# step 7: write out the final files +################################################# +$createconfigcmd="$SCRIPTS_ROOTDIR/training/sge-nosync/create-config-sge-nosync.pl" if !defined $createconfigcmd; + +$createconfigargs = "" if !defined $createconfigargs; +$createconfigargs = "$createconfigargs --range 
$___RANGES" if (defined $___RANGES);
# The flags are quoted so that a multi-word value reaches the config script as
# ONE --decoder-flags argument (step 6 quotes -queue-flags the same way).
$createconfigargs = "$createconfigargs --decoder-flags \"$___DECODER_FLAGS\"" if ($___DECODER_FLAGS ne "");
$createconfigargs = "$createconfigargs --devbleu $devbleu" if (defined $devbleu);
# create-config-sge-nosync.pl declares this option as "sparse_weight_file=s",
# so the singular spelling is the one its GetOptions call accepts.
$createconfigargs = "$createconfigargs --sparse_weight_file $sparse_weights_file" if (defined $sparse_weights_file);
$createconfigargs = "$createconfigargs --working-dir $___WORKING_DIR" if (defined $___WORKING_DIR);

my $cmd = "$createconfigcmd $___DEV_F $___DECODER $___CONFIG_ORIG final --inputtype $___INPUTTYPE $createconfigargs";

if (defined $prevjid) {
  $qsubprefix = "finalcfg";
  &submit_or_exec_thu_host($submithost,"","",$cmd,"","createconfigfinal.out","createconfigfinal.err","createconfigfinal.id",$prevjid);
} else {
  $qsubprefix = "finalcfg";
  &submit_or_exec_thu_host($submithost,"","",$cmd,"","createconfigfinal.out","createconfigfinal.err","createconfigfinal.id");
}
chomp($jobid=`tail -n 1 createconfigfinal.id`);
$prevjid = $jobid;
# clear up tmp
&exit_submit_thu_host($submithost,"","","","createconfigfinal.out","createconfigfinal.err","createconfigfinal.id","createconfigfinal.id.pid",$prevjid);
## -------------------------------------------------------------------------
print "create config for last run is done!\n";


###### STEP 8: clear job ######
my $clearcmd = "";
$clearcmd = "$SCRIPTS_ROOTDIR/training/sge-nosync/cleartmpfiles.pl";


if (defined $prevjid){
  $qsubprefix="Clearjob";
  &submit_or_exec_thu_host($submithost,"","",$clearcmd,"","clear.out","clear.err","clear.id",$prevjid);
} else {
  die "Step 8 (clear job): Cannot find previous process for sequential submit\n";
}
chomp($jobid=`tail -n 1 clear.id`);
$prevjid = $jobid;

## clear-up tmp ##
&exit_submit_thu_host($submithost,"","","","clear.out","clear.err","clear.id","clear.id.pid",$prevjid);

print "Tuning finished at ".`date`;

chdir($cwd);

} # end of local scope

# Read the optimised weights back from the optimiser output.
#
#   $outfile        - weight file read in the pairwise-ranked / PRO case:
#                     "F<index> <value>" lines for regular features and
#                     "<name>_<suffix> <value>" lines for sparse features
#   $logfile        - log file scanned for "Best point: ... => <score>" otherwise
#   $weight_count   - number of regular weights; missing indices stay 0
#   $sparse_weights - hash ref that receives the sparse weights
#
# Returns ($bestpoint, $devbleu). In the PRO case the weights are divided by
# the sum of the absolute regular weights and $devbleu is "unknown". In the
# log-file case both values stay undef when no "Best point" line is found.
sub get_weights_from_mert {
  my ($outfile,$logfile,$weight_count,$sparse_weights) = @_;
  my ($bestpoint,$devbleu);
  if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)) {
    open(my $in, '<', $outfile) or die "Can't open $outfile";
    my @WEIGHT = (0) x $weight_count;
    my $sum = 0;
    while (my $line = <$in>) {
      # regular features
      if ($line =~ /^F(\d+) ([\-\.\de]+)/) {
        $WEIGHT[$1] = $2;
        $sum += abs($2);
      }
      # sparse features
      elsif ($line =~ /^(.+_.+) ([\-\.\de]+)/) {
        $$sparse_weights{$1} = $2;
      }
    }
    close $in;
    $devbleu = "unknown";

    my @sparse_names = keys %{$sparse_weights};
    # Normalising by a zero sum used to abort with a bare "Illegal division
    # by zero"; name the file instead so the failing run can be identified.
    die "Cannot normalize weights from $outfile: sum of absolute feature weights is zero"
      if $sum == 0 && (@WEIGHT || @sparse_names);
    foreach (@WEIGHT) { $_ /= $sum; }
    foreach (@sparse_names) { $$sparse_weights{$_} /= $sum; }
    $bestpoint = join(" ",@WEIGHT);
  }
  else {
    open(my $in, '<', $logfile) or die "Can't open $logfile";
    while (my $line = <$in>) {
      if ($line =~ /Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
        $bestpoint = $1;
        $devbleu = $2;
        last;
      }
    }
    close $in;
  }
  return ($bestpoint,$devbleu);
}

##COPIED# sub run_decoder {
##COPIED#     my ($featlist, $run, $need_to_normalize) = @_;
##COPIED#     my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
##COPIED#     my $filename = sprintf($filename_template, $run);
##COPIED#     my $lsamp_filename = undef;
##COPIED#     if ($___LATTICE_SAMPLES) {
##COPIED#       my $lsamp_filename_template = "run%d.lsamp$___LATTICE_SAMPLES.out";
##COPIED#       $lsamp_filename = sprintf($lsamp_filename_template, $run);
##COPIED#     }
##COPIED#
##COPIED#     # user-supplied parameters
##COPIED#     print "params = $___DECODER_FLAGS\n";
##COPIED#
##COPIED#     # parameters to set all model weights (to override moses.ini)
##COPIED#     my @vals = @{$featlist->{"values"}};
##COPIED#     if ($need_to_normalize) {
##COPIED#       print STDERR "Normalizing lambdas: @vals\n";
##COPIED#       my $totlambda=0;
##COPIED#       grep($totlambda+=abs($_),@vals);
##COPIED#       grep($_/=$totlambda,@vals);
##COPIED#     }
##COPIED#     # moses now does not seem accept "-tm X -tm Y" but needs "-tm X Y"
##COPIED#     my %model_weights;
##COPIED#     for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
##COPIED#       my $name =
$featlist->{"names"}->[$i]; +##COPIED# $model_weights{$name} = "-$name" if !defined $model_weights{$name}; +##COPIED# $model_weights{$name} .= sprintf " %.6f", $vals[$i]; +##COPIED# } +##COPIED# my $decoder_config = join(" ", values %model_weights); +##COPIED# $decoder_config .= " -weight-file run$run.sparse-weights" if -e "run$run.sparse-weights"; +##COPIED# print STDERR "DECODER_CFG = $decoder_config\n"; +##COPIED# print "decoder_config = $decoder_config\n"; +##COPIED# +##COPIED# +##COPIED# # run the decoder +##COPIED# my $decoder_cmd; +##COPIED# my $lsamp_cmd = ""; +##COPIED# if ($___LATTICE_SAMPLES) { +##COPIED# $lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES "; +##COPIED# } +##COPIED# +##COPIED# ####### RUN moses_parallel ################################# +##COPIED# # need to gather: +##COPIED# ### $run +##COPIED# ### $lsamp_cmd +##COPIED# ### $filename +##COPIED# ########################################################### +##COPIED# +##COPIED# if (defined $___JOBS && $___JOBS > 0) { +##COPIED# $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; +##COPIED# } else { +##COPIED# $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out"; +##COPIED# } +##COPIED# +##COPIED# safesystem($decoder_cmd) or die "The decoder died. 
CONFIG WAS $decoder_config \n"; +##COPIED# +##COPIED# sanity_check_order_of_lambdas($featlist, $filename); +##COPIED# return ($filename, $lsamp_filename); +##COPIED# } + + +##COPIED# sub insert_ranges_to_featlist { +##COPIED# my $featlist = shift; +##COPIED# my $ranges = shift; +##COPIED# +##COPIED# $ranges = [] if !defined $ranges; +##COPIED# +##COPIED# # first collect the ranges from options +##COPIED# my $niceranges; +##COPIED# foreach my $range (@$ranges) { +##COPIED# my $name = undef; +##COPIED# foreach my $namedpair (split /,/, $range) { +##COPIED# if ($namedpair =~ /^(.*?):/) { +##COPIED# $name = $1; +##COPIED# $namedpair =~ s/^.*?://; +##COPIED# die "Unrecognized name '$name' in --range=$range" +##COPIED# if !defined $ABBR2FULL{$name}; +##COPIED# } +##COPIED# my ($min, $max) = split /\.\./, $namedpair; +##COPIED# die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/; +##COPIED# die "Bad max '$max' in --range=$range" if $min !~ /^-?[0-9.]+$/; +##COPIED# die "No name given in --range=$range" if !defined $name; +##COPIED# push @{$niceranges->{$name}}, [$min, $max]; +##COPIED# } +##COPIED# } +##COPIED# +##COPIED# # now populate featlist +##COPIED# my $seen = undef; +##COPIED# for(my $i=0; $i{"names"}}); $i++) { +##COPIED# my $name = $featlist->{"names"}->[$i]; +##COPIED# $seen->{$name} ++; +##COPIED# my $min = 0.0; +##COPIED# my $max = 1.0; +##COPIED# if (defined $niceranges->{$name}) { +##COPIED# my $minmax = shift @{$niceranges->{$name}}; +##COPIED# ($min, $max) = @$minmax if defined $minmax; +##COPIED# } +##COPIED# $featlist->{"mins"}->[$i] = $min; +##COPIED# $featlist->{"maxs"}->[$i] = $max; +##COPIED# } +##COPIED# return $featlist; +##COPIED# } + +##COPIED# sub sanity_check_order_of_lambdas { +##COPIED# my $featlist = shift; +##COPIED# my $filename_or_stream = shift; +##COPIED# +##COPIED# my @expected_lambdas = @{$featlist->{"names"}}; +##COPIED# my @got = get_order_of_scores_from_nbestlist($filename_or_stream); +##COPIED# die "Mismatched 
lambdas. Decoder returned @got, we expected @expected_lambdas"
##COPIED#     if "@got" ne "@expected_lambdas";
##COPIED# }

#### get_featlist_from_moses(): also used in process-featlist-sge-nosync.pl #####
###########################################################################
# Run moses with the given config file and return the list of features and
# their initial values.
#
#   $configfn - decoder config passed to moses via -config
#
# The output of "moses -show-weights" is cached in ./features.list; when that
# file already exists it is reused and the decoder is not run. Each line must
# look like "<long name> <feature> <value>"; lines whose value is "sparse" are
# skipped. Returns {"names" => [...], "values" => [...]}. Dies on a malformed
# line and exits with status 1 after printing every bad value / unknown
# feature found.
sub get_featlist_from_moses {
  my $configfn = shift;
  my $featlistfn = "./features.list";
  if (-e $featlistfn) {
    print STDERR "Using cached features list: $featlistfn\n";
  } else {
    # report the config that is actually handed to the decoder below
    print STDERR "Asking moses for feature names and values from $configfn\n";
    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
    safesystem($cmd) or die "Failed to run moses with the config $configfn";
  }

  # read feature list
  my @names = ();
  my @startvalues = ();
  open(my $ini, '<', $featlistfn) or die "Can't read $featlistfn";
  my $nr = 0;
  my @errs = ();
  while (my $line = <$ini>) {
    $nr++;
    chomp $line;
    $line =~ /^(.+) (\S+) (\S+)$/ || die("invalid feature: $line");
    my ($longname, $feature, $value) = ($1,$2,$3);
    next if $value eq "sparse";
    # The historic character-class check is kept as is; the second pattern
    # additionally accepts a signed exponent such as 1e-05, which the first
    # one rejects because of the embedded sign.
    push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
      if $value !~ /^[+-]?[0-9.e]+$/
         && $value !~ /^[+-]?[0-9.]+e[+-]?[0-9]+$/i;
    push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
      if !defined $ABBR2FULL{$feature};
    push @names, $feature;
    push @startvalues, $value;
  }
  close $ini;
  if (scalar @errs) {
    print STDERR join("", @errs);
    exit 1;
  }
  return {"names"=>\@names, "values"=>\@startvalues};
}
#### get_featlist_from_moses() ends #####################################

# Read the first line of an n-best list and interpret its
#   ||| label: num num num label2: num |||
# column. Returns the score labels in order, one entry per score; scores that
# follow a sparse feature name ("<x>_<y>:") are ignored.
sub get_order_of_scores_from_nbestlist {
  my $fname_or_source = shift;
  # print STDERR "Peeking at the beginning of nbestlist to get order of scores: $fname_or_source\n";
  # NOTE(review): two-argument open is kept on purpose - the parameter name
  # suggests callers may pass a command pipe as well as a file name; confirm
  # before switching to the three-argument form.
  open(my $in, $fname_or_source) or die "Failed to get order of scores from nbestlist '$fname_or_source'";
  my $line = <$in>;
  close $in;
  die "Line empty in nbestlist '$fname_or_source'" if !defined $line;
  my ($sent, $hypo, $scores, $total) = split /\|\|\|/, $line;
  # a line with fewer than three |||-separated fields has no score column
  die "No scores in line: $line" if !defined $scores;
  $scores =~ s/^\s*|\s*$//g;
  die "No scores in line: $line" if $scores eq "";

  my @order = ();
  my $label = undef;
  my $sparse = 0; # we ignore sparse features here
  foreach my $tok (split /\s+/, $scores) {
    if ($tok =~ /.+_.+:/) {
      $sparse = 1;
    } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) {
      $label = $1;
    } elsif ($tok =~ /^-?[-0-9.e]+$/) {
      if (!$sparse) {
        # a score found, remember it
        die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
          if !defined $label;
        push @order, $label;
      }
      $sparse = 0;
    } else {
      die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
    }
  }
  print STDERR "The decoder returns the scores in this order: @order\n";
  return @order;
}

##COPIED#sub create_config {
##COPIED#   my $infn = shift; # source config
##COPIED#   my $outfn = shift; # where to save the config
##COPIED#   my $featlist = shift; # the lambdas we should write
##COPIED#   my $iteration = shift; # just for verbosity
##COPIED#   my $bleu_achieved = shift; # just for verbosity
##COPIED#   my $sparse_weights_file = shift; # only defined when optimizing sparse features
##COPIED#
##COPIED#   my %P; # the hash of all parameters we wish to override
##COPIED#
##COPIED#   # first convert the command line parameters to the hash
##COPIED#   { # ensure local scope of vars
##COPIED#     my $parameter=undef;
##COPIED#     print "Parsing --decoder-flags: |$___DECODER_FLAGS|\n";
##COPIED#     $___DECODER_FLAGS =~ s/^\s*|\s*$//;
##COPIED#     $___DECODER_FLAGS =~ s/\s+/ /;
##COPIED#     foreach (split(/ /,$___DECODER_FLAGS)) {
##COPIED#       if (/^\-([^\d].*)$/) {
##COPIED#         $parameter = $1;
##COPIED#         $parameter = $ABBR2FULL{$parameter} if
defined($ABBR2FULL{$parameter}); +##COPIED# } +##COPIED# else { +##COPIED# die "Found value with no -paramname before it: $_" +##COPIED# if !defined $parameter; +##COPIED# push @{$P{$parameter}},$_; +##COPIED# } +##COPIED# } +##COPIED# } +##COPIED# +##COPIED# # First delete all weights params from the input, we're overwriting them. +##COPIED# # Delete both short and long-named version. +##COPIED# for(my $i=0; $i{"names"}}); $i++) { +##COPIED# my $name = $featlist->{"names"}->[$i]; +##COPIED# delete($P{$name}); +##COPIED# delete($P{$ABBR2FULL{$name}}); +##COPIED# } +##COPIED# +##COPIED# # Convert weights to elements in P +##COPIED# for(my $i=0; $i{"names"}}); $i++) { +##COPIED# my $name = $featlist->{"names"}->[$i]; +##COPIED# my $val = $featlist->{"values"}->[$i]; +##COPIED# $name = defined $ABBR2FULL{$name} ? $ABBR2FULL{$name} : $name; +##COPIED# # ensure long name +##COPIED# push @{$P{$name}}, $val; +##COPIED# } +##COPIED# +##COPIED# if (defined($sparse_weights_file)) { +##COPIED# push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file; +##COPIED# } +##COPIED# +##COPIED# # create new moses.ini decoder config file by cloning and overriding the original one +##COPIED# open(INI,$infn) or die "Can't read $infn"; +##COPIED# delete($P{"config"}); # never output +##COPIED# print "Saving new config to: $outfn\n"; +##COPIED# open(OUT,"> $outfn") or die "Can't write $outfn"; +##COPIED# print OUT "# MERT optimized configuration\n"; +##COPIED# print OUT "# decoder $___DECODER\n"; +##COPIED# print OUT "# BLEU $bleu_achieved on dev $___DEV_F\n"; +##COPIED# print OUT "# We were before running iteration $iteration\n"; +##COPIED# print OUT "# finished ".`date`; +##COPIED# my $line = ; +##COPIED# while(1) { +##COPIED# last unless $line; +##COPIED# +##COPIED# # skip until hit [parameter] +##COPIED# if ($line !~ /^\[(.+)\]\s*$/) { +##COPIED# $line = ; +##COPIED# print OUT $line if $line =~ /^\#/ || $line =~ /^\s+$/; +##COPIED# next; +##COPIED# } +##COPIED# +##COPIED# # 
parameter name
##COPIED#     my $parameter = $1;
##COPIED#     $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter});
##COPIED#     print OUT "[$parameter]\n";
##COPIED#
##COPIED#     # change parameter, if new values
##COPIED#     if (defined($P{$parameter})) {
##COPIED#       # write new values
##COPIED#       foreach (@{$P{$parameter}}) {
##COPIED#         print OUT $_."\n";
##COPIED#       }
##COPIED#       delete($P{$parameter});
##COPIED#       # skip until new parameter, only write comments
##COPIED#       while($line = <INI>) {
##COPIED#         print OUT $line if $line =~ /^\#/ || $line =~ /^\s+$/;
##COPIED#         last if $line =~ /^\[/;
##COPIED#         last unless $line;
##COPIED#       }
##COPIED#       next;
##COPIED#     }
##COPIED#
##COPIED#     # unchanged parameter, write old
##COPIED#     while($line = <INI>) {
##COPIED#       last if $line =~ /^\[/;
##COPIED#       print OUT $line;
##COPIED#     }
##COPIED#   }
##COPIED#
##COPIED#   # write all additional parameters
##COPIED#   foreach my $parameter (keys %P) {
##COPIED#     print OUT "\n[$parameter]\n";
##COPIED#     foreach (@{$P{$parameter}}) {
##COPIED#       print OUT $_."\n";
##COPIED#     }
##COPIED#   }
##COPIED#
##COPIED#   close(INI);
##COPIED#   close(OUT);
##COPIED#   print STDERR "Saved: $outfn\n";
##COPIED#}

# Run a command with system(), echoing it to STDERR first.
# Returns true when the command exits with status 0 and false otherwise (the
# non-zero exit code is printed). Terminates the whole script with exit(1)
# when the command cannot be started or is killed by a signal.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    print STDERR "Failed to execute: @_\n $!\n";
    exit(1);
  }
  elsif ($? & 127) {
    # The command text is passed as an argument, not spliced into the format:
    # a '%' inside the command would otherwise be read as a conversion.
    printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
      "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

# Turn a path into a cleaned-up absolute path.
# The first "/nfsmnt" component is removed. A path that is already absolute is
# then returned unchanged. Otherwise the current directory (from `pawd`,
# falling back to `pwd`) is prepended, "/./", repeated slashes and
# "<dir>/../" pairs are collapsed (at most 10 rounds), and trailing slashes
# are dropped.
sub ensure_full_path {
  my $PATH = shift;
  $PATH =~ s/\/nfsmnt//;
  return $PATH if $PATH =~ /^\//;
  my $dir = `pawd 2>/dev/null`;
  if(!$dir){$dir = `pwd`;}
  chomp($dir);
  $PATH = $dir."/".$PATH;
  $PATH =~ s/[\r\n]//g;
  $PATH =~ s/\/\.\//\//g;
  $PATH =~ s/\/+/\//g;
  my $sanity = 0;   # bounds the number of "../" collapsing rounds
  while($PATH =~ /\/\.\.\// && $sanity++<10) {
    $PATH =~ s/\/+/\//g;
    $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
  }
  $PATH =~ s/\/[^\/]+\/\.\.$//;
  $PATH =~ s/\/+$//;
  $PATH =~ s/\/nfsmnt//;   # the working directory may have re-introduced it
  return $PATH;
}

# Run a command locally or hand it to the qsub wrapper.
#
#   3 arguments ($cmd,$stdout,$stderr)                   - run locally
#   4 arguments ($cmd,$stdout,$stderr,$jidfile)          - submit a new job
#   5 arguments ($cmd,$stdout,$stderr,$jidfile,$prevjid) - submit, passing
#                                                          -prevjid to the wrapper
#
# Submission only happens when $___JOBS is defined and positive; otherwise the
# command is run locally with its output redirected to $stdout / $stderr.
# Dies when the command or the submission fails, or on any other argument count.
sub submit_or_exec {
  my $argvlen = @_;
  # Any other count used to leave $cmd undefined and run an empty command.
  die "submit_or_exec: expected 3, 4 or 5 arguments, got $argvlen\n"
    if $argvlen < 3 || $argvlen > 5;
  my ($cmd,$stdout,$stderr,$jidfile,$prevjid) = @_;

  print STDERR "exec: $cmd\n";
  if (defined $___JOBS && $___JOBS > 0 && $argvlen >= 4) {
    my $submitcmd = "$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameters=\"$queue_flags\" -stdout=$stdout -stderr=$stderr -jidfile=$jidfile";
    $submitcmd .= " -prevjid=$prevjid" if $argvlen == 5;
    safesystem($submitcmd)
      or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)";
  } else {
    safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'.";
  }
}

sub exit_submit {

  my $argvlen = @_;
  my $cmd = undef;
  my $stdout = undef;
  my $stderr = undef;
  my $jidfile = undef;
  my $pidfile = undef;
  my $prevjid = undef;
  my $prevjidarraysize = 0;
  my @prevjidarray = ();
  my $pid = undef;
  my $qsubcmd="";
  my $hj="";

  #
if supply 4 arguments, then submit new job + # if supply 5 arguments, wait for the previous job to finish + if ($argvlen == 2) { + ($stdout,$stderr) = @_; + } elsif ($argvlen == 4){ + ($stdout,$stderr,$jidfile,$pidfile) = @_; + } elsif ($argvlen == 5){ + ($stdout,$stderr,$jidfile,$pidfile,$prevjid) = @_; + } + + # parse prevjid ######################## + $prevjid =~ s/^\s+|\s+$//g; + @prevjidarray = split(/\s+/,$prevjid); + $prevjidarraysize = scalar(@prevjidarray); + ######################################## + + + # print STDERR "exec: $stdout\n"; + + # read pid from file, and draft exit script ################## + chomp ($pid=`tail -n 1 $pidfile`); + open (OUT, ">exitjob$pid.sh"); + + my $scriptheader="\#\!/bin/bash\n\#\$ -S /bin/sh\n# Both lines are needed to invoke base\n#the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n"; + $scriptheader .="uname -a\n\n"; + $scriptheader .="cd $___WORKING_DIR\n\n"; + + print OUT $scriptheader; + + print OUT "if $qsubwrapper_exit -stdout=$stdout -stderr=$stderr -jidfile=$jidfile -pidfile=$pidfile > exitjob$pid.out 2> exitjob$pid.err ; then + echo 'succeeded' +else + echo failed with exit status \$\? + die=1 +fi +"; + print OUT "\n\n"; + + close (OUT); + # setting permissions of the script + chmod(oct(755),"exitjob$pid.sh"); + ############################################################## + + + if (defined $___JOBS && $___JOBS > 0 && $argvlen==5) { + if (defined $prevjid && $prevjid!=-1 && $prevjidarraysize == 1){ + $hj = "-hold_jid $prevjid"; + } elsif (defined $prevjid && $prevjidarraysize > 1){ + $hj = "-hold_jid " . 
join(" -hold_jid ", @prevjidarray);
    }
    $qsubcmd="qsub $queue_flags -o /dev/null -e /dev/null -V $hj exitjob$pid.sh > exitjob$pid.log 2>&1";
    safesystem($qsubcmd) or die "ERROR: Failed to exit-submit $pid (via $qsubwrapper_exit)";
  } elsif (defined $___JOBS && $___JOBS > 0 && $argvlen==4) {
    $qsubcmd="qsub $queue_flags -o /dev/null -e /dev/null -V exitjob$pid.sh > exitjob$pid.log 2>&1";
    safesystem($qsubcmd) or die "ERROR: Failed to exit-submit $pid (via $qsubwrapper_exit)";
  } else {
    safesystem("rm $stdout") or die "ERROR: Failed to remove '$stdout'.";
    safesystem("rm $stderr") or die "ERROR: Failed to remove '$stderr'.";
  }
}







# Write "$outdir/extractor.sh", a bash script that changes into $outdir and
# runs $cmd, make it executable, and return its path.
# Dies when the script cannot be written or made executable.
# (The former empty "()" prototype declared a sub taking no arguments although
# two are read from @_, so it has been removed.)
sub create_extractor_script
{
  my ($cmd, $outdir) = @_;
  my $script_path = File::Spec->catfile($outdir, "extractor.sh");

  open my $out, '>', $script_path
    or die "Couldn't open $script_path for writing: $!\n";
  print $out "#!/bin/bash\n";
  print $out "cd $outdir\n";
  print $out "$cmd\n";
  # buffered write errors only surface when the handle is closed
  close($out)
    or die "Couldn't write $script_path: $!\n";

  # list form: the path is never re-parsed by a shell
  system('chmod', '+x', $script_path) == 0
    or die "Couldn't make $script_path executable\n";

  return $script_path;
}


# Submit a command through the qsub wrapper on the submit host, over SSH.
#
#   8 arguments: ($submithost,$run,$idx,$cmd,$batch_and_join,$stdout,$stderr,$jidfile)
#                submit a new job
#   9 arguments: the same plus $prevjid, which is passed to the wrapper as -prevjid
#
# The job name is $qsubprefix (or $stdout when the prefix is empty) followed
# by $idx. The wrapper is started from $___WORKING_DIR on the submit host.
# $run is accepted for call compatibility but not used here.
sub submit_or_exec_thu_host {
  my $argvlen = @_;
  # Only the 8- and 9-argument forms build a wrapper command; any other count
  # (including the 7-argument "exec without submit" form the old comment
  # mentioned) used to send an empty command to the submit host.
  die "submit_or_exec_thu_host: expected 8 or 9 arguments, got $argvlen\n"
    if $argvlen != 8 && $argvlen != 9;
  my ($submithost,$run,$idx,$cmd,$batch_and_join,$stdout,$stderr,$jidfile,$prevjid) = @_;

  # Password-based login, kept for reference:
  # chomp($my_username = `whoami`);
  # $ssh->login("$my_username",`cat /home/$my_username/accpw`);
  my $ssh = Net::OpenSSH::Compat::Perl->new($submithost, debug=>0);

  print STDERR "submithost = $submithost\n";
  print STDERR "qusbwrapper at = $qsubwrapper\n";

  $ssh->login();

  #### extra for mert-moses-sge-nosync.pl ####
  my $qsubname = ($qsubprefix eq "") ? $stdout : $qsubprefix;
  my $queueparameters = $queue_flags;
  ############################################

  my $qsubwrapcmd = "$qsubwrapper -command='$cmd' -queue-parameter=\"$queueparameters $batch_and_join\" -qsub-prefix='$qsubname$idx' -stdout=$stdout -stderr=$stderr -jidfile=$jidfile";
  $qsubwrapcmd .= " -prevjid='$prevjid'" if $argvlen == 9;

  print STDERR "Executing $qsubwrapcmd in $___WORKING_DIR\n";
  $ssh->cmd("cd $___WORKING_DIR && $qsubwrapcmd");

}

sub exit_submit_thu_host {

  my $argvlen = @_;
  my $submithost = undef;
  my $run = -1;
  my $idx = "";
  my $batch_and_join = "";
  my $my_username = undef;
  my $cmd = undef;
  my $stdout = undef;
  my $stderr = undef;
  my $jidfile = undef;
  my $pidfile = undef;
  my $prevjid = undef;
  my $prevjidarraysize = 0;
  my @prevjidarray = ();
  my $pid = undef;
  my $qsubcmd="";
  my $hj="";

  # if supply 8 arguments, then
submit new job + # if supply 9 arguments, wait for the previous job to finish + if ($argvlen == 6){ + ($submithost,$run,$idx,$batch_and_join,$stdout,$stderr) = @_; + } elsif ($argvlen == 8){ + ($submithost,$run,$idx,$batch_and_join,$stdout,$stderr,$jidfile,$pidfile) = @_; + } elsif ($argvlen == 9){ + ($submithost,$run,$idx,$batch_and_join,$stdout,$stderr,$jidfile,$pidfile,$prevjid) = @_; + } + + # parse prevjid ######################## + $prevjid =~ s/^\s+|\s+$//g; + @prevjidarray = split(/\s+/,$prevjid); + $prevjidarraysize = scalar(@prevjidarray); + ######################################## + + #### extra for mert-moses-sge-nosync.pl #### + my $queueparameters = $queue_flags; + ############################################ + + + + # print STDERR "exec: $stdout\n"; + + # read pid from file, and draft exit script ################## + chomp ($pid=`tail -n 1 $pidfile`); + open (OUT, ">exitjob$pid.sh"); + + my $scriptheader="\#\!/bin/bash\n\#\$ -S /bin/sh\n# Both lines are needed to invoke base\n#the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n"; + $scriptheader .="uname -a\n\n"; + $scriptheader .="cd $___WORKING_DIR\n\n"; + + print OUT $scriptheader; + + print OUT "if $qsubwrapper_exit -submithost=$submithost -stdout=$stdout -stderr=$stderr -jidfile=$jidfile -pidfile=$pidfile > exitjob$pid.out 2> exitjob$pid.err ; then + echo 'succeeded' +else + echo failed with exit status \$\? 
+ die=1 +fi +"; + print OUT "\n\n"; + + close (OUT); + # setting permissions of the script + chmod(oct(755),"exitjob$pid.sh"); + ############################################################## + + # log in submit host ######################################### + # chomp($my_username = `whoami`); + # my $ssh = Net::OpenSSH::Compat::Perl->new($submithost, debug=>0); + # + # print STDERR "submithost = $submithost\n"; + # print STDERR "my username = $my_username\n"; + # print STDERR "qusbwrapper at = $qsubwrapper\n"; + # + # $ssh->login("$my_username",`cat /home/$my_username/accpw`); + ############################################################## + my $ssh = Net::OpenSSH::Compat::Perl->new($submithost, debug=>0); + + print STDERR "submithost = $submithost\n"; + print STDERR "qusbwrapper at = $qsubwrapper\n"; + + $ssh->login(); + + + if ($argvlen==9) { + if (defined $prevjid && $prevjid!=-1 && $prevjidarraysize == 1){ + $hj = "-hold_jid $prevjid"; + } elsif (defined $prevjid && $prevjidarraysize > 1){ + $hj = "-hold_jid " . 
join(" -hold_jid ", @prevjidarray); + } + $qsubcmd="qsub $queueparameters -o /dev/null -e /dev/null $hj exitjob$pid.sh > exitjob$pid.log 2>&1"; + $ssh->cmd("cd $___WORKING_DIR && $qsubcmd"); + } elsif ($argvlen==8) { + $qsubcmd="qsub $queueparameters -o /dev/null -e /dev/null exitjob$pid.sh > exitjob$pid.log 2>&1"; + $ssh->cmd("cd $___WORKING_DIR && $qsubcmd"); + } + print STDERR "Executing $qsubcmd in $___WORKING_DIR\n"; + +} + + diff --git a/contrib/mert-sge-nosync/training/sge-nosync/cleartmpfiles.pl b/contrib/mert-sge-nosync/training/sge-nosync/cleartmpfiles.pl new file mode 100755 index 000000000..4e5e7d046 --- /dev/null +++ b/contrib/mert-sge-nosync/training/sge-nosync/cleartmpfiles.pl @@ -0,0 +1,64 @@ +#!/usr/bin/perl + + + my @filename_id = ""; + my $this_id = ""; + + # remove exitjob and forceexitjob + chomp(my @rddfile_list = `ls exitjob* forceexitjob*`); + foreach my $rddfile (@rddfile_list) { + unlink("$rddfile"); + } + + chomp(@filename_id = `ls *.id | grep -v 'clear'`); + open (OUT, "> all.id.all"); + print OUT "==Combine log at ".`date`; + print OUT `tail -n +1 *.id`; + print OUT "==LOG combined ".`date`; + close(OUT); + foreach $this_id (@filename_id) { + # print OUT `cat $this_id`; + unlink("$this_id"); + } + + chomp (@filename_id = `ls *.id.pid | grep -v 'clear'`); + open (OUT, "> all.id.pid.all"); + print OUT "==Combine log at ".`date`; + print OUT `tail -n +1 *.id.pid`; + print OUT "==Log combined ".`date`; + close(OUT); + foreach $this_id (@filename_id) { + # print OUT `cat $this_id`; + unlink("$this_id"); + } + + + + chomp(@filename_id = `ls *.out | grep -v 'clear'`); + open (OUT, "> all.out.all"); + print OUT "==Combine log at ".`date`; + print OUT `tail -n +1 *.out`; + print OUT "==Log combined ".`date`; + close(OUT); + foreach $this_id (@filename_id) { + # print OUT `cat $this_id`; + unlink("$this_id"); + } + + chomp(@filename_id = `ls *.err | grep -v 'clear'`); + open (OUT, "> all.err.all"); + print OUT "==Combine log at ".`date`; + print 
OUT `tail -n +1 *.err`; + print OUT "==Log combined ".`date`; + close(OUT); + foreach $this_id (@filename_id) { + # print OUT `cat $this_id`; + unlink("$this_id"); + } + + # waitall.sh which cannot be deleted inside moses-parallel-sge-nosync.pl + chomp(@filename_id = `ls *waitall.sh`); + foreach $this_id (@filename_id) { + unlink("$this_id"); + } + diff --git a/contrib/mert-sge-nosync/training/sge-nosync/create-config-sge-nosync.pl b/contrib/mert-sge-nosync/training/sge-nosync/create-config-sge-nosync.pl new file mode 100755 index 000000000..3f47cda82 --- /dev/null +++ b/contrib/mert-sge-nosync/training/sge-nosync/create-config-sge-nosync.pl @@ -0,0 +1,426 @@ +#!/usr/bin/perl -w + +# $Id$ +# after filter-mode-given-input.pl, process the feature list + +# original code by Philipp Koehn +# changes by Ondrej Bojar +# adapted for hierarchical models by Phil Williams + +use strict; + +use FindBin qw($Bin); +use Getopt::Long; + + + +my $SCRIPTS_ROOTDIR; +if (defined($ENV{"SCRIPTS_ROOTDIR"})) { + $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"}; +} else { + $SCRIPTS_ROOTDIR = $Bin; + if ($SCRIPTS_ROOTDIR eq '') { + $SCRIPTS_ROOTDIR = dirname(__FILE__); + } + $SCRIPTS_ROOTDIR =~ s/\/training$//; + $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR; +} + + +# moses.ini file uses FULL names for lambdas, while this training script +# internally (and on the command line) uses ABBR names. +my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w + g=weight-generation lex=weight-lex I=weight-i); +my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP; +my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP; + + + +my $verbose = 0; +my $usage = 0; # request for --help + + + + +##!# # consider phrases in input up to $MAX_LENGTH +##!# # in other words, all phrase-tables will be truncated at least to 10 words per +##!# # phrase. 
+##!# my $MAX_LENGTH = 10;
+
+# utilities
+##!# my $ZCAT = "gzip -cd";
+
+# get optional parameters
+##!# my $opt_hierarchical = 0;
+##!# my $binarizer = undef;
+##!# my $opt_min_non_initial_rule_count = undef;
+##!# my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
+
+my $___RANGES = undef;            # array ref filled by repeated --range options
+my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
+                                  # if undef work on all features
+                                  # (others are fixed to the starting values)
+my $___DECODER_FLAGS = "";        # additional parameters to pass to the decoder
+
+my $devbleu = undef;              # BLEU string written into the generated config header
+my $___WORKING_DIR = undef;       # directory holding run*.moses.ini and finished_step.txt
+my $___DEV_F = undef;             # dev input text (only echoed into the config header)
+my $run = undef;                  # either first or final
+my $runid_final = undef;          # last line of finished_step.txt
+my $runid_finalplus=0;            # $runid_final + 1
+my $sparse_weights_file = undef;
+
+
+# set 0 if input type is text, set 1 if input type is confusion network
+my $___INPUTTYPE = 0;
+
+my $___DECODER = undef; # required, pathname to the decoder executable
+my $___CONFIG = undef;  # required, pathname to startup ini file
+
+
+GetOptions(
+  "activate-features=s" => \$___ACTIVATE_FEATURES, #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
+  "range=s@" => \$___RANGES,
+  "decoder-flags=s" => \$___DECODER_FLAGS,
+  "inputtype=i" => \$___INPUTTYPE,
+  "devbleu=s" => \$devbleu,
+  "sparse_weight_file=s" => \$sparse_weights_file,
+  "working-dir=s" => \$___WORKING_DIR,
+  "help" => \$usage, # previously $usage could never become true
+) or exit(1);
+
+##!# GetOptions(
+##!# "gzip!" => \$opt_gzip,
+##!# "Hierarchical" => \$opt_hierarchical,
+##!# "Binarizer=s" => \$binarizer,
+##!# "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count
+##!# ) or exit(1);
+
+
+# the 4 required parameters are supplied on the command line directly,
+# after the --options
+if (scalar @ARGV == 4) {
+  # required parameters: input_file decoder_executable decoder.ini run_mode
+  $___DEV_F = shift;
+  $___DECODER = shift;
+  $___CONFIG = shift;
+  $run = shift; # first or final
+}
+
+if ($usage || !defined $___DECODER || !defined $___CONFIG) {
+  print STDERR "usage: $0 [options] input-text decoder-executable decoder.ini first|final
+Options:
+  --working-dir=DIR ... directory holding run*.moses.ini and finished_step.txt
+  --devbleu=STRING ... BLEU value recorded in the header of run1.moses.ini
+  --sparse_weight_file=FILE ... sparse weights file (relative to --working-dir)
+  --activate-features=STRING ... comma-separated list of features to optimize,
+                                 others are fixed to the starting values
+                                 default: optimize all features
+                                 example: tm_0,tm_4,d_0
+  --range=tm:0..1,-1..1 ... specify min and max value for some features
+                            --range can be repeated as needed.
+                            The order of the various --range specifications
+                            is important only within a feature name.
+                            E.g.:
+                              --range=tm:0..1,-1..1 --range=tm:0..2
+                            is identical to:
+                              --range=tm:0..1,-1..1,0..2
+                            but not to:
+                              --range=tm:0..2 --range=tm:0..1,-1..1
+  --decoder-flags=STRING ... extra parameters for the decoder
+  --inputtype=[0|1|2] ... Handle different input types: (0 for text,
+                          1 for confusion network, 2 for lattices,
+                          default is 0)
+";
+  exit 1;
+}
+
+
+
+##!# # get command line parameters
+##!# my $dir = shift;
+##!# my $config = shift;
+##!# my $input = shift;
+
+##!# $dir = ensure_full_path($dir);
+
+############################################################
+############################################################
+############################################################
+
+# main
+
+# we run moses to check validity of moses.ini and to obtain all the feature
+# names
+
+if ($run eq "first") {
+  # First pass: take the weights from the startup config and write run1.moses.ini.
+  my $featlist = get_featlist_from_moses($___CONFIG,$___CONFIG,"first");
+  $featlist = insert_ranges_to_featlist($featlist, $___RANGES);
+  create_config($___CONFIG,"$___WORKING_DIR/run1.moses.ini",$featlist,1,(defined $devbleu ? $devbleu : "--not-estimated--"),$sparse_weights_file);
+} else { # $run eq "final"
+  # Final pass: the last finished step N is the last line of finished_step.txt;
+  # run(N+1).moses.ini becomes run_final.moses.ini and is turned into moses.ini.
+  chomp ($runid_final = `cat $___WORKING_DIR/finished_step.txt | tail -n 1`);
+  $runid_finalplus = $runid_final + 1;
+  # Rename inside the working directory: every other path in this branch is
+  # anchored there, whereas the former shell `mv` silently used the cwd.
+  my $last_ini = "$___WORKING_DIR/run${runid_finalplus}.moses.ini";
+  my $final_ini = "$___WORKING_DIR/run_final.moses.ini";
+  if (!rename($last_ini, $final_ini)) {
+    # Tolerate a re-run where the rename already happened.
+    die "Can't rename $last_ini to $final_ini: $!" unless -e $final_ini;
+  }
+  # The BLEU value is the 3rd field on the 3rd line of the config header.
+  chomp ($devbleu = `cat $___WORKING_DIR/run_final.moses.ini | tail -n +3 | head -n 1 | gawk '{print \$3}'`);
+  my $featlist = get_featlist_from_moses($___CONFIG,"$___WORKING_DIR/run_final.moses.ini","final");
+  $featlist = insert_ranges_to_featlist($featlist, $___RANGES);
+  create_config($___CONFIG,"$___WORKING_DIR/moses.ini",$featlist,$runid_finalplus,$devbleu,$sparse_weights_file);
+}
+
+##COPIED## Mark which features are disabled:
+##COPIED#if (defined $___ACTIVATE_FEATURES) {
+##COPIED# my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES;
+##COPIED# my %cnt;
+##COPIED# for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+##COPIED# my $name = $featlist->{"names"}->[$i];
+##COPIED# $cnt{$name} = 0 if !defined $cnt{$name};
+##COPIED# $featlist->{"enabled"}->[$i] = $enabled{$name."_".$cnt{$name}};
+##COPIED# $cnt{$name}++;
+##COPIED# }
+##COPIED#} else {
+##COPIED# # all enabled
+##COPIED# for(my $i=0;
$i<scalar(@{$featlist->{"names"}}); $i++) {
+##COPIED# $featlist->{"enabled"}->[$i] = 1;
+##COPIED# }
+##COPIED#}
+##COPIED#
+##COPIED#print STDERR "MERT starting values and ranges for random generation:\n";
+##COPIED#for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+##COPIED# my $name = $featlist->{"names"}->[$i];
+##COPIED# my $val = $featlist->{"values"}->[$i];
+##COPIED# my $min = $featlist->{"mins"}->[$i];
+##COPIED# my $max = $featlist->{"maxs"}->[$i];
+##COPIED# my $enabled = $featlist->{"enabled"}->[$i];
+##COPIED# printf STDERR " %5s = %7.3f", $name, $val;
+##COPIED# if ($enabled) {
+##COPIED# printf STDERR " (%5.2f .. %5.2f)\n", $min, $max;
+##COPIED# } else {
+##COPIED# print STDERR " --- inactive, not optimized ---\n";
+##COPIED# }
+##COPIED#}
+
+
+
+
+
+# get_featlist_from_moses($configfn, $config_score, $run)
+#
+# Returns {"names" => [...], "values" => [...]} read from a cached feature
+# list file. If the cache file does not exist, the decoder is run with
+# -show-weights on $config_score to create it.
+#   $configfn     - startup config (accepted for interface compatibility; unused)
+#   $config_score - config file handed to the decoder
+#   $run          - "first" (./features.list) or "final" (./features.list.run_final)
+# Exits with status 1 if any feature has a bad value or an unknown name.
+sub get_featlist_from_moses {
+  my $configfn = shift;
+  my $config_score = shift;
+  my $run = shift;
+
+  my $featlistfn = "";
+  if ($run eq 'first') {
+    $featlistfn = "./features.list"; # given feature list
+  } elsif ($run eq "final") {
+    $featlistfn = "./features.list.run_final";
+  } else {
+    # An empty file name could only fail later with a confusing shell error.
+    die "Unknown run mode '$run' (expected 'first' or 'final')";
+  }
+  if (-e $featlistfn) {
+    print STDERR "Using cached features list: $featlistfn\n";
+  } else {
+    print STDERR "Asking moses for feature names and values from $config_score\n";
+    my $cmd = "$___DECODER $___DECODER_FLAGS -config $config_score -inputtype $___INPUTTYPE -show-weights > $featlistfn";
+    print STDERR "$cmd\n"; #DEBUG
+    safesystem($cmd) or die "Failed to run moses with the config $config_score";
+  }
+
+  # read feature list: each line is "<long name> <abbr> <value>"
+  my @names = ();
+  my @startvalues = ();
+  open(my $list_fh, '<', $featlistfn) or die "Can't read $featlistfn";
+  my $nr = 0;
+  my @errs = ();
+  while (<$list_fh>) {
+    $nr++;
+    chomp;
+    /^(.+) (\S+) (\S+)$/ || die("invalid feature: $_");
+    my ($longname, $feature, $value) = ($1,$2,$3);
+    next if $value eq "sparse";
+    # Accept a signed exponent too (e.g. 1e-05), which the old class rejected.
+    push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
+      if $value !~ /^[+-]?[0-9.e+\-]+$/;
+    push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
+      if !defined $ABBR2FULL{$feature};
+    push @names, $feature;
+    push @startvalues, $value;
+  }
+  close $list_fh;
+  if (scalar @errs) {
+    print STDERR join("", @errs);
+    exit 1;
+  }
+  return {"names"=>\@names, "values"=>\@startvalues};
+}
+
+
+# insert_ranges_to_featlist($featlist, $ranges)
+#
+# Adds "mins" and "maxs" arrays to $featlist (modified in place and returned).
+# $ranges is an array ref of --range strings such as "tm:0..1,-1..1"; a name
+# applies to the following unnamed pairs of the same string. The k-th range
+# given for a name goes to the k-th feature of that name; features without a
+# range get the default 0.0 .. 1.0.
+sub insert_ranges_to_featlist {
+  my $featlist = shift;
+  my $ranges = shift;
+
+  $ranges = [] if !defined $ranges;
+
+  # first collect the ranges from options
+  my $niceranges;
+  foreach my $range (@$ranges) {
+    my $name = undef;
+    foreach my $namedpair (split /,/, $range) {
+      if ($namedpair =~ /^(.*?):/) {
+        $name = $1;
+        $namedpair =~ s/^.*?://;
+        die "Unrecognized name '$name' in --range=$range"
+          if !defined $ABBR2FULL{$name};
+      }
+      my ($min, $max) = split /\.\./, $namedpair;
+      die "Bad min '".(defined $min ? $min : "")."' in --range=$range"
+        if !defined $min || $min !~ /^-?[0-9.]+$/;
+      # The original tested $min here, so a malformed max was never caught.
+      die "Bad max '".(defined $max ? $max : "")."' in --range=$range"
+        if !defined $max || $max !~ /^-?[0-9.]+$/;
+      die "No name given in --range=$range" if !defined $name;
+      push @{$niceranges->{$name}}, [$min, $max];
+    }
+  }
+
+  # now populate featlist
+  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+    my $name = $featlist->{"names"}->[$i];
+    my $min = 0.0;
+    my $max = 1.0;
+    if (defined $niceranges->{$name}) {
+      # consume the ranges of this name in the order they were given
+      my $minmax = shift @{$niceranges->{$name}};
+      ($min, $max) = @$minmax if defined $minmax;
+    }
+    $featlist->{"mins"}->[$i] = $min;
+    $featlist->{"maxs"}->[$i] = $max;
+  }
+  return $featlist;
+}
+
+# safesystem(@cmd)
+#
+# Runs @cmd through system(). Exits the script with status 1 if the command
+# could not be started or was killed by a signal; otherwise returns true
+# when the command's exit code was 0 and false when it was not.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+    print STDERR "Failed to execute: @_\n $!\n";
+    exit(1);
+  }
+  elsif ($? & 127) {
+    printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
+      ($? & 127), ($? & 128) ? 'with' : 'without';
+    exit(1);
+  }
+  else {
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
+
+
+
+# create_config($infn, $outfn, $featlist, $iteration, $bleu_achieved, $sparse_weights_file)
+#
+# Clones the moses.ini $infn into $outfn, overriding sections with
+#   - the parameters given in the global $___DECODER_FLAGS,
+#   - the weights in $featlist (these replace any weight flags), and
+#   - [weights-file] when $sparse_weights_file is defined.
+# Sections not present in the input are appended at the end.
+# Side effect: $___DECODER_FLAGS is whitespace-normalized in place.
+sub create_config {
+  my $infn = shift; # source config
+  my $outfn = shift; # where to save the config
+  my $featlist = shift; # the lambdas we should write
+  my $iteration = shift; # just for verbosity
+  my $bleu_achieved = shift; # just for verbosity
+  my $sparse_weights_file = shift; # only defined when optimizing sparse features
+
+  my %P; # the hash of all parameters we wish to override
+
+  # first convert the command line parameters to the hash
+  { # ensure local scope of vars
+    my $parameter=undef;
+    print "Parsing --decoder-flags: |$___DECODER_FLAGS|\n";
+    # /g is required: without it only the leading blanks were trimmed and only
+    # the first run of blanks was squeezed, yielding empty tokens below.
+    $___DECODER_FLAGS =~ s/^\s+|\s+$//g;
+    $___DECODER_FLAGS =~ s/\s+/ /g;
+    foreach (split(/ /,$___DECODER_FLAGS)) {
+      if (/^\-([^\d].*)$/) {
+        # "-name" starts a parameter; a leading "-<digit>" is a negative value
+        $parameter = $1;
+        $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter});
+      }
+      else {
+        die "Found value with no -paramname before it: $_"
+          if !defined $parameter;
+        push @{$P{$parameter}},$_;
+      }
+    }
+  }
+
+  # First delete all weights params from the input, we're overwriting them.
+  # Delete both short and long-named version.
+  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+    my $name = $featlist->{"names"}->[$i];
+    delete($P{$name});
+    delete($P{$ABBR2FULL{$name}});
+  }
+
+  # Convert weights to elements in P
+  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+    my $name = $featlist->{"names"}->[$i];
+    my $val = $featlist->{"values"}->[$i];
+    $name = defined $ABBR2FULL{$name} ? $ABBR2FULL{$name} : $name;
+    # ensure long name
+    push @{$P{$name}}, $val;
+  }
+
+  if (defined($sparse_weights_file)) {
+    push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file;
+  }
+
+  # create new moses.ini decoder config file by cloning and overriding the original one
+  open(my $ini_fh, '<', $infn) or die "Can't read $infn";
+  delete($P{"config"}); # never output
+  print "Saving new config to: $outfn\n";
+  open(my $out_fh, '>', $outfn) or die "Can't write $outfn";
+  print $out_fh "# MERT optimized configuration\n";
+  print $out_fh "# decoder $___DECODER\n";
+  print $out_fh "# BLEU $bleu_achieved on dev $___DEV_F\n";
+  print $out_fh "# We were before running iteration $iteration\n";
+  print $out_fh "# finished ".`date`;
+  my $line = <$ini_fh>;
+  while(1) {
+    last unless $line;
+
+    # skip until hit [parameter]
+    if ($line !~ /^\[(.+)\]\s*$/) {
+      $line = <$ini_fh>;
+      # $line is undef at end of file; only comments and blank lines are copied
+      print $out_fh $line
+        if defined $line && ($line =~ /^\#/ || $line =~ /^\s+$/);
+      next;
+    }
+
+    # parameter name
+    my $parameter = $1;
+    $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter});
+    print $out_fh "[$parameter]\n";
+
+    # change parameter, if new values
+    if (defined($P{$parameter})) {
+      # write new values
+      foreach (@{$P{$parameter}}) {
+        print $out_fh $_."\n";
+      }
+      delete($P{$parameter});
+      # skip until new parameter, only write comments
+      while($line = <$ini_fh>) {
+        print $out_fh $line if $line =~ /^\#/ || $line =~ /^\s+$/;
+        last if $line =~ /^\[/;
+        last unless $line;
+      }
+      next;
+    }
+    # unchanged parameter, write old
+    while($line = <$ini_fh>) {
+      last if $line =~ /^\[/;
+      print $out_fh $line;
+    }
+  }
+
+  # write all additional parameters
+  foreach my $parameter (keys %P) {
+    print $out_fh "\n[$parameter]\n";
+    foreach (@{$P{$parameter}}) {
+      print $out_fh $_."\n";
+    }
+  }
+
+  close($ini_fh);
+  close($out_fh) or die "Can't finish writing $outfn: $!";
+  print STDERR "Saved: $outfn\n";
+}
+
+
diff --git a/contrib/mert-sge-nosync/training/sge-nosync/moses-parallel-postdecode-sge-nosync.pl b/contrib/mert-sge-nosync/training/sge-nosync/moses-parallel-postdecode-sge-nosync.pl new file mode 100755 index 000000000..0edbe37db ---
/dev/null +++ b/contrib/mert-sge-nosync/training/sge-nosync/moses-parallel-postdecode-sge-nosync.pl @@ -0,0 +1,235 @@
+#!/usr/bin/perl
+
+# Post-decoding step of the no-sync parallel decoder: concatenates the
+# per-split outputs (1-best, logs, alignments, n-best) in the order given by
+# --idxliststr and then removes the temporary per-split files.
+
+# State shared with the subs below; filled in by init().
+my $logflag="";          # true when --logfile was given
+my $logfile="";
+my $alifile=undef;       # --alignment-output-file
+my $nbestflag=0;         # true when --nbestfile was given
+my $processid=0;         # --process-id, used to build the split file names
+my $idxliststr="";       # whitespace-separated list of split indices
+my $workingdir="";
+my $inputfile="";
+my $tmpdir="";
+my $splitpfx="";
+my $jobscript="";
+my $qsubout="";
+my $qsuberr="";
+my $nbestfile=undef;
+my $nbestlist=undef;
+my $outnbest="";
+my $lsamp_filename="";
+my @idxlist=();
+
+
+###############################
+# Script starts here
+
+init();
+
+
+
+#concatenating translations and removing temporary files
+concatenate_1best();
+concatenate_logs() if $logflag;
+concatenate_ali() if defined $alifile;
+concatenate_nbest() if $nbestflag;
+# NOTE(review): this reads the array @nbestlist, which nothing in this script
+# fills (only the scalar $nbestlist is declared), and safesystem() is not
+# defined in this file, so the call looks unreachable -- confirm before
+# relying on n-best output to stdout.
+safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-';
+
+
+print STDERR "Not support searchgraphflag for sync mert\n" if $searchgraphflag;
+# concatenate_searchgraph() if $searchgraphflag;
+# safesystem("cat searchgraph$$ >> /dev/stdout") if $searchgraphlist eq '-';
+
+# The word-graph warning was guarded by $searchgraphflag (copy-paste slip).
+print STDERR "Not support wordgraphflag for sync mert\n" if $wordgraphflag;
+# concatenate_wordgraph() if $wordgraphflag;
+# safesystem("cat wordgraph$$ >> /dev/stdout") if $wordgraphlist[0] eq '-';
+
+remove_temporary_files();
+####
+#### ### ending scripts in run_decoder() ##############
+#### sanity_check_order_of_lambdas($featlist, $filename);
+#### ## how to do return???
+#### return ($filename, $lsamp_filename);
+######################################################
+
+
+
+
+# Parses the command line into the file-level variables and derives the
+# working directory and the per-process file name prefixes from them.
+sub init(){
+  use strict;
+  use Getopt::Long qw(:config pass_through no_ignore_case permute);
+
+  GetOptions('alignment-output-file=s'=>\$alifile,
+             'process-id=s'=>\$processid,
+             'idxliststr=s'=>\$idxliststr,
+             'logfile=s'=>\$logfile,
+             'nbestfile=s'=>\$nbestfile,
+             'outnbest=s'=>\$outnbest,
+             'lsamp-filename=s'=>\$lsamp_filename,
+             'input-file=s'=>\$inputfile
+            ) or exit(1);
+
+  if ($logfile){ $logflag=1; }
+
+  if (defined $nbestfile) { $nbestflag=1; }
+
+  # the split indices arrive as one whitespace-separated string
+  $idxliststr =~ s/^\s+|\s+$//g;
+  @idxlist = split(/\s+/,$idxliststr);
+
+  my $pwdcmd = getPwdCmd();
+
+  $workingdir = `$pwdcmd`; chomp $workingdir;
+  $tmpdir="$workingdir/tmp$processid";
+  $splitpfx="split$processid";
+
+  $jobscript="$workingdir/job$processid";
+  $qsubout="$workingdir/out.job$processid";
+  $qsuberr="$workingdir/err.job$processid";
+
+  # print STDERR "$idxliststr\n";
+
+}
+
+
+# Merges the per-split n-best lists into $outnbest ("nbest<processid>" when
+# $outnbest is '-'). Sentence ids are renumbered with a running offset, and
+# every sentence that has no entry gets one placeholder line with an empty
+# translation and all-zero scores.
+sub concatenate_nbest(){
+  # id of the last sentence written; -1 so that a missing sentence 0 is padded
+  # as well (the former initial value "" compared as 0 and skipped it)
+  my $oldcode=-1;
+  my %inplength = ();
+  my $offset = 0;
+
+# get the list of feature and set a fictitious string with zero scores
+  open (IN, "${nbestfile}.${splitpfx}$idxlist[0]");
+  my $str = <IN>;
+  chomp($str);
+  close(IN);
+  my ($code,$trans,$featurescores,$globalscore)=split(/\|\|\|/,$str);
+
+  my $emptytrans = " ";
+  my $emptyglobalscore = " 0.0";
+  my $emptyfeaturescores = $featurescores;
+  $emptyfeaturescores =~ s/[-0-9\.]+/0/g;
+
+  if ($outnbest eq '-'){ $outnbest="nbest$processid"; }
+
+  # my $outnbest=$nbestlist[0];
+  # if ($nbestlist[0] eq '-'){ $outnbest="nbest$$"; }
+
+  open (OUT, "> $outnbest");
+  foreach my $idx (@idxlist){
+
+#computing the length of each input file
+    # print STDERR "this idx: $idx\n";
+
+    my @in=();
+    open (IN, "${inputfile}.${splitpfx}${idx}.trans");
+    @in=<IN>;
+    close(IN);
+    $inplength{$idx} = scalar(@in);
+
+    open (IN, "${nbestfile}.${splitpfx}${idx}");
+    while (<IN>){
+      my ($code,@extra)=split(/\|\|\|/,$_);
+      $code += $offset;
+      if ($code ne $oldcode){
+# if there is a jump between two consecutive codes
+# it means that an input sentence is not translated
+# fill this hole with a "fictitious" list of translation
+# comprising just one "empty translation" with zero scores
+        while ($code - $oldcode > 1){
+          $oldcode++;
+          print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
+        }
+      }
+      $oldcode=$code;
+      print OUT join("\|\|\|",($oldcode,@extra));
+    }
+    close(IN);
+    $offset += $inplength{$idx};
+
+    # pad sentences missing at the end of this split
+    while ($offset - $oldcode > 1){
+      $oldcode++;
+      print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
+    }
+  }
+  close(OUT);
+}
+
+
+# Writes the 1-best translations of all splits to STDOUT, in split order.
+sub concatenate_1best(){
+  foreach my $idx (@idxlist){
+    # print STDERR "reading 1best file ${inputfile}.${splitpfx}$idx.trans\n";
+    my @in=();
+    open (IN, "${inputfile}.${splitpfx}${idx}.trans");
+    @in=<IN>;
+    # print STDERR "in array is : @in";
+    # NOTE(review): "@in" joins the lines with a blank, so every line but the
+    # first of each split starts with an extra space; kept as is.
+    print STDOUT "@in";
+    close(IN);
+  }
+}
+
+# Concatenates the per-split qsub stdout files into $logfile.
+sub concatenate_logs(){
+  open (OUT, "> ${logfile}");
+  foreach my $idx (@idxlist){
+    my @in=();
+    open (IN, "$qsubout$idx");
+    @in=<IN>;
+    print OUT "@in";
+    close(IN);
+  }
+  close(OUT);
+}
+
+# Concatenates the per-split alignment files into $alifile.
+sub concatenate_ali(){
+  open (OUT, "> ${alifile}");
+  foreach my $idx (@idxlist){
+    my @in=();
+    open (IN, "$alifile.$splitpfx$idx");
+    @in=<IN>;
+    print OUT "@in";
+    close(IN);
+  }
+  close(OUT);
+}
+
+
+
+
+# look for the correct pwdcmd (pwd by default, pawd if it exists)
+# I assume that pwd always exists
+sub getPwdCmd(){
+  my $pwdcmd="pwd";
+  my $pawd_path;
+  # \$1 keeps the awk field reference away from Perl's own $1 interpolation
+  chomp($pawd_path=`which pawd 2>/dev/null | head -1 | awk '{print \$1}'`);
+  if ($pawd_path && -e $pawd_path){ $pwdcmd=$pawd_path; }
+  return $pwdcmd;
+}
+
+
+# Deletes the per-split input, output, job and log files and tries to remove
+# the temporary directory.
+# NOTE(review): $qsubname, @nbestlist and the searchgraph/wordgraph variables
+# are never assigned in this script, so those branches look inert -- confirm.
+sub remove_temporary_files(){
+  # removing temporary files
+  foreach my $idx (@idxlist){
+    unlink("${inputfile}.${splitpfx}${idx}.trans");
+    unlink("${inputfile}.${splitpfx}${idx}");
+    if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); }
+    if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); }
+    if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); }
+    if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); }
+
+    # print STDERR "Deleting ${jobscript}${idx}.bash\n";
+    unlink("${jobscript}${idx}.bash");
+    unlink("${jobscript}${idx}.log");
+    unlink("$qsubname.W.log");
+    unlink("$qsubout$idx");
+    unlink("$qsuberr$idx");
+    rmdir("$tmpdir");
+  }
+  # unlink("${jobscript}.sync_workaround_script.sh");
+  if ($nbestflag && $nbestlist[0] eq '-'){ unlink("${nbestfile}$$"); };
+  if ($searchgraphflag && $searchgraphlist eq '-'){ unlink("${searchgraphfile}$$"); };
+  if ($wordgraphflag && $wordgraphlist eq '-'){ unlink("${wordgraphfile}$$"); };
+}
+
+
+
+
+
diff --git a/contrib/mert-sge-nosync/training/sge-nosync/poll-decoder.pl b/contrib/mert-sge-nosync/training/sge-nosync/poll-decoder.pl new file mode 100755 index 000000000..54e9eceda --- /dev/null +++ b/contrib/mert-sge-nosync/training/sge-nosync/poll-decoder.pl @@ -0,0 +1,30 @@
+#!/usr/bin/perl
+
+# Blocks until the file given by --poll-target exists, checking every
+# 10 seconds; with --working-dir=DIR it changes into DIR first.
+
+use Getopt::Long qw(:config pass_through no_ignore_case permute);
+
+my $poll_target = undef;
+my $working_dir = undef;
+
+# 'working-dir' needs "=s": declared as a plain flag it was set to 1 and the
+# script tried to chdir("1").
+GetOptions('poll-target=s'=> \$poll_target,
+           'working-dir=s'=> \$working_dir
+          ) or exit(1);
+
+# Without a target the loop below could never terminate.
+die "usage: $0 --poll-target=FILE [--working-dir=DIR]\n" if !defined $poll_target;
+
+if (defined $working_dir) {
+  chdir($working_dir) or die "Can't chdir to $working_dir: $!";
+}
+
+print STDERR "Wait for file: $poll_target\n";
+
+while (1) {
+  if (-e $poll_target){
+    print STDERR "\n File found!!\n";
+    last;
+  } else {
+    sleep(10);
+    print STDERR ".";
+  }
+}
+
diff --git a/contrib/mert-sge-nosync/training/sge-nosync/process-featlist-sge-nosync.pl b/contrib/mert-sge-nosync/training/sge-nosync/process-featlist-sge-nosync.pl new file mode 100755 index 000000000..02b18526a --- /dev/null +++ b/contrib/mert-sge-nosync/training/sge-nosync/process-featlist-sge-nosync.pl @@ -0,0 +1,283 @@
+#!/usr/bin/perl -w
+
+# $Id$
+# after filter-mode-given-input.pl, process the feature list
+
+# original code by Philipp Koehn
+# changes by Ondrej Bojar
+# adapted for hierarchical models by Phil Williams
+
+use strict;
+
+use FindBin qw($Bin);
+use Getopt::Long;
+
+
+
+my
$SCRIPTS_ROOTDIR;
+if (defined($ENV{"SCRIPTS_ROOTDIR"})) {
+  $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"};
+} else {
+  $SCRIPTS_ROOTDIR = $Bin;
+  if ($SCRIPTS_ROOTDIR eq '') {
+    # dirname() lives in File::Basename, which this script never loaded
+    require File::Basename;
+    $SCRIPTS_ROOTDIR = File::Basename::dirname(__FILE__);
+  }
+  $SCRIPTS_ROOTDIR =~ s/\/training$//;
+  $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR;
+}
+
+
+# moses.ini file uses FULL names for lambdas, while this training script
+# internally (and on the command line) uses ABBR names.
+my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w
+  g=weight-generation lex=weight-lex I=weight-i);
+my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP;
+my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP;
+
+
+
+my $verbose = 0;
+my $usage = 0; # request for --help
+
+
+
+
+##!# # consider phrases in input up to $MAX_LENGTH
+##!# # in other words, all phrase-tables will be truncated at least to 10 words per
+##!# # phrase.
+##!# my $MAX_LENGTH = 10;
+
+# utilities
+##!# my $ZCAT = "gzip -cd";
+
+# get optional parameters
+##!# my $opt_hierarchical = 0;
+##!# my $binarizer = undef;
+##!# my $opt_min_non_initial_rule_count = undef;
+##!# my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
+
+my $___RANGES = undef;            # array ref filled by repeated --range options
+my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
+                                  # if undef work on all features
+                                  # (others are fixed to the starting values)
+my $___DECODER_FLAGS = "";        # additional parameters to pass to the decoder
+
+# set 0 if input type is text, set 1 if input type is confusion network
+my $___INPUTTYPE = 0;
+
+my $___DECODER = undef; # required, pathname to the decoder executable
+my $___CONFIG = undef;  # required, pathname to startup ini file
+
+
+GetOptions(
+  "activate-features=s" => \$___ACTIVATE_FEATURES, #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
+  "range=s@" => \$___RANGES,
+  "decoder-flags=s" => \$___DECODER_FLAGS,
+  "inputtype=i" => \$___INPUTTYPE,
+  "help" => \$usage, # previously $usage could never become true
+) or exit(1);
+
+##!# GetOptions(
+##!# "gzip!" => \$opt_gzip,
+##!# "Hierarchical" => \$opt_hierarchical,
+##!# "Binarizer=s" => \$binarizer,
+##!# "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count
+##!# ) or exit(1);
+
+
+# the 2 required parameters are supplied on the command line directly,
+# after the --options
+if (scalar @ARGV == 2) {
+  # required parameters: decoder_executable decoder.ini
+  $___DECODER = shift;
+  $___CONFIG = shift;
+}
+
+if ($usage || !defined $___DECODER || !defined $___CONFIG) {
+  print STDERR "usage: $0 \$___DECODER \$___CONFIG(decoder.ini)
+Options:
+  --activate-features=STRING ... comma-separated list of features to optimize,
+                                 others are fixed to the starting values
+                                 default: optimize all features
+                                 example: tm_0,tm_4,d_0
+  --range=tm:0..1,-1..1 ... specify min and max value for some features
+                            --range can be repeated as needed.
+                            The order of the various --range specifications
+                            is important only within a feature name.
+                            E.g.:
+                              --range=tm:0..1,-1..1 --range=tm:0..2
+                            is identical to:
+                              --range=tm:0..1,-1..1,0..2
+                            but not to:
+                              --range=tm:0..2 --range=tm:0..1,-1..1
+  --decoder-flags=STRING ... extra parameters for the decoder
+  --inputtype=[0|1|2] ... Handle different input types: (0 for text,
+                          1 for confusion network, 2 for lattices,
+                          default is 0)
+";
+  exit 1;
+}
+
+
+
+##!# # get command line parameters
+##!# my $dir = shift;
+##!# my $config = shift;
+##!# my $input = shift;
+
+##!# $dir = ensure_full_path($dir);
+
+############################################################
+############################################################
+############################################################
+
+# main
+
+# we run moses to check validity of moses.ini and to obtain all the feature
+# names
+my $featlist = get_featlist_from_moses($___CONFIG);
+$featlist = insert_ranges_to_featlist($featlist, $___RANGES);
+
+
+# Mark which features are disabled:
+if (defined $___ACTIVATE_FEATURES) {
+  # features are addressed as <name>_<k>, k counting the occurrences of <name>
+  my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES;
+  my %cnt;
+  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+    my $name = $featlist->{"names"}->[$i];
+    $cnt{$name} = 0 if !defined $cnt{$name};
+    $featlist->{"enabled"}->[$i] = $enabled{$name."_".$cnt{$name}};
+    $cnt{$name}++;
+  }
+} else {
+  # all enabled
+  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+    $featlist->{"enabled"}->[$i] = 1;
+  }
+}
+
+print STDERR "MERT starting values and ranges for random generation:\n";
+for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+  my $name = $featlist->{"names"}->[$i];
+  my $val = $featlist->{"values"}->[$i];
+  my $min = $featlist->{"mins"}->[$i];
+  my $max = $featlist->{"maxs"}->[$i];
+  my $enabled = $featlist->{"enabled"}->[$i];
+  printf STDERR " %5s = %7.3f", $name, $val;
+  if ($enabled) {
+    printf STDERR " (%5.2f .. %5.2f)\n", $min, $max;
+  } else {
+    print STDERR " --- inactive, not optimized ---\n";
+  }
+}
+
+
+
+
+
+# get_featlist_from_moses($configfn)
+#
+# Returns {"names" => [...], "values" => [...]} read from ./features.list.
+# If that file does not exist, the decoder is run with -show-weights on
+# $configfn to create it. Exits with status 1 if any feature has a bad value
+# or an unknown name.
+sub get_featlist_from_moses {
+  my $configfn = shift;
+  my $featlistfn = "./features.list";
+  if (-e $featlistfn) {
+    print STDERR "Using cached features list: $featlistfn\n";
+  } else {
+    print STDERR "Asking moses for feature names and values from $___CONFIG\n";
+    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
+    print STDERR "$cmd\n"; #DEBUG
+    safesystem($cmd) or die "Failed to run moses with the config $configfn";
+  }
+
+  # read feature list: each line is "<long name> <abbr> <value>"
+  my @names = ();
+  my @startvalues = ();
+  open(my $list_fh, '<', $featlistfn) or die "Can't read $featlistfn";
+  my $nr = 0;
+  my @errs = ();
+  while (<$list_fh>) {
+    $nr++;
+    chomp;
+    /^(.+) (\S+) (\S+)$/ || die("invalid feature: $_");
+    my ($longname, $feature, $value) = ($1,$2,$3);
+    next if $value eq "sparse";
+    # Accept a signed exponent too (e.g. 1e-05), which the old class rejected.
+    push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
+      if $value !~ /^[+-]?[0-9.e+\-]+$/;
+    push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
+      if !defined $ABBR2FULL{$feature};
+    push @names, $feature;
+    push @startvalues, $value;
+  }
+  close $list_fh;
+  if (scalar @errs) {
+    print STDERR join("", @errs);
+    exit 1;
+  }
+  return {"names"=>\@names, "values"=>\@startvalues};
+}
+
+
+# insert_ranges_to_featlist($featlist, $ranges)
+#
+# Adds "mins" and "maxs" arrays to $featlist (modified in place and returned).
+# $ranges is an array ref of --range strings such as "tm:0..1,-1..1"; a name
+# applies to the following unnamed pairs of the same string. The k-th range
+# given for a name goes to the k-th feature of that name; features without a
+# range get the default 0.0 .. 1.0.
+sub insert_ranges_to_featlist {
+  my $featlist = shift;
+  my $ranges = shift;
+
+  $ranges = [] if !defined $ranges;
+
+  # first collect the ranges from options
+  my $niceranges;
+  foreach my $range (@$ranges) {
+    my $name = undef;
+    foreach my $namedpair (split /,/, $range) {
+      if ($namedpair =~ /^(.*?):/) {
+        $name = $1;
+        $namedpair =~ s/^.*?://;
+        die "Unrecognized name '$name' in --range=$range"
+          if !defined $ABBR2FULL{$name};
+      }
+      my ($min, $max) = split /\.\./, $namedpair;
+      die "Bad min '".(defined $min ? $min : "")."' in --range=$range"
+        if !defined $min || $min !~ /^-?[0-9.]+$/;
+      # The original tested $min here, so a malformed max was never caught.
+      die "Bad max '".(defined $max ? $max : "")."' in --range=$range"
+        if !defined $max || $max !~ /^-?[0-9.]+$/;
+      die "No name given in --range=$range" if !defined $name;
+      push @{$niceranges->{$name}}, [$min, $max];
+    }
+  }
+
+  # now populate featlist
+  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+    my $name = $featlist->{"names"}->[$i];
+    my $min = 0.0;
+    my $max = 1.0;
+    if (defined $niceranges->{$name}) {
+      # consume the ranges of this name in the order they were given
+      my $minmax = shift @{$niceranges->{$name}};
+      ($min, $max) = @$minmax if defined $minmax;
+    }
+    $featlist->{"mins"}->[$i] = $min;
+    $featlist->{"maxs"}->[$i] = $max;
+  }
+  return $featlist;
+}
+
+# safesystem(@cmd)
+#
+# Runs @cmd through system(). Exits the script with status 1 if the command
+# could not be started or was killed by a signal; otherwise returns true
+# when the command's exit code was 0 and false when it was not.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+    print STDERR "Failed to execute: @_\n $!\n";
+    exit(1);
+  }
+  elsif ($? & 127) {
+    printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
+      ($? & 127), ($? & 128) ? 'with' : 'without';
+    exit(1);
+  }
+  else {
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
+
+
+
+
+
diff --git a/contrib/mert-sge-nosync/training/sge-nosync/process-moses-result-sge-nosync.pl b/contrib/mert-sge-nosync/training/sge-nosync/process-moses-result-sge-nosync.pl new file mode 100755 index 000000000..ee37b0f8f --- /dev/null +++ b/contrib/mert-sge-nosync/training/sge-nosync/process-moses-result-sge-nosync.pl @@ -0,0 +1,1377 @@
+#! /usr/bin/perl
+
+# $Id$
+# Usage:
+# mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
+# For other options see below or run 'mert-moses.pl --help'
+
+# Notes:
+# <foreign> and <english> should be raw text files, one sentence per line
+# <english> can be a prefix, in which case the files are <english>0, <english>1, etc. are used
+
+# Excerpts from revision history
+
+# Sept 2011 multi-threaded mert (Barry Haddow)
+# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK)
+# Jul 2011 simplifications (Ondrej Bojar)
+# -- rely on moses' -show-weights instead of parsing moses.ini
+# ...
so moses is also run once *before* mert starts, checking +# the model to some extent +# -- got rid of the 'triples' mess; +# use --range to supply bounds for random starting values: +# --range tm:-3..3 --range lm:-3..3 +# 5 Aug 2009 Handling of different reference length policies (shortest, average, closest) for BLEU +# and case-sensitive/insensitive evaluation (Nicola Bertoldi) +# 5 Jun 2008 Forked previous version to support new mert implementation. +# 13 Feb 2007 Better handling of default values for lambda, now works with multiple +# models and lexicalized reordering +# 11 Oct 2006 Handle different input types through parameter --inputtype=[0|1] +# (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi) +# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table) +# useful if binary phrase tables are used (Nicola Bertoldi) +# 28 Aug 2006 Use either closest or average or shortest (default) reference +# length as effective reference length +# Use either normalization or not (default) of texts (Nicola Bertoldi) +# 31 Jul 2006 move gzip run*.out to avoid failure with restarts +# adding default paths +# 29 Jul 2006 run-filter, score-nbest and mert run on the queue (Nicola; Ondrej had to type it in again) +# 28 Jul 2006 attempt at foolproof usage, strong checking of input validity, merged the parallel and nonparallel version (Ondrej Bojar) +# 27 Jul 2006 adding the safesystem() function to handle process failure +# 22 Jul 2006 fixed a bug about handling relative path of configuration file (Nicola Bertoldi) +# 21 Jul 2006 adapted for Moses-in-parallel (Nicola Bertoldi) +# 18 Jul 2006 adapted for Moses and cleaned up (PK) +# 21 Jan 2005 unified various versions, thorough cleanup (DWC) +# now indexing accumulated n-best list solely by feature vectors +# 14 Dec 2004 reimplemented find_threshold_points in C (NMD) +# 25 Oct 2004 Use either average or shortest (default) reference +# length as effective reference length (DWC) +#
13 Oct 2004 Use alternative decoders (DWC) +# Original version by Philipp Koehn + +use strict; +use Net::OpenSSH::Compat::Perl; + +use FindBin qw($Bin); +use File::Basename; +use File::Path; +use File::Spec; +use Cwd; + +my $SCRIPTS_ROOTDIR = $Bin; +$SCRIPTS_ROOTDIR =~ s/\/training$//; +$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"}); + +## We preserve this bit of comments to keep the traditional weight ranges. +# "w" => [ [ 0.0, -1.0, 1.0 ] ], # word penalty +# "d" => [ [ 1.0, 0.0, 2.0 ] ], # lexicalized reordering model +# "lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model +# "g" => [ [ 1.0, 0.0, 2.0 ], # generation model +# [ 1.0, 0.0, 2.0 ] ], +# "tm" => [ [ 0.3, 0.0, 0.5 ], # translation model +# [ 0.2, 0.0, 0.5 ], +# [ 0.3, 0.0, 0.5 ], +# [ 0.2, 0.0, 0.5 ], +# [ 0.0,-1.0, 1.0 ] ], # ... last weight is phrase penalty +# "lex"=> [ [ 0.1, 0.0, 0.2 ] ], # global lexical model +# "I" => [ [ 0.0,-1.0, 1.0 ] ], # input lattice scores + + + +# moses.ini file uses FULL names for lambdas, while this training script +# internally (and on the command line) uses ABBR names. 
+my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w + g=weight-generation lex=weight-lex I=weight-i); +my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP; +my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP; + +my $minimum_required_change_in_weights = 0.00001; + # stop if no lambda changes more than this + +my $verbose = 0; +my $usage = 0; # request for --help + +# We assume that if you don't specify working directory, +# we set the default is set to `pwd`/mert-work +# my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work"); +my $___WORKING_DIR = undef; +my $___DEV_E = undef; +my $___DEV_F = undef; # required, input text to decode +my $___DECODER = undef; # required, pathname to the decoder executable +my $___CONFIG = undef; # required, pathname to startup ini file +my $___N_BEST_LIST_SIZE = 100; +my $___LATTICE_SAMPLES = 0; +my $queue_flags = "-hard"; # extra parameters for parallelizer + # the -l ws0ssmt was relevant only to JHU 2006 workshop +my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial) +my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder +my $continue = 0; # should we try to continue from the last saved step? 
+my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert) +my $___FILTER_PHRASE_TABLE = 1; # filter phrase table +my $___PREDICTABLE_SEEDS = 0; +my $___START_WITH_HISTORIC_BESTS = 0; # use best settings from all previous iterations as starting points [Foster&Kuhn,2009] +my $___RANDOM_DIRECTIONS = 0; # search in random directions only +my $___NUM_RANDOM_DIRECTIONS = 0; # number of random directions, also works with default optimizer [Cer&al.,2008] +my $___PAIRWISE_RANKED_OPTIMIZER = 0; # use Hopkins&May[2011] +my $___PRO_STARTING_POINT = 0; # get a starting point from pairwise ranked optimizer +my $___RANDOM_RESTARTS = 20; +my $___HISTORIC_INTERPOLATION = 0; # interpolate optimize weights with previous iteration's weights [Hopkins&May,2011,5.4.3] +my $__THREADS = 0; +my $run = 0; +my $nextrun = 0; +my $submithost = ""; + + +# Parameter for effective reference length when computing BLEU score +# Default is to use shortest reference +# Use "--shortest" to use shortest reference length +# Use "--average" to use average reference length +# Use "--closest" to use closest reference length +# Only one between --shortest, --average and --closest can be set +# If more than one choice the defualt (--shortest) is used +my $___SHORTEST = 0; +my $___AVERAGE = 0; +my $___CLOSEST = 0; + +# Use "--nocase" to compute case-insensitive scores +my $___NOCASE = 0; + +# Use "--nonorm" to non normalize translation before computing scores +my $___NONORM = 0; + +# set 0 if input type is text, set 1 if input type is confusion network +my $___INPUTTYPE = 0; + + +my $mertdir = undef; # path to new mert directory +my $mertargs = undef; # args to pass through to mert & extractor +my $mertmertargs = undef; # args to pass through to mert only +my $extractorargs = undef; # args to pass through to extractor only +my $filtercmd = undef; # path to filter-model-given-input.pl +my $filterfile = undef; +my $qsubwrapper = undef; +my $qsubwrapper_exit = 
undef; +my $moses_parallel_cmd = undef; +my $old_sge = 0; # assume sge<6.0 +my $___CONFIG_ORIG = undef; # pathname to startup ini file before filtering +my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on + # if undef work on all features + # (others are fixed to the starting values) +my $___RANGES = undef; +my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loading data (default =-1) + # -1 means all previous, i.e. from iteration 1 + # 0 means no previous data, i.e. from actual iteration + # 1 means 1 previous data , i.e. from the actual iteration and from the previous one + # and so on +my $maximum_iterations = 25; + +##################### +my $processfeatlistcmd = undef; +my $processfeatlistargs = undef; +my $createconfigcmd = undef; +my $createconfigargs = undef; +my $decoderargs = undef; +##################### + +use Getopt::Long; +GetOptions( + "working-dir=s" => \$___WORKING_DIR, + "input=s" => \$___DEV_F, + "inputtype=i" => \$___INPUTTYPE, + "refs=s" => \$___DEV_E, + "decoder=s" => \$___DECODER, + "config=s" => \$___CONFIG, + "nbest=i" => \$___N_BEST_LIST_SIZE, + "lattice-samples=i" => \$___LATTICE_SAMPLES, + "queue-flags=s" => \$queue_flags, + "jobs=i" => \$___JOBS, + "decoder-flags=s" => \$___DECODER_FLAGS, + "continue" => \$continue, + "skip-decoder" => \$skip_decoder, + "shortest" => \$___SHORTEST, + "average" => \$___AVERAGE, + "closest" => \$___CLOSEST, + "nocase" => \$___NOCASE, + "nonorm" => \$___NONORM, + "help" => \$usage, + "verbose" => \$verbose, + "mertdir=s" => \$mertdir, + "mertargs=s" => \$mertargs, + "extractorargs=s" => \$extractorargs, + "mertmertargs=s" => \$mertmertargs, + "rootdir=s" => \$SCRIPTS_ROOTDIR, + "filtercmd=s" => \$filtercmd, # allow to override the default location + "filterfile=s" => \$filterfile, # input to filtering script (useful for lattices/confnets) + "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location + 
"mosesparallelcmd=s" => \$moses_parallel_cmd, # allow to override the default location + "old-sge" => \$old_sge, #passed to moses-parallel + "filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # (dis)allow of phrase tables + "predictable-seeds" => \$___PREDICTABLE_SEEDS, # make random restarts deterministic + "historic-bests" => \$___START_WITH_HISTORIC_BESTS, # use best settings from all previous iterations as starting points + "random-directions" => \$___RANDOM_DIRECTIONS, # search only in random directions + "run=i" => \$run, + "number-of-random-directions=i" => \$___NUM_RANDOM_DIRECTIONS, # number of random directions + "random-restarts=i" => \$___RANDOM_RESTARTS, # number of random restarts + "activate-features=s" => \$___ACTIVATE_FEATURES, #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values) + "range=s@" => \$___RANGES, + "submithost=s" => \$submithost, + "prev-aggregate-nbestlist=i" => \$prev_aggregate_nbl_size, #number of previous step to consider when loading data (default =-1, i.e. all previous) + "maximum-iterations=i" => \$maximum_iterations, + "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER, + "pro-starting-point" => \$___PRO_STARTING_POINT, + "historic-interpolation=f" => \$___HISTORIC_INTERPOLATION, + "threads=i" => \$__THREADS +) or exit(1); + +# the 4 required parameters can be supplied on the command line directly +# or using the --options +if (scalar @ARGV == 3) { + # required parameters: input_file references_basename decoder_executable + $___DEV_F = shift; + $___DECODER = shift; + $___CONFIG = shift; +} + + + +if ($usage || !defined $___DECODER || !defined $___CONFIG || !defined $___DEV_F) { + print STDERR "usage: $0 input-text decoder-executable decoder.ini +Options: + --working-dir=mert-dir ... where all the files are created + --nbest=100 ... how big nbestlist to generate + --lattice-samples ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010) + --jobs=N ... 
set this to anything to run moses in parallel + --mosesparallelcmd=STR ... use a different script instead of moses-parallel + --queue-flags=STRING ... anything you with to pass to qsub, eg. + '-l ws06osssmt=true'. The default is: '-hard' + To reset the parameters, please use + --queue-flags=' ' + (i.e. a space between the quotes). + --decoder-flags=STRING ... extra parameters for the decoder + --continue ... continue from the last successful iteration + --skip-decoder ... skip the decoder run for the first time, + assuming that we got interrupted during + optimization + --shortest --average --closest + ... Use shortest/average/closest reference length + as effective reference length (mutually exclusive) + --nocase ... Do not preserve case information; i.e. + case-insensitive evaluation (default is false). + --nonorm ... Do not use text normalization (flag is not active, + i.e. text is NOT normalized) + --filtercmd=STRING ... path to filter-model-given-input.pl + --filterfile=STRING ... path to alternative to input-text for filtering + model. useful for lattice decoding + --rootdir=STRING ... where do helpers reside (if not given explicitly) + --mertdir=STRING ... path to new mert implementation + --mertargs=STRING ... extra args for both extractor and mert + --extractorargs=STRING ... extra args for extractor only + --mertmertargs=STRING ... extra args for mert only + --scorenbestcmd=STRING ... path to score-nbest.py + --old-sge ... passed to parallelizers, assume Grid Engine < 6.0 + --inputtype=[0|1|2] ... Handle different input types: (0 for text, + 1 for confusion network, 2 for lattices, + default is 0) + --no-filter-phrase-table ... disallow filtering of phrase tables + (useful if binary phrase tables are available) + --random-restarts=INT ... number of random restarts (default: 20) + --predictable-seeds ... provide predictable seeds to mert so that random + restarts are the same on every run + --range=tm:0..1,-1..1 ... 
specify min and max value for some features + --range can be repeated as needed. + The order of the various --range specifications + is important only within a feature name. + E.g.: + --range=tm:0..1,-1..1 --range=tm:0..2 + is identical to: + --range=tm:0..1,-1..1,0..2 + but not to: + --range=tm:0..2 --range=tm:0..1,-1..1 + --activate-features=STRING ... comma-separated list of features to optimize, + others are fixed to the starting values + default: optimize all features + example: tm_0,tm_4,d_0 + --prev-aggregate-nbestlist=INT ... number of previous step to consider when + loading data (default = $prev_aggregate_nbl_size) + -1 means all previous, i.e. from iteration 1 + 0 means no previous data, i.e. only the + current iteration + N means this and N previous iterations + + --maximum-iterations=ITERS ... Maximum number of iterations. Default: $maximum_iterations + --random-directions ... search only in random directions + --number-of-random-directions=int ... number of random directions + (also works with regular optimizer, default: 0) + --pairwise-ranked ... Use PRO for optimisation (Hopkins and May, emnlp 2011) + --pro-starting-point ... Use PRO to get a starting point for MERT + --threads=NUMBER ... Use multi-threaded mert (must be compiled in). + --historic-interpolation ... Interpolate optimized weights with prior iterations' weight + (parameter sets factor [0;1] given to current weights) +"; + exit 1; +} + + +# Check validity of input parameters and set defaults if needed + +print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n"; + +# path of script for filtering phrase tables and running the decoder +$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd; + +if ( ! -x $filtercmd && ! 
$___FILTER_PHRASE_TABLE) { + print STDERR "Filtering command not found: $filtercmd.\n"; + print STDERR "Use --filtercmd=PATH to specify a valid one or --no-filter-phrase-table\n"; + exit 1; +} + +# $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper; +$qsubwrapper = "$SCRIPTS_ROOTDIR/generic/qsub-wrapper-sge-nosync.pl" if !defined $qsubwrapper; + +$qsubwrapper_exit = "$SCRIPTS_ROOTDIR/generic/qsub-wrapper-exit-sge-nosync.pl" if !defined $qsubwrapper_exit; + +# $moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl" +# if !defined $moses_parallel_cmd; +$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel-sge-nosync.pl" + if !defined $moses_parallel_cmd; + +if (!defined $mertdir) { + $mertdir = "$SCRIPTS_ROOTDIR/../mert"; + print STDERR "Assuming --mertdir=$mertdir\n"; +} + +my $mert_extract_cmd = "$mertdir/extractor"; +my $mert_mert_cmd = "$mertdir/mert"; +my $mert_pro_cmd = "$mertdir/pro"; + +die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd; +die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd; +die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd; + +my $pro_optimizer = "$mertdir/megam_i686.opt"; # or set to your installation +if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) { + print "did not find $pro_optimizer, installing it in $mertdir\n"; + `cd $mertdir; wget http://www.cs.utah.edu/~hal/megam/megam_i686.opt.gz;`; + `gunzip $pro_optimizer.gz`; + `chmod +x $pro_optimizer`; + die("ERROR: Installation of megam_i686.opt failed! 
Install by hand from http://www.cs.utah.edu/~hal/megam/") unless -x $pro_optimizer; +} + +$mertargs = "" if !defined $mertargs; + +my $scconfig = undef; +if ($mertargs =~ /\-\-scconfig\s+(.+?)(\s|$)/){ + $scconfig=$1; + $scconfig =~ s/\,/ /g; + $mertargs =~ s/\-\-scconfig\s+(.+?)(\s|$)//; +} + +# handling reference lengh strategy +if (($___CLOSEST + $___AVERAGE + $___SHORTEST) > 1){ + die "You can specify just ONE reference length strategy (closest or shortest or average) not both\n"; +} + +if ($___SHORTEST){ + $scconfig .= " reflen:shortest"; +}elsif ($___AVERAGE){ + $scconfig .= " reflen:average"; +}elsif ($___CLOSEST){ + $scconfig .= " reflen:closest"; +} + +# handling case-insensitive flag +if ($___NOCASE) { + $scconfig .= " case:false"; +}else{ + $scconfig .= " case:true"; +} +$scconfig =~ s/^\s+//; +$scconfig =~ s/\s+$//; +$scconfig =~ s/\s+/,/g; + +$scconfig = "--scconfig $scconfig" if ($scconfig); + +my $mert_extract_args=$mertargs; +$mert_extract_args .=" $scconfig"; +$mert_extract_args .=" $extractorargs"; + +$mertmertargs = "" if !defined $mertmertargs; + +my $mert_mert_args="$mertargs $mertmertargs"; +$mert_mert_args =~ s/\-+(binary|b)\b//; +$mert_mert_args .=" $scconfig"; +if ($___ACTIVATE_FEATURES){ $mert_mert_args .=" -o \"$___ACTIVATE_FEATURES\""; } + +my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd); +die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd; +die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd; +die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper; +# die "Not executable: $___DECODER" if ! -x $___DECODER; + +my $input_abs = ensure_full_path($___DEV_F); +die "File not found: $___DEV_F (interpreted as $input_abs)." +if ! -e $input_abs; + $___DEV_F = $input_abs; + +# Option to pass to qsubwrapper and moses-parallel +my $pass_old_sge = $old_sge ? 
"-old-sge" : ""; + +my $decoder_abs = ensure_full_path($___DECODER); +die "File not executable: $___DECODER (interpreted as $decoder_abs)." + if ! -x $decoder_abs; +$___DECODER = $decoder_abs; + + +my $config_abs = ensure_full_path($___CONFIG); +die "File not found: $___CONFIG (interpreted as $config_abs)." + if ! -e $config_abs; +$___CONFIG = $config_abs; + +# moses should use our config +if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) / +|| $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) / +|| $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(generation-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) / +) { + die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files."; +} + +# as weights are normalized in the next steps (by cmert) +# normalize initial LAMBDAs, too +my $need_to_normalize = 0; + +#store current directory and create the working directory (if needed) +my $cwd = `pawd 2>/dev/null`; +if(!$cwd){$cwd = `pwd`;} +chomp($cwd); + +$___WORKING_DIR = $cwd if (!defined $___WORKING_DIR); +chomp $___WORKING_DIR; + +# mkpath($___WORKING_DIR); + +{ +# open local scope + +#chdir to the working directory +chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR"; + +# fixed file names +my $mert_outfile = "mert.out"; +my $mert_logfile = "mert.log"; +my $weights_in_file = "init.opt"; +my $weights_out_file = "weights.txt"; + +# set start run +my $start_run = 1; +my $bestpoint = undef; +my $devbleu = undef; +my $sparse_weights_file = undef; +my $jobid = -1; + +my $prev_feature_file = undef; +my $prev_score_file = undef; +my $prev_init_file = undef; + + +######################### +# set jobid to trace different jobs +my $prevjid = undef; + + + +### load featlist when needed######## +print STDERR "loading 
feat config ./run$run.moses.ini\n"; +my $featlist = get_featlist_from_moses("./run$run.moses.ini"); +$featlist = insert_ranges_to_featlist($featlist, $___RANGES); + +# Mark which features are disabled: +if (defined $___ACTIVATE_FEATURES) { + my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES; + my %cnt; + for(my $i=0; $i{"names"}}); $i++) { + my $name = $featlist->{"names"}->[$i]; + $cnt{$name} = 0 if !defined $cnt{$name}; + $featlist->{"enabled"}->[$i] = $enabled{$name."_".$cnt{$name}}; + $cnt{$name}++; + } +} else { + # all enabled + for(my $i=0; $i{"names"}}); $i++) { + $featlist->{"enabled"}->[$i] = 1; + } +} + +##################################### + + + +my $oldallsorted = undef; +my $allsorted = undef; + +my $nbest_file=undef; +my $lsamp_file=undef; #Lattice samples +my $orig_nbest_file=undef; # replaced if lattice sampling +my $cmd=undef; + + + + + my $base_feature_file = "features.dat"; + my $base_score_file = "scores.dat"; + my $feature_file = "run$run.${base_feature_file}"; + my $score_file = "run$run.${base_score_file}"; + + + + # Create the initial weights file for mert: init.opt + foreach (@{$featlist->{"names"}}) { + print STDERR "feature list name: $_\n"; + } + + my @MIN = @{$featlist->{"mins"}}; + my @MAX = @{$featlist->{"maxs"}}; + my @CURR = @{$featlist->{"values"}}; + my @NAME = @{$featlist->{"names"}}; + + open(OUT,"> $weights_in_file") + or die "Can't write $weights_in_file (WD now $___WORKING_DIR)"; + print OUT join(" ", @CURR)."\n"; + print OUT join(" ", @MIN)."\n"; # this is where we could pass MINS + print OUT join(" ", @MAX)."\n"; # this is where we could pass MAXS + close(OUT); + # print join(" ", @NAME)."\n"; + + print "CURR before MERT @CURR\n"; + + # make a backup copy labelled with this run number + safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die; + + my $DIM = scalar(@CURR); # number of lambdas + + # run mert + $cmd = "$mert_mert_cmd -d $DIM $mert_mert_args"; + + my $mert_settings = " -n 
$___RANDOM_RESTARTS"; + my $seed_settings = ""; + if ($___PREDICTABLE_SEEDS) { + my $seed = $run * 1000; + $seed_settings .= " -r $seed"; + } + $mert_settings .= $seed_settings; + if ($___RANDOM_DIRECTIONS) { + if ($___NUM_RANDOM_DIRECTIONS == 0) { + $mert_settings .= " -m 50"; + } + $mert_settings .= " -t random-direction"; + } + if ($___NUM_RANDOM_DIRECTIONS) { + $mert_settings .= " -m $___NUM_RANDOM_DIRECTIONS"; + } + if ($__THREADS) { + $mert_settings .= " --threads $__THREADS"; + } + + ############################## + # construct prev_feature_file and prev_socre_file + + my $firstrun; + if ($prev_aggregate_nbl_size==-1){ + $firstrun=1; + } + else{ + $firstrun=$run-$prev_aggregate_nbl_size+1; + $firstrun=($firstrun>0)?$firstrun:1; + } + ############################## + + print "loading data from $firstrun to $run (prev_aggregate_nbl_size=$prev_aggregate_nbl_size)\n"; + $prev_feature_file = undef; + $prev_score_file = undef; + $prev_init_file = undef; + for (my $i=$firstrun;$i<$run;$i++){ + if (defined $prev_feature_file){ + $prev_feature_file = "${prev_feature_file},run${i}.${base_feature_file}"; + } + else{ + $prev_feature_file = "run${i}.${base_feature_file}"; + } + if (defined $prev_score_file){ + $prev_score_file = "${prev_score_file},run${i}.${base_score_file}"; + } + else{ + $prev_score_file = "run${i}.${base_score_file}"; + } + } + + print STDERR "prev_feature_file is $prev_feature_file\n"; + print STDERR "prev_score_files is $prev_score_file\n"; + print STDERR "prev_init_files is undefined\n"; + ############################ + + + my $ffiles = ""; + my $scfiles = ""; + if (defined $prev_feature_file) { + $ffiles = "$prev_feature_file,$feature_file"; + } + else{ + $ffiles = "$feature_file"; + } + if (defined $prev_score_file) { + $scfiles = "$prev_score_file,$score_file"; + } + else{ + $scfiles = "$score_file"; + } + + my $file_settings = " --ffile $ffiles --scfile $scfiles"; + my $pro_file_settings = "--ffile " . join( " --ffile ", split(/,/, $ffiles)) . 
+ " --scfile " . join( " --scfile ", split(/,/, $scfiles)); + + if ($___START_WITH_HISTORIC_BESTS && defined $prev_init_file) { + $file_settings .= " --ifile $prev_init_file,run$run.$weights_in_file"; + } + else{ + $file_settings .= " --ifile run$run.$weights_in_file"; + } + + $cmd .= $file_settings; + + # pro optimization + if ($___PAIRWISE_RANKED_OPTIMIZER) { + print STDERR "\$___PAIRWISE_RANKED_OPTIMIZER not supported in non-sync mode\n"; + exit 1; + } + # first pro, then mert + elsif ($___PRO_STARTING_POINT) { + print STDERR "\$___PRO_STARTING_POINT not supported in non-sync mode\n"; + exit 1; + # run pro... + my $pro_cmd = "$mert_pro_cmd $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data"; + &submit_or_exec($pro_cmd,"run$run.pro.out","run$run.pro.err"); + # ... get results ... + my %dummy; + ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.pro.out","run$run.pro.err",scalar @{$featlist->{"names"}},\%dummy); + open(PRO_START,">run$run.init.pro"); + print PRO_START $bestpoint."\n"; + close(PRO_START); + # ... and run mert + $cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/; + &submit_or_exec($cmd.$mert_settings,$mert_outfile,$mert_logfile); + } + # just mert + else { + print STDERR "I am running just MERT\n"; + &submit_or_exec($cmd.$mert_settings,$mert_outfile,$mert_logfile); + } + + die "Optimization failed, file $weights_out_file does not exist or is empty" + if ! 
-s $weights_out_file; + + # backup copies + safesystem ("\\cp -f extract.err run$run.extract.err") or die; + safesystem ("\\cp -f extract.out run$run.extract.out") or die; + safesystem ("\\cp -f $mert_outfile run$run.$mert_outfile") or die; + safesystem ("\\cp -f $mert_logfile run$run.$mert_logfile") or die; + safesystem ("touch $mert_logfile run$run.$mert_logfile") or die; + safesystem ("\\cp -f $weights_out_file run$run.$weights_out_file") or die; # this one is needed for restarts, too + + print "run $run end at ".`date`; + + my %sparse_weights; # sparse features + ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.$mert_outfile","run$run.$mert_logfile",scalar @{$featlist->{"names"}},\%sparse_weights); + + die "Failed to parse mert.log, missed Best point there." + if !defined $bestpoint || !defined $devbleu; + + print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`; + + # update my cache of lambda values + my @newweights = split /\s+/, $bestpoint; + + # interpolate with prior's interation weight, if historic-interpolation is specified + if ($___HISTORIC_INTERPOLATION>0 && $run>3) { + my %historic_sparse_weights; + if (-e "run$run.sparse-weights") { + open(SPARSE,"run$run.sparse-weights"); + while() { + chop; + my ($feature,$weight) = split; + $historic_sparse_weights{$feature} = $weight; + } + } + my $prev = $run-1; + my @historic_weights = split /\s+/, `cat run$prev.$weights_out_file`; + for(my $i=0;$i $sparse_weights{$_}\n"; + } + foreach (keys %historic_sparse_weights) { + $sparse_weights{$_} += (1-$___HISTORIC_INTERPOLATION) * $historic_sparse_weights{$_}; + #print STDERR "sparse_weights{$_} += (1-$___HISTORIC_INTERPOLATION) * $historic_sparse_weights{$_} -> $sparse_weights{$_}\n"; + } + } + if ($___HISTORIC_INTERPOLATION>0) { + open(WEIGHTS,">run$run.$weights_out_file"); + print WEIGHTS join(" ",@newweights); + close(WEIGHTS); + } + + $featlist->{"values"} = \@newweights; + + if (scalar keys %sparse_weights) { + $sparse_weights_file = 
"run".($run+1).".sparse-weights"; + open(SPARSE,">".$sparse_weights_file); + foreach my $feature (keys %sparse_weights) { + print SPARSE "$feature $sparse_weights{$feature}\n"; + } + close(SPARSE); + } + + ## additional stopping criterion: weights have not changed + my $shouldstop = 1; + print "CURR after MERT: @CURR\n"; + + for(my $i=0; $i<@CURR; $i++) { + die "Lost weight! mert reported fewer weights (@newweights) than we gave it (@CURR)" + if !defined $newweights[$i]; + # print STDERR "$i Current weight : $CURR[$i]\n"; + # print STDERR "$i New weight: $newweights[$i]\n"; + if (abs($CURR[$i] - $newweights[$i]) >= $minimum_required_change_in_weights) { + $shouldstop = 0; + # last; + } + } + + + open F, "> finished_step.txt" or die "Can't mark finished step"; + print F $run."\n"; + close F; + + ##################################################33 + print "loading data from $prev_feature_file\n" if defined($prev_feature_file); + print "loading data from $prev_score_file\n" if defined($prev_score_file); + print "loading data from $prev_init_file\n" if defined($prev_init_file); + + + ### extra step to create config for next step ################### + $nextrun=$run+1; + + create_config($___CONFIG, "./run$nextrun.moses.ini", $featlist, $nextrun, $devbleu, $sparse_weights_file); + ################################################################# + + + if ($shouldstop) { + print STDERR "None of the weights changed more than $minimum_required_change_in_weights. 
Stopping.\n"; + + for (my $i=$run+1;$i<=$maximum_iterations;$i++){ + kill_unnecessary_jobs($submithost,$i); + } + last; ## end while loop + } + + + + +} # end of local scope + + + +sub kill_unnecessary_jobs { + my ($submithost,$iter_id) = @_; + my @sgepids = (); + my @processids = (); + my $jobid = ""; + my $pidid = undef; + my $kill_target = ""; + my $ssh = undef; + + ## read all relevant jobid ######### + chomp($jobid=`cat decode$iter_id.id decode$iter_id.POLL.id zipext$iter_id.id processmoses$iter_id.id`); + + if ($jobid != "") { + @sgepids = split("\n",$jobid); + $kill_target = join(" ",@sgepids); + } + #################################### + + if ($kill_target != "") { + chomp(my $my_username = `whoami`); + $ssh = Net::OpenSSH::Compat::Perl->new($submithost, debug=>0); + + $ssh->login("$my_username",`cat /home/$my_username/accpw`); + $ssh->cmd("qdel $kill_target"); + } + + + + ## read relevant pid ############### + chomp($pidid=`ls decode$iter_id.id.pid decode$iter_id.POLL.id.pid zipext$iter_id.id.pid processmoses$iter_id.id.pid`); + + if (defined $pidid) { + @processids = split("\n",$pidid); + } + ################################### + + + foreach my $kill_pidfile (@processids) { + print STDERR "Force delete process represented by $kill_pidfile\n"; + &force_exit_submit_thu_host($submithost,$kill_pidfile); + } + #################################### + + + + +} + +sub get_weights_from_mert { + my ($outfile,$logfile,$weight_count,$sparse_weights) = @_; + my ($bestpoint,$devbleu); + + if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)) { + open(IN,$outfile) or die "Can't open $outfile"; + my (@WEIGHT,$sum); + for(my $i=0;$i<$weight_count;$i++) { push @WEIGHT, 0; } + while() { + # regular features + if (/^F(\d+) ([\-\.\de]+)/) { + $WEIGHT[$1] = $2; + $sum += abs($2); + } + # sparse features + elsif(/^(.+_.+) ([\-\.\de]+)/) { + $$sparse_weights{$1} = $2; + } + } + $devbleu = "unknown"; + foreach (@WEIGHT) { $_ /= $sum; } + foreach (keys 
%{$sparse_weights}) { $$sparse_weights{$_} /= $sum; } + $bestpoint = join(" ",@WEIGHT); + close IN; + } + else { + open(IN,$logfile) or die "Can't open $logfile"; + while () { + if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) { + $bestpoint = $1; + $devbleu = $2; + last; + } + } + close IN; + } + return ($bestpoint,$devbleu); +} + + +sub insert_ranges_to_featlist { + my $featlist = shift; + my $ranges = shift; + + $ranges = [] if !defined $ranges; + + # first collect the ranges from options + my $niceranges; + foreach my $range (@$ranges) { + my $name = undef; + foreach my $namedpair (split /,/, $range) { + if ($namedpair =~ /^(.*?):/) { + $name = $1; + $namedpair =~ s/^.*?://; + die "Unrecognized name '$name' in --range=$range" + if !defined $ABBR2FULL{$name}; + } + my ($min, $max) = split /\.\./, $namedpair; + die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/; + die "Bad max '$max' in --range=$range" if $min !~ /^-?[0-9.]+$/; + die "No name given in --range=$range" if !defined $name; + push @{$niceranges->{$name}}, [$min, $max]; + } + } + + # now populate featlist + my $seen = undef; + for(my $i=0; $i{"names"}}); $i++) { + my $name = $featlist->{"names"}->[$i]; + $seen->{$name} ++; + my $min = 0.0; + my $max = 1.0; + if (defined $niceranges->{$name}) { + my $minmax = shift @{$niceranges->{$name}}; + ($min, $max) = @$minmax if defined $minmax; + } + $featlist->{"mins"}->[$i] = $min; + $featlist->{"maxs"}->[$i] = $max; + } + return $featlist; + } + +sub sanity_check_order_of_lambdas { + my $featlist = shift; + my $filename_or_stream = shift; + + my @expected_lambdas = @{$featlist->{"names"}}; + my @got = get_order_of_scores_from_nbestlist($filename_or_stream); + die "Mismatched lambdas. 
Decoder returned @got, we expected @expected_lambdas" + if "@got" ne "@expected_lambdas"; +} + +sub get_featlist_from_moses { + # run moses with the given config file and return the list of features and + # their initial values + + my $configfn = shift; + my $featlistfn = "./features.list.run${run}_end"; + if (-e $featlistfn) { + print STDERR "Deleting featlistfn: $featlistfn\n"; + } + + print STDERR "Asking moses for feature names and values from $___CONFIG\n"; + my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn"; + safesystem($cmd) or die "Failed to run moses with the config $configfn"; + + + + + # read feature list + my @names = (); + my @startvalues = (); + open(INI,$featlistfn) or die "Can't read $featlistfn"; + my $nr = 0; + my @errs = (); + while () { + $nr++; + chomp; + /^(.+) (\S+) (\S+)$/ || die("invalid feature: $_"); + my ($longname, $feature, $value) = ($1,$2,$3); + next if $value eq "sparse"; + push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n" + if $value !~ /^[+-]?[0-9.e]+$/; + push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n" + if !defined $ABBR2FULL{$feature}; + push @names, $feature; + push @startvalues, $value; + } + close INI; + if (scalar @errs) { + print STDERR join("", @errs); + exit 1; + } + return {"names"=>\@names, "values"=>\@startvalues}; +} + +sub get_order_of_scores_from_nbestlist { + # read the first line and interpret the ||| label: num num num label2: num ||| column in nbestlist + # return the score labels in order + my $fname_or_source = shift; + # print STDERR "Peeking at the beginning of nbestlist to get order of scores: $fname_or_source\n"; + open IN, $fname_or_source or die "Failed to get order of scores from nbestlist '$fname_or_source'"; + my $line = ; + close IN; + die "Line empty in nbestlist '$fname_or_source'" if !defined $line; + my ($sent, $hypo, $scores, $total) = split /\|\|\|/, $line; + $scores 
=~ s/^\s*|\s*$//g; + die "No scores in line: $line" if $scores eq ""; + + my @order = (); + my $label = undef; + my $sparse = 0; # we ignore sparse features here + foreach my $tok (split /\s+/, $scores) { + if ($tok =~ /.+_.+:/) { + $sparse = 1; + } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) { + $label = $1; + } elsif ($tok =~ /^-?[-0-9.e]+$/) { + if (!$sparse) { + # a score found, remember it + die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!" + if !defined $label; + push @order, $label; + } + $sparse = 0; + } else { + die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'"; + } + } + print STDERR "The decoder returns the scores in this order: @order\n"; + return @order; +} + + +sub safesystem { + print STDERR "Executing: @_\n"; + system(@_); + if ($? == -1) { + print STDERR "Failed to execute: @_\n $!\n"; + exit(1); + } + elsif ($? & 127) { + printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + } + else { + my $exitcode = $? >> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return ! 
$exitcode; + } +} +sub ensure_full_path { + my $PATH = shift; +$PATH =~ s/\/nfsmnt//; + return $PATH if $PATH =~ /^\//; + my $dir = `pawd 2>/dev/null`; + if(!$dir){$dir = `pwd`;} + chomp($dir); + $PATH = $dir."/".$PATH; + $PATH =~ s/[\r\n]//g; + $PATH =~ s/\/\.\//\//g; + $PATH =~ s/\/+/\//g; + my $sanity = 0; + while($PATH =~ /\/\.\.\// && $sanity++<10) { + $PATH =~ s/\/+/\//g; + $PATH =~ s/\/[^\/]+\/\.\.\//\//g; + } + $PATH =~ s/\/[^\/]+\/\.\.$//; + $PATH =~ s/\/+$//; +$PATH =~ s/\/nfsmnt//; + return $PATH; +} + +sub submit_or_exec { + + my $argvlen = @_; + my $cmd = undef; + my $stdout = undef; + my $stderr = undef; + my $jidfile = undef; + my $prevjid = undef; + + # if supply 3 arguments, exec without submit + # if supply 4 arguments, then submit new job + # if supply 5 arguments, wait for the previous job to finish + if ($argvlen == 3){ + ($cmd,$stdout,$stderr) = @_; + } elsif ($argvlen == 4){ + ($cmd,$stdout,$stderr,$jidfile) = @_; + } elsif ($argvlen == 5){ + ($cmd,$stdout,$stderr,$jidfile,$prevjid) = @_; + } + + print STDERR "exec: $cmd\n"; + if (defined $___JOBS && $___JOBS > 0 && $argvlen==5) { + safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameters=\"$queue_flags\" -stdout=$stdout -stderr=$stderr -jidfile=$jidfile -prevjid=$prevjid" ) + or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)"; + } + elsif (defined $___JOBS && $___JOBS > 0 && $argvlen==4) { + safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameters=\"$queue_flags\" -stdout=$stdout -stderr=$stderr -jidfile=$jidfile" ) + or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)"; + } else { + safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'."; + } +} + +sub exit_submit { + + my $argvlen = @_; + my $cmd = undef; + my $stdout = undef; + my $stderr = undef; + my $jidfile = undef; + my $pidfile = undef; + my $prevjid = undef; + my $prevjidarraysize = 0; + my @prevjidarray = (); + my $pid = undef; + my $qsubcmd=""; + my $hj=""; + + # 
if supply 4 arguments, then submit new job + # if supply 5 arguments, wait for the previous job to finish + if ($argvlen == 2) { + ($stdout,$stderr) = @_; + } elsif ($argvlen == 4){ + ($stdout,$stderr,$jidfile,$pidfile) = @_; + } elsif ($argvlen == 5){ + ($stdout,$stderr,$jidfile,$pidfile,$prevjid) = @_; + } + + # parse prevjid ######################## + $prevjid =~ s/^\s+|\s+$//g; + @prevjidarray = split(/\s+/,$prevjid); + $prevjidarraysize = scalar(@prevjidarray); + ######################################## + + + # print STDERR "exec: $stdout\n"; + + # read pid from file, and draft exit script ################## + chomp ($pid=`tail -n 1 $pidfile`); + open (OUT, ">exitjob$pid.sh"); + + my $scriptheader="\#\!/bin/bash\n\#\$ -S /bin/sh\n# Both lines are needed to invoke base\n#the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n"; + $scriptheader .="uname -a\n\n"; + $scriptheader .="cd $___WORKING_DIR\n\n"; + + print OUT $scriptheader; + + print OUT "if $qsubwrapper_exit -submithost=$submithost -stdout=$stdout -stderr=$stderr -jidfile=$jidfile -pidfile=$pidfile > exitjob$pid.out 2> exitjob$pid.err ; then + echo 'succeeded' +else + echo failed with exit status \$\? + die=1 +fi +"; + print OUT "\n\n"; + + close (OUT); + # setting permissions of the script + chmod(oct(755),"exitjob$pid.sh"); + ############################################################## + + + if (defined $___JOBS && $___JOBS > 0 && $argvlen==5) { + if (defined $prevjid && $prevjid!=-1 && $prevjidarraysize == 1){ + $hj = "-hold_jid $prevjid"; + } elsif (defined $prevjid && $prevjidarraysize > 1){ + $hj = "-hold_jid " . 
join(" -hold_jid ", @prevjidarray);
+    }
+    $qsubcmd="qsub $queue_flags -V $hj exitjob$pid.sh > exitjob$pid.log 2>&1";
+    safesystem($qsubcmd) or die "ERROR: Failed to exit-submit $pid (via $qsubwrapper_exit)";
+  } elsif (defined $___JOBS && $___JOBS > 0 && $argvlen==4) {
+    $qsubcmd="qsub $queue_flags -V exitjob$pid.sh > exitjob$pid.log 2>&1";
+    safesystem($qsubcmd) or die "ERROR: Failed to exit-submit $pid (via $qsubwrapper_exit)";
+  } else {
+    safesystem("rm $stdout") or die "ERROR: Failed to remove '$stdout'.";
+    safesystem("rm $stderr") or die "ERROR: Failed to remove '$stderr'.";
+  }
+}
+
+
+
+
+
+
+
+# Write "$outdir/extractor.sh", a bash script that changes into $outdir and
+# runs $cmd, mark it executable and return its path.
+# (The former empty "()" prototype contradicted the two arguments taken.)
+sub create_extractor_script
+{
+  my ($cmd, $outdir) = @_;
+  my $script_path = File::Spec->catfile($outdir, "extractor.sh");
+
+  open my $out, '>', $script_path
+    or die "Couldn't open $script_path for writing: $!\n";
+  print $out "#!/bin/bash\n";
+  print $out "cd $outdir\n";
+  print $out "$cmd\n";
+  close($out);
+
+  `chmod +x $script_path`;
+
+  return $script_path;
+}
+
+
+# Clone the decoder config $infn into $outfn, overriding parameters with the
+# values given in --decoder-flags and with the weights in $featlist.
+# Sections of the source config that are not overridden are copied as-is;
+# overridden sections keep only their comment and blank lines.
+sub create_config {
+  my $infn = shift; # source config
+  my $outfn = shift; # where to save the config
+  my $featlist = shift; # the lambdas we should write
+  my $iteration = shift; # just for verbosity
+  my $bleu_achieved = shift; # just for verbosity
+  my $sparse_weights_file = shift; # only defined when optimizing sparse features
+
+  my %P; # the hash of all parameters we wish to override
+
+  # first convert the command line parameters to the hash
+  { # ensure local scope of vars
+    my $parameter=undef;
+    print "Parsing --decoder-flags: |$___DECODER_FLAGS|\n";
+    # /g is required: trim BOTH ends and collapse EVERY whitespace run,
+    # otherwise the split below yields empty tokens
+    $___DECODER_FLAGS =~ s/^\s+|\s+$//g;
+    $___DECODER_FLAGS =~ s/\s+/ /g;
+    foreach (split(/ /,$___DECODER_FLAGS)) {
+      if (/^\-([^\d].*)$/) {
+        $parameter = $1;
+        $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter});
+      }
+      else {
+        die "Found value with no -paramname before it: $_"
+          if !defined $parameter;
+        push @{$P{$parameter}},$_;
+      }
+    }
+  }
+
+  # First delete all weights params from the input, we're overwriting them.
+  # Delete both short and long-named version.
+  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+    my $name = $featlist->{"names"}->[$i];
+    delete($P{$name});
+    delete($P{$ABBR2FULL{$name}}) if defined $ABBR2FULL{$name};
+  }
+
+  # Convert weights to elements in P
+  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+    my $name = $featlist->{"names"}->[$i];
+    my $val = $featlist->{"values"}->[$i];
+    $name = defined $ABBR2FULL{$name} ? $ABBR2FULL{$name} : $name;
+    # ensure long name
+    push @{$P{$name}}, $val;
+  }
+
+  if (defined($sparse_weights_file)) {
+    push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file;
+  }
+
+  # create new moses.ini decoder config file by cloning and overriding the original one
+  open(INI,$infn) or die "Can't read $infn";
+  delete($P{"config"}); # never output
+  print "Saving new config to: $outfn\n";
+  open(OUT,"> $outfn") or die "Can't write $outfn";
+  print OUT "# MERT optimized configuration\n";
+  print OUT "# decoder $___DECODER\n";
+  print OUT "# BLEU $bleu_achieved on dev $___DEV_F\n";
+  print OUT "# We were before running iteration $iteration\n";
+  print OUT "# finished ".`date`;
+  my $line = <INI>;
+  while(1) {
+    last unless $line;
+
+    # skip until hit [parameter]
+    if ($line !~ /^\[(.+)\]\s*$/) {
+      $line = <INI>;
+      # $line is undefined at end of file; the loop head then terminates
+      print OUT $line if defined $line && ($line =~ /^\#/ || $line =~ /^\s+$/);
+      next;
+    }
+
+    # parameter name
+    my $parameter = $1;
+    $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter});
+    print OUT "[$parameter]\n";
+
+    # change parameter, if new values
+    if (defined($P{$parameter})) {
+      # write new values
+      foreach (@{$P{$parameter}}) {
+        print OUT $_."\n";
+      }
+      delete($P{$parameter});
+      # skip until new parameter, only write comments
+      while($line = <INI>) {
+        print OUT $line if $line =~ /^\#/ || $line =~ /^\s+$/;
+        last if $line =~ /^\[/;
+        last unless $line;
+      }
+      next;
+    }
+    # unchanged parameter, write old
+    while($line = <INI>) {
+      last if $line =~ /^\[/;
+      print OUT $line;
+    }
+  }
+
+  # write all additional parameters
+  foreach my $parameter (keys %P) {
+    print OUT
"\n[$parameter]\n"; + foreach (@{$P{$parameter}}) { + print OUT $_."\n"; + } + } + + close(INI); + close(OUT); + print STDERR "Saved: $outfn\n"; +} + + + + +sub force_exit_submit_thu_host { + + my $argvlen = @_; + my $submithost = undef; + my $run = -1; + my $idx = ""; + my $batch_and_join = ""; + my $my_username = undef; + my $cmd = undef; + my $stdout = undef; + my $stderr = undef; + my $jidfile = undef; + my $pidfile = undef; + my $prevjid = undef; + my $prevjidarraysize = 0; + my @prevjidarray = (); + my $pid = undef; + my $qsubcmd=""; + my $hj=""; + + # if supply 8 arguments, then submit new job + # if supply 9 arguments, wait for the previous job to finish + ($submithost,$pidfile) = @_; + + #if ($argvlen == 6){ + # ($submithost,$run,$idx,$batch_and_join,$stdout,$stderr) = @_; + #} elsif ($argvlen == 8){ + # ($submithost,$run,$idx,$batch_and_join,$stdout,$stderr,$jidfile,$pidfile) = @_; + #} elsif ($argvlen == 9){ + # ($submithost,$run,$idx,$batch_and_join,$stdout,$stderr,$jidfile,$pidfile,$prevjid) = @_; + #} + + # parse prevjid ######################## + #$prevjid =~ s/^\s+|\s+$//g; + #@prevjidarray = split(/\s+/,$prevjid); + $prevjidarraysize = scalar(@prevjidarray); + ######################################## + + # print STDERR "exec: $stdout\n"; + + # read pid from file, and draft exit script ################## + chomp ($pid=`tail -n 1 $pidfile`); + open (OUT, ">forceexitjob$pid.sh"); + + my $scriptheader="\#\!/bin/bash\n\#\$ -S /bin/sh\n# Both lines are needed to invoke base\n#the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n"; + $scriptheader .="uname -a\n\n"; + $scriptheader .="cd $___WORKING_DIR\n\n"; + + print OUT $scriptheader; + + print OUT "if $qsubwrapper_exit -submithost=$submithost -force-delete=1 -pidfile=$pidfile > forceexitjob$pid.out 2> forceexitjob$pid.err ; then + echo 'succeeded' +else + echo failed with exit status \$\? 
+ die=1 +fi +"; + print OUT "\n\n"; + + close (OUT); + # setting permissions of the script + chmod(oct(755),"forceexitjob$pid.sh"); + ############################################################## + # + # log in submit host ######################################### + # chomp(my $my_username = `whoami`); + # my $ssh = Net::OpenSSH::Compat::Perl->new($submithost, debug=>0); + + # $ssh->login("$my_username",`cat /home/$my_username/accpw`); + ############################################################## + + + # $qsubcmd="qsub $queue_flags -sync y -o /dev/null -e /dev/null forceexitjob$pid.sh > forceexitjob$pid.log 2>&1"; + # $ssh->cmd("cd $___WORKING_DIR && $qsubcmd"); + safesystem("./forceexitjob$pid.sh"); + + print STDERR "Executing $qsubcmd in $___WORKING_DIR\n"; + +} + + + + diff --git a/contrib/mert-sge-nosync/training/sge-nosync/run-decoder-sge-nosync.pl b/contrib/mert-sge-nosync/training/sge-nosync/run-decoder-sge-nosync.pl new file mode 100755 index 000000000..ab0c698af --- /dev/null +++ b/contrib/mert-sge-nosync/training/sge-nosync/run-decoder-sge-nosync.pl @@ -0,0 +1,271 @@ +#!/usr/bin/perl -w + +# $Id$ +# after filter-mode-given-input.pl, process the feature list + +# original code by Philipp Koehn +# changes by Ondrej Bojar +# adapted for hierarchical models by Phil Williams + +use strict; + +use FindBin qw($Bin); +use Getopt::Long; + + + +my $SCRIPTS_ROOTDIR; +if (defined($ENV{"SCRIPTS_ROOTDIR"})) { + $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"}; +} else { + $SCRIPTS_ROOTDIR = $Bin; + if ($SCRIPTS_ROOTDIR eq '') { + $SCRIPTS_ROOTDIR = dirname(__FILE__); + } + $SCRIPTS_ROOTDIR =~ s/\/training$//; + $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR; +} + + +##!# # moses.ini file uses FULL names for lambdas, while this training script +##!# # internally (and on the command line) uses ABBR names. 
+##!# my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w +##!# g=weight-generation lex=weight-lex I=weight-i); +##!# my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP; +##!# my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP; + + +my $verbose = 0; +my $usage = 0; # request for --help + + + + +##!# # consider phrases in input up to $MAX_LENGTH +##!# # in other words, all phrase-tables will be truncated at least to 10 words per +##!# # phrase. +##!# my $MAX_LENGTH = 10; + +# utilities +##!# my $ZCAT = "gzip -cd"; + +# get optional parameters +##!# my $opt_hierarchical = 0; +##!# my $binarizer = undef; +##!# my $opt_min_non_initial_rule_count = undef; +##!# my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats) + +my $___RANGES = undef; +my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on + # if undef work on all features + # (others are fixed to the starting values) +my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder + +# set 0 if input type is text, set 1 if input type is confusion network +my $___INPUTTYPE = 0; + +my $___DECODER = undef; # required, pathname to the decoder executable +my $___CONFIG = undef; # required, pathname to startup ini file + + +GetOptions( +##!# "activate-features=s" => \$___ACTIVATE_FEATURES, #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values) +##!# "range=s@" => \$___RANGES, +##!# "decoder-flags=s" => \$___DECODER_FLAGS, +##!# "inputtype=i" => \$___INPUTTYPE + "" +) or exit(1); + + + +# the ?? 
required parameters can be supplied on the command line directly +# or using the --options +if (scalar @ARGV == 2) { + # required parameters: options + $___DECODER = shift; + $___CONFIG = shift; +} + +if ($usage || !defined $___DECODER || !defined $___CONFIG) { + print STDERR "usage: $0 \$___DECODER \$___CONFIG(decoder.ini) +Options: + --activate-features=STRING ... comma-separated list of features to optimize, + others are fixed to the starting values + default: optimize all features + example: tm_0,tm_4,d_0 + --range=tm:0..1,-1..1 ... specify min and max value for some features + --range can be repeated as needed. + The order of the various --range specifications + is important only within a feature name. + E.g.: + --range=tm:0..1,-1..1 --range=tm:0..2 + is identical to: + --range=tm:0..1,-1..1,0..2 + but not to: + --range=tm:0..2 --range=tm:0..1,-1..1 + --decoder-flags=STRING ... extra parameters for the decoder + --inputtype=[0|1|2] ... Handle different input types: (0 for text, + 1 for confusion network, 2 for lattices, + default is 0) +"; + exit 1; +} + + + + +############################################################ +############################################################ +############################################################ + +# main + +# we run moses to check validity of moses.ini and to obtain all the feature +# names +my $featlist = get_featlist_from_moses($___CONFIG); +$featlist = insert_ranges_to_featlist($featlist, $___RANGES); + + +# Mark which features are disabled: +if (defined $___ACTIVATE_FEATURES) { + my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES; + my %cnt; + for(my $i=0; $i{"names"}}); $i++) { + my $name = $featlist->{"names"}->[$i]; + $cnt{$name} = 0 if !defined $cnt{$name}; + $featlist->{"enabled"}->[$i] = $enabled{$name."_".$cnt{$name}}; + $cnt{$name}++; + } +} else { + # all enabled + for(my $i=0; $i{"names"}}); $i++) { + $featlist->{"enabled"}->[$i] = 1; + } +} + +print STDERR "MERT starting values and 
ranges for random generation:\n";
+for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+  my $name = $featlist->{"names"}->[$i];
+  my $val = $featlist->{"values"}->[$i];
+  my $min = $featlist->{"mins"}->[$i];
+  my $max = $featlist->{"maxs"}->[$i];
+  my $enabled = $featlist->{"enabled"}->[$i];
+  printf STDERR " %5s = %7.3f", $name, $val;
+  if ($enabled) {
+    printf STDERR " (%5.2f .. %5.2f)\n", $min, $max;
+  } else {
+    print STDERR " --- inactive, not optimized ---\n";
+  }
+}
+
+
+
+
+
+# NOTE(review): the subs below read %ABBR2FULL, whose declaration appears only
+# in commented-out form ("##!#") earlier in this script -- under "use strict"
+# that should not compile; confirm and restore the declaration.
+
+sub get_featlist_from_moses {
+  # run moses with the given config file and return the list of features and
+  # their initial values
+  # Returns a hash ref {"names" => [...], "values" => [...]}; a cached
+  # ./features.list is reused when present.
+  my $configfn = shift;
+  my $featlistfn = "./features.list";
+  if (-e $featlistfn) {
+    print STDERR "Using cached features list: $featlistfn\n";
+  } else {
+    print STDERR "Asking moses for feature names and values from $___CONFIG\n";
+    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
+    print STDERR "$cmd\n"; #DEBUG
+    safesystem($cmd) or die "Failed to run moses with the config $configfn";
+  }
+
+  # read feature list
+  my @names = ();
+  my @startvalues = ();
+  open(INI,$featlistfn) or die "Can't read $featlistfn";
+  my $nr = 0;
+  my @errs = ();
+  while (<INI>) {
+    $nr++;
+    chomp;
+    /^(.+) (\S+) (\S+)$/ || die("invalid feature: $_");
+    my ($longname, $feature, $value) = ($1,$2,$3);
+    next if $value eq "sparse";
+    # accept exponents with a sign as well, e.g. 1e-05
+    push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
+      if $value !~ /^[+-]?[0-9.eE+-]+$/;
+    push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
+      if !defined $ABBR2FULL{$feature};
+    push @names, $feature;
+    push @startvalues, $value;
+  }
+  close INI;
+  if (scalar @errs) {
+    print STDERR join("", @errs);
+    exit 1;
+  }
+  return {"names"=>\@names, "values"=>\@startvalues};
+}
+
+
+# Attach a [min, max] range to every feature in $featlist.
+# $ranges is the array ref collected from repeated --range options (may be
+# undef); each entry looks like "tm:0..1,-1..1".  Features without a supplied
+# range get 0.0 .. 1.0.  Returns the same $featlist with "mins" and "maxs".
+sub insert_ranges_to_featlist {
+  my $featlist = shift;
+  my $ranges = shift;
+
+  $ranges = [] if !defined $ranges;
+
+  # first collect the ranges from options
+  my $niceranges;
+  foreach my $range (@$ranges) {
+    my $name = undef;
+    foreach my $namedpair (split /,/, $range) {
+      if ($namedpair =~ /^(.*?):/) {
+        $name = $1;
+        $namedpair =~ s/^.*?://;
+        die "Unrecognized name '$name' in --range=$range"
+          if !defined $ABBR2FULL{$name};
+      }
+      my ($min, $max) = split /\.\./, $namedpair;
+      # a pair without ".." leaves a bound undefined; report it as empty
+      $min = "" if !defined $min;
+      $max = "" if !defined $max;
+      die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/;
+      # validate $max itself (this check used to re-test $min)
+      die "Bad max '$max' in --range=$range" if $max !~ /^-?[0-9.]+$/;
+      die "No name given in --range=$range" if !defined $name;
+      push @{$niceranges->{$name}}, [$min, $max];
+    }
+  }
+
+  # now populate featlist
+  my $seen = undef;
+  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+    my $name = $featlist->{"names"}->[$i];
+    $seen->{$name} ++;
+    my $min = 0.0;
+    my $max = 1.0;
+    if (defined $niceranges->{$name}) {
+      # ranges are consumed in the order given, one per feature occurrence
+      my $minmax = shift @{$niceranges->{$name}};
+      ($min, $max) = @$minmax if defined $minmax;
+    }
+    $featlist->{"mins"}->[$i] = $min;
+    $featlist->{"maxs"}->[$i] = $max;
+  }
+  return $featlist;
+}
+
+# Run a command via system(), echoing it to STDERR.  Exits the script when the
+# command cannot be started or dies from a signal; otherwise returns true
+# iff the command's exit code is 0.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+    print STDERR "Failed to execute: @_\n $!\n";
+    exit(1);
+  }
+  elsif ($? & 127) {
+    # pass the command as an argument: a '%' in it must not be read as a
+    # printf conversion
+    printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
+      "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
+    exit(1);
+  }
+  else {
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
+
+
+
+
+
diff --git a/contrib/mert-sge-nosync/training/sge-nosync/zipextract-decoder-result.pl b/contrib/mert-sge-nosync/training/sge-nosync/zipextract-decoder-result.pl
new file mode 100755
index 000000000..defc4fda7
--- /dev/null
+++ b/contrib/mert-sge-nosync/training/sge-nosync/zipextract-decoder-result.pl
@@ -0,0 +1,832 @@
+#!/usr/bin/perl -w
+# $Id$
+# Usage:
+# mert-moses.pl
+# For other options see below or run 'mert-moses.pl --help'
+
+# Notes:
+# and should be raw text files, one sentence per line
+# can be a prefix, in which case the files are 0, 1, etc.
are used + +# Excerpts from revision history + +# Sept 2011 multi-threaded mert (Barry Haddow) +# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK) +# Jul 2011 simplifications (Ondrej Bojar) +# -- rely on moses' -show-weights instead of parsing moses.ini +# ... so moses is also run once *before* mert starts, checking +# the model to some extent +# -- got rid of the 'triples' mess; +# use --range to supply bounds for random starting values: +# --range tm:-3..3 --range lm:-3..3 +# 5 Aug 2009 Handling with different reference length policies (shortest, average, closest) for BLEU +# and case-sensistive/insensitive evaluation (Nicola Bertoldi) +# 5 Jun 2008 Forked previous version to support new mert implementation. +# 13 Feb 2007 Better handling of default values for lambda, now works with multiple +# models and lexicalized reordering +# 11 Oct 2006 Handle different input types through parameter --inputype=[0|1] +# (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi) +# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table) +# useful if binary phrase tables are used (Nicola Bertoldi) +# 28 Aug 2006 Use either closest or average or shortest (default) reference +# length as effective reference length +# Use either normalization or not (default) of texts (Nicola Bertoldi) +# 31 Jul 2006 move gzip run*.out to avoid failure wit restartings +# adding default paths +# 29 Jul 2006 run-filter, score-nbest and mert run on the queue (Nicola; Ondrej had to type it in again) +# 28 Jul 2006 attempt at foolproof usage, strong checking of input validity, merged the parallel and nonparallel version (Ondrej Bojar) +# 27 Jul 2006 adding the safesystem() function to handle with process failure +# 22 Jul 2006 fixed a bug about handling relative path of configuration file (Nicola Bertoldi) +# 21 Jul 2006 adapted for Moses-in-parallel (Nicola Bertoldi) +# 18 Jul 2006 adapted for Moses and cleaned up (PK) +# 21 Jan 2005 unified 
various versions, thorough cleanup (DWC) +# now indexing accumulated n-best list solely by feature vectors +# 14 Dec 2004 reimplemented find_threshold_points in C (NMD) +# 25 Oct 2004 Use either average or shortest (default) reference +# length as effective reference length (DWC) +# 13 Oct 2004 Use alternative decoders (DWC) +# Original version by Philipp Koehn + +use strict; +use FindBin qw($Bin); +use File::Basename; +use File::Path; +use File::Spec; +use Cwd; + +my $SCRIPTS_ROOTDIR = $Bin; +$SCRIPTS_ROOTDIR =~ s/\/training$//; +$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"}); + +## We preserve this bit of comments to keep the traditional weight ranges. +# "w" => [ [ 0.0, -1.0, 1.0 ] ], # word penalty +# "d" => [ [ 1.0, 0.0, 2.0 ] ], # lexicalized reordering model +# "lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model +# "g" => [ [ 1.0, 0.0, 2.0 ], # generation model +# [ 1.0, 0.0, 2.0 ] ], +# "tm" => [ [ 0.3, 0.0, 0.5 ], # translation model +# [ 0.2, 0.0, 0.5 ], +# [ 0.3, 0.0, 0.5 ], +# [ 0.2, 0.0, 0.5 ], +# [ 0.0,-1.0, 1.0 ] ], # ... last weight is phrase penalty +# "lex"=> [ [ 0.1, 0.0, 0.2 ] ], # global lexical model +# "I" => [ [ 0.0,-1.0, 1.0 ] ], # input lattice scores + + + +# moses.ini file uses FULL names for lambdas, while this training script +# internally (and on the command line) uses ABBR names. 
+my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w + g=weight-generation lex=weight-lex I=weight-i); +my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP; +my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP; + +my $minimum_required_change_in_weights = 0.00001; + # stop if no lambda changes more than this + +my $verbose = 0; +my $usage = 0; # request for --help + +# The working directory is left undefined here; when --working-dir is not given +# it is later set to the current directory (the old `pwd`/mert-work default is disabled) +#@@# my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work"); +my $___WORKING_DIR = undef; +my $___DEV_F = undef; # required, input text to decode +my $___DEV_E = undef; # required, basename of files with references +my $___DECODER = undef; # required, pathname to the decoder executable +my $___CONFIG = undef; # required, pathname to startup ini file +my $___N_BEST_LIST_SIZE = 100; +my $___LATTICE_SAMPLES = 0; +my $submithost = ""; +my $queue_flags = "-hard"; # extra parameters for parallelizer + # the -l ws0ssmt was relevant only to JHU 2006 workshop +my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial) +my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder +my $continue = 0; # should we try to continue from the last saved step? 
+my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert) +my $___FILTER_PHRASE_TABLE = 1; # filter phrase table +my $___PREDICTABLE_SEEDS = 0; +my $___START_WITH_HISTORIC_BESTS = 0; # use best settings from all previous iterations as starting points [Foster&Kuhn,2009] +my $___RANDOM_DIRECTIONS = 0; # search in random directions only +my $___NUM_RANDOM_DIRECTIONS = 0; # number of random directions, also works with default optimizer [Cer&al.,2008] +my $___PAIRWISE_RANKED_OPTIMIZER = 0; # use Hopkins&May[2011] +my $___PRO_STARTING_POINT = 0; # get a starting point from pairwise ranked optimizer +my $___RANDOM_RESTARTS = 20; +my $___HISTORIC_INTERPOLATION = 0; # interpolate optimize weights with previous iteration's weights [Hopkins&May,2011,5.4.3] +my $__THREADS = 0; +my $run = 0; + +# Parameter for effective reference length when computing BLEU score +# Default is to use shortest reference +# Use "--shortest" to use shortest reference length +# Use "--average" to use average reference length +# Use "--closest" to use closest reference length +# Only one between --shortest, --average and --closest can be set +# If more than one choice the defualt (--shortest) is used +my $___SHORTEST = 0; +my $___AVERAGE = 0; +my $___CLOSEST = 0; + +# Use "--nocase" to compute case-insensitive scores +my $___NOCASE = 0; + +# Use "--nonorm" to non normalize translation before computing scores +my $___NONORM = 0; + +# set 0 if input type is text, set 1 if input type is confusion network +my $___INPUTTYPE = 0; + + +my $mertdir = undef; # path to new mert directory +my $mertargs = undef; # args to pass through to mert & extractor +my $mertmertargs = undef; # args to pass through to mert only +my $extractorargs = undef; # args to pass through to extractor only +my $filtercmd = undef; # path to filter-model-given-input.pl +my $filterfile = undef; +my $qsubwrapper = undef; +my $qsubwrapper_exit = undef; +my $moses_parallel_cmd = undef; +my 
$old_sge = 0; # assume sge<6.0 +my $___CONFIG_ORIG = undef; # pathname to startup ini file before filtering +my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on + # if undef work on all features + # (others are fixed to the starting values) +my $___RANGES = undef; +my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loading data (default =-1) + # -1 means all previous, i.e. from iteration 1 + # 0 means no previous data, i.e. from actual iteration + # 1 means 1 previous data , i.e. from the actual iteration and from the previous one + # and so on +my $maximum_iterations = 25; + +##################### +my $processfeatlistcmd = undef; +my $processfeatlistargs = undef; +my $createconfigcmd = undef; +my $createconfigargs = undef; +my $decoderargs = undef; +##################### + +use Getopt::Long; +GetOptions( + "working-dir=s" => \$___WORKING_DIR, + "input=s" => \$___DEV_F, + "inputtype=i" => \$___INPUTTYPE, + "refs=s" => \$___DEV_E, + "decoder=s" => \$___DECODER, + "config=s" => \$___CONFIG, + "nbest=i" => \$___N_BEST_LIST_SIZE, + "lattice-samples=i" => \$___LATTICE_SAMPLES, + "submithost=s" => \$submithost, + "queue-flags=s" => \$queue_flags, + "jobs=i" => \$___JOBS, + "decoder-flags=s" => \$___DECODER_FLAGS, + "continue" => \$continue, + "skip-decoder" => \$skip_decoder, + "shortest" => \$___SHORTEST, + "average" => \$___AVERAGE, + "closest" => \$___CLOSEST, + "nocase" => \$___NOCASE, + "nonorm" => \$___NONORM, + "help" => \$usage, + "verbose" => \$verbose, + "mertdir=s" => \$mertdir, + "mertargs=s" => \$mertargs, + "extractorargs=s" => \$extractorargs, + "mertmertargs=s" => \$mertmertargs, + "rootdir=s" => \$SCRIPTS_ROOTDIR, + "filtercmd=s" => \$filtercmd, # allow to override the default location + "filterfile=s" => \$filterfile, # input to filtering script (useful for lattices/confnets) + "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location + "mosesparallelcmd=s" => 
\$moses_parallel_cmd, # allow to override the default location + "old-sge" => \$old_sge, #passed to moses-parallel + "filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # (dis)allow of phrase tables + "predictable-seeds" => \$___PREDICTABLE_SEEDS, # make random restarts deterministic + "historic-bests" => \$___START_WITH_HISTORIC_BESTS, # use best settings from all previous iterations as starting points + "random-directions" => \$___RANDOM_DIRECTIONS, # search only in random directions + "run=i" => \$run, + "number-of-random-directions=i" => \$___NUM_RANDOM_DIRECTIONS, # number of random directions + "random-restarts=i" => \$___RANDOM_RESTARTS, # number of random restarts + "activate-features=s" => \$___ACTIVATE_FEATURES, #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values) + "range=s@" => \$___RANGES, + "prev-aggregate-nbestlist=i" => \$prev_aggregate_nbl_size, #number of previous step to consider when loading data (default =-1, i.e. all previous) + "maximum-iterations=i" => \$maximum_iterations, + "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER, + "pro-starting-point" => \$___PRO_STARTING_POINT, + "historic-interpolation=f" => \$___HISTORIC_INTERPOLATION, + "threads=i" => \$__THREADS +) or exit(1); + +# the 4 required parameters can be supplied on the command line directly +# or using the --options +if (scalar @ARGV == 2) { + # required parameters: input_file references_basename decoder_executable +# $___DEV_F = shift; + $___DEV_E = shift; +# $___DECODER = shift; + $___CONFIG = shift; +} + +# if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER || !defined $___CONFIG) { +if ($usage || !defined $___CONFIG || !defined $___DEV_E ){ + print STDERR "usage: $0 reference decoder.ini +Options: + --working-dir=mert-dir ... where all the files are created + --nbest=100 ... how big nbestlist to generate + --lattice-samples ... 
how many lattice samples (Chatterjee & Cancedda, emnlp 2010) + --jobs=N ... set this to anything to run moses in parallel + --mosesparallelcmd=STR ... use a different script instead of moses-parallel + --queue-flags=STRING ... anything you with to pass to qsub, eg. + '-l ws06osssmt=true'. The default is: '-hard' + To reset the parameters, please use + --queue-flags=' ' + (i.e. a space between the quotes). + --decoder-flags=STRING ... extra parameters for the decoder + --continue ... continue from the last successful iteration + --skip-decoder ... skip the decoder run for the first time, + assuming that we got interrupted during + optimization + --shortest --average --closest + ... Use shortest/average/closest reference length + as effective reference length (mutually exclusive) + --nocase ... Do not preserve case information; i.e. + case-insensitive evaluation (default is false). + --nonorm ... Do not use text normalization (flag is not active, + i.e. text is NOT normalized) + --filtercmd=STRING ... path to filter-model-given-input.pl + --filterfile=STRING ... path to alternative to input-text for filtering + model. useful for lattice decoding + --rootdir=STRING ... where do helpers reside (if not given explicitly) + --mertdir=STRING ... path to new mert implementation + --mertargs=STRING ... extra args for both extractor and mert + --extractorargs=STRING ... extra args for extractor only + --mertmertargs=STRING ... extra args for mert only + --scorenbestcmd=STRING ... path to score-nbest.py + --old-sge ... passed to parallelizers, assume Grid Engine < 6.0 + --inputtype=[0|1|2] ... Handle different input types: (0 for text, + 1 for confusion network, 2 for lattices, + default is 0) + --no-filter-phrase-table ... disallow filtering of phrase tables + (useful if binary phrase tables are available) + --random-restarts=INT ... number of random restarts (default: 20) + --predictable-seeds ... 
provide predictable seeds to mert so that random + restarts are the same on every run + --range=tm:0..1,-1..1 ... specify min and max value for some features + --range can be repeated as needed. + The order of the various --range specifications + is important only within a feature name. + E.g.: + --range=tm:0..1,-1..1 --range=tm:0..2 + is identical to: + --range=tm:0..1,-1..1,0..2 + but not to: + --range=tm:0..2 --range=tm:0..1,-1..1 + --activate-features=STRING ... comma-separated list of features to optimize, + others are fixed to the starting values + default: optimize all features + example: tm_0,tm_4,d_0 + --prev-aggregate-nbestlist=INT ... number of previous step to consider when + loading data (default = $prev_aggregate_nbl_size) + -1 means all previous, i.e. from iteration 1 + 0 means no previous data, i.e. only the + current iteration + N means this and N previous iterations + + --maximum-iterations=ITERS ... Maximum number of iterations. Default: $maximum_iterations + --random-directions ... search only in random directions + --number-of-random-directions=int ... number of random directions + (also works with regular optimizer, default: 0) + --pairwise-ranked ... Use PRO for optimisation (Hopkins and May, emnlp 2011) + --pro-starting-point ... Use PRO to get a starting point for MERT + --threads=NUMBER ... Use multi-threaded mert (must be compiled in). + --historic-interpolation ... Interpolate optimized weights with prior iterations' weight + (parameter sets factor [0;1] given to current weights) +"; + exit 1; +} + + +# Check validity of input parameters and set defaults if needed + +print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n"; + +# path of script for filtering phrase tables and running the decoder +$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd; + +if ( ! -x $filtercmd && ! 
$___FILTER_PHRASE_TABLE) { + print STDERR "Filtering command not found: $filtercmd.\n"; + print STDERR "Use --filtercmd=PATH to specify a valid one or --no-filter-phrase-table\n"; + exit 1; +} + +# $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper; +$qsubwrapper = "$SCRIPTS_ROOTDIR/generic/qsub-wrapper-sge-nosync.pl" if !defined $qsubwrapper; + +$qsubwrapper_exit = "$SCRIPTS_ROOTDIR/generic/qsub-wrapper-exit-sge-nosync.pl" if !defined $qsubwrapper_exit; + +# $moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl" +# if !defined $moses_parallel_cmd; +$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel-sge-nosync.pl" + if !defined $moses_parallel_cmd; + +if (!defined $mertdir) { + $mertdir = "$SCRIPTS_ROOTDIR/../mert"; + print STDERR "Assuming --mertdir=$mertdir\n"; +} + +my $mert_extract_cmd = "$mertdir/extractor"; +my $mert_mert_cmd = "$mertdir/mert"; +my $mert_pro_cmd = "$mertdir/pro"; + +die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd; +die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd; +die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd; + +my $pro_optimizer = "$mertdir/megam_i686.opt"; # or set to your installation +if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) { + print "did not find $pro_optimizer, installing it in $mertdir\n"; + `cd $mertdir; wget http://www.cs.utah.edu/~hal/megam/megam_i686.opt.gz;`; + `gunzip $pro_optimizer.gz`; + `chmod +x $pro_optimizer`; + die("ERROR: Installation of megam_i686.opt failed! 
Install by hand from http://www.cs.utah.edu/~hal/megam/") unless -x $pro_optimizer; +} + +$mertargs = "" if !defined $mertargs; + +my $scconfig = undef; +if ($mertargs =~ /\-\-scconfig\s+(.+?)(\s|$)/){ + $scconfig=$1; + $scconfig =~ s/\,/ /g; + $mertargs =~ s/\-\-scconfig\s+(.+?)(\s|$)//; +} + +# handling reference lengh strategy +if (($___CLOSEST + $___AVERAGE + $___SHORTEST) > 1){ + die "You can specify just ONE reference length strategy (closest or shortest or average) not both\n"; +} + +if ($___SHORTEST){ + $scconfig .= " reflen:shortest"; +}elsif ($___AVERAGE){ + $scconfig .= " reflen:average"; +}elsif ($___CLOSEST){ + $scconfig .= " reflen:closest"; +} + +# handling case-insensitive flag +if ($___NOCASE) { + $scconfig .= " case:false"; +}else{ + $scconfig .= " case:true"; +} +$scconfig =~ s/^\s+//; +$scconfig =~ s/\s+$//; +$scconfig =~ s/\s+/,/g; + +$scconfig = "--scconfig $scconfig" if ($scconfig); + +my $mert_extract_args=$mertargs; +$mert_extract_args .=" $scconfig"; +$mert_extract_args .=" $extractorargs"; + +$mertmertargs = "" if !defined $mertmertargs; + +my $mert_mert_args="$mertargs $mertmertargs"; +$mert_mert_args =~ s/\-+(binary|b)\b//; +$mert_mert_args .=" $scconfig"; +if ($___ACTIVATE_FEATURES){ $mert_mert_args .=" -o \"$___ACTIVATE_FEATURES\""; } + +my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd); +die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd; +die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd; +die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper; +#@@# die "Not executable: $___DECODER" if ! -x $___DECODER; + +#@@# my $input_abs = ensure_full_path($___DEV_F); +#@@# die "File not found: $___DEV_F (interpreted as $input_abs)." +#@@# if ! -e $input_abs; +#@@# $___DEV_F = $input_abs; + +# Option to pass to qsubwrapper and moses-parallel +my $pass_old_sge = $old_sge ? 
"-old-sge" : ""; + +#@@# my $decoder_abs = ensure_full_path($___DECODER); +#@@# die "File not executable: $___DECODER (interpreted as $decoder_abs)." +#@@# if ! -x $decoder_abs; +#@@# $___DECODER = $decoder_abs; + +my $ref_abs = ensure_full_path($___DEV_E); +# check if English dev set (reference translations) exist and store a list of all references +my @references; +if (-e $ref_abs) { + push @references, $ref_abs; +} +else { + # if multiple file, get a full list of the files + my $part = 0; + if (! -e $ref_abs."0" && -e $ref_abs.".ref0") { + $ref_abs .= ".ref"; + } + while (-e $ref_abs.$part) { + push @references, $ref_abs.$part; + $part++; + } + die("Reference translations not found: $___DEV_E (interpreted as $ref_abs)") unless $part; +} + +my $config_abs = ensure_full_path($___CONFIG); +die "File not found: $___CONFIG (interpreted as $config_abs)." + if ! -e $config_abs; +$___CONFIG = $config_abs; + +# moses should use our config +if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) / +|| $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) / +|| $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(generation-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) / +) { + die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files."; +} + +# as weights are normalized in the next steps (by cmert) +# normalize initial LAMBDAs, too +my $need_to_normalize = 1; + +#store current directory and create the working directory (if needed) +my $cwd = `pawd 2>/dev/null`; +if(!$cwd){$cwd = `pwd`;} +chomp($cwd); + +$___WORKING_DIR = $cwd if (!defined $___WORKING_DIR); +chomp $___WORKING_DIR; + +print STDERR "working dir is $___WORKING_DIR\n"; +#@@# mkpath($___WORKING_DIR); + +{ +# open local scope + +#chdir to the working directory 
# Everything below runs inside the working directory.
chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR";

# fixed file names
my $mert_outfile = "mert.out";
my $mert_logfile = "mert.log";
my $weights_in_file = "init.opt";
my $weights_out_file = "weights.txt";

# set start run
my $start_run = 1;
my $bestpoint = undef;
my $devbleu = undef;
my $sparse_weights_file = undef;
my $jobid = -1;

my $prev_feature_file = undef;
my $prev_score_file = undef;
my $prev_init_file = undef;


#########################
# set jobid to trace different jobs
my $prevjid = undef;


# NOTE(review): $run is not declared in this section (the declaration below is
# commented out); presumably it is set earlier in the file -- confirm.
#### my $run=$start_run-1;

my $oldallsorted = undef;
my $allsorted = undef;

my $nbest_file=undef;
my $lsamp_file=undef; #Lattice samples
my $orig_nbest_file=undef; # replaced if lattice sampling
my $cmd=undef;


  # compress the n-best list of the current run
  $nbest_file="run$run.best$___N_BEST_LIST_SIZE.out";
  safesystem("gzip -f $nbest_file") or die "Failed to gzip run*out";
  $nbest_file = $nbest_file.".gz";


  # extract score statistics and features from the nbest lists
  print STDERR "Scoring the nbestlist.\n";

  my $base_feature_file = "features.dat";
  my $base_score_file = "scores.dat";
  my $feature_file = "run$run.${base_feature_file}";
  my $score_file = "run$run.${base_score_file}";

  # wrap the extractor call in a shell script, then submit (or run) that script
  $cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r ".join(",", @references)." -n $nbest_file";
  $cmd = create_extractor_script($cmd, $___WORKING_DIR);

  submit_or_exec($cmd,"extract.out","extract.err","extract.id");



} # end of local scope

# Read the optimized weights produced by the optimizer.
#   $outfile        - weight file with "F<index> <value>" lines and
#                     "<name_with_underscore> <value>" lines for sparse features
#   $logfile        - log file containing a "Best point: ... => score" line
#   $weight_count   - number of dense weights to return
#   $sparse_weights - hash reference that receives the sparse weights
# Returns ($bestpoint, $devbleu): the space-separated weights and the dev
# score ("unknown" when the weights are read from $outfile).
sub get_weights_from_mert {
  my ($outfile, $logfile, $weight_count, $sparse_weights) = @_;
  my ($bestpoint, $devbleu);

  if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)) {
    open(my $in, '<', $outfile) or die "Can't open $outfile: $!";
    my @WEIGHT = (0) x $weight_count;
    my $sum = 0;
    while (my $line = <$in>) {
      # regular features
      if ($line =~ /^F(\d+) ([\-\.\de]+)/) {
        $WEIGHT[$1] = $2;
        $sum += abs($2);
      }
      # sparse features
      elsif ($line =~ /^(.+_.+) ([\-\.\de]+)/) {
        $$sparse_weights{$1} = $2;
      }
    }
    close $in;

    # The weights are normalized by the sum of absolute dense weights; fail
    # with a readable message instead of "Illegal division by zero".
    die "Cannot normalize weights from $outfile: no non-zero dense weight found"
      if $sum == 0;

    $devbleu = "unknown";
    foreach my $weight (@WEIGHT) { $weight /= $sum; }
    foreach my $name (keys %{$sparse_weights}) { $$sparse_weights{$name} /= $sum; }
    $bestpoint = join(" ", @WEIGHT);
  }
  else {
    open(my $in, '<', $logfile) or die "Can't open $logfile: $!";
    while (my $line = <$in>) {
      if ($line =~ /Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
        $bestpoint = $1;
        $devbleu = $2;
        last;
      }
    }
    close $in;
  }
  return ($bestpoint, $devbleu);
}



# Die unless the score labels found in the n-best list match, in order, the
# feature names stored under the "names" key of $featlist.
sub sanity_check_order_of_lambdas {
  my $featlist = shift;
  my $filename_or_stream = shift;

  my @expected_lambdas = @{$featlist->{"names"}};
  my @got = get_order_of_scores_from_nbestlist($filename_or_stream);
  die "Mismatched lambdas. Decoder returned @got, we expected @expected_lambdas"
    if "@got" ne "@expected_lambdas";
}

# Run moses with the given config file (unless ./features.list already exists)
# and return the list of features and their initial values as
# {"names" => [...], "values" => [...]}. Features whose value is "sparse" are
# skipped. Exits with status 1 if any line has a bad value or unknown feature.
sub get_featlist_from_moses {
  my $configfn = shift;
  my $featlistfn = "./features.list";
  if (-e $featlistfn) {
    print STDERR "Using cached features list: $featlistfn\n";
  } else {
    print STDERR "Asking moses for feature names and values from $___CONFIG\n";
    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
    safesystem($cmd) or die "Failed to run moses with the config $configfn";
  }

  # read feature list
  my @names = ();
  my @startvalues = ();
  my @errs = ();
  my $nr = 0;
  open(my $ini, '<', $featlistfn) or die "Can't read $featlistfn: $!";
  while (my $line = <$ini>) {
    $nr++;
    chomp $line;
    $line =~ /^(.+) (\S+) (\S+)$/ || die("invalid feature: $line");
    my ($longname, $feature, $value) = ($1, $2, $3);
    next if $value eq "sparse";
    # '+' and '-' are allowed inside the number so that values written in
    # scientific notation with a signed exponent (e.g. 1e-05) are accepted.
    push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
      if $value !~ /^[+-]?[0-9.eE+\-]+$/;
    push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
      if !defined $ABBR2FULL{$feature};
    push @names, $feature;
    push @startvalues, $value;
  }
  close $ini;
  if (scalar @errs) {
    print STDERR join("", @errs);
    exit 1;
  }
  return {"names"=>\@names, "values"=>\@startvalues};
}

# Read the first line of an n-best list and interpret its
# "||| label: num num num label2: num |||" column.
# Returns the score labels in order, one entry per dense score; scores that
# follow a sparse feature name (a token containing "_" before the colon) are
# ignored.
sub get_order_of_scores_from_nbestlist {
  my $fname_or_source = shift;
  # print STDERR "Peeking at the beginning of nbestlist to get order of scores: $fname_or_source\n";

  # NOTE(review): two-argument open is kept on purpose; the parameter name
  # suggests callers may pass a pipe expression ("cmd |") -- confirm.
  open(my $in, $fname_or_source)
    or die "Failed to get order of scores from nbestlist '$fname_or_source'";
  my $line = <$in>;
  close $in;
  die "Line empty in nbestlist '$fname_or_source'" if !defined $line;

  # third |||-separated column holds the scores
  my $scores = (split /\|\|\|/, $line)[2];
  $scores = "" if !defined $scores;
  $scores =~ s/^\s*|\s*$//g;
  die "No scores in line: $line" if $scores eq "";

  my @order = ();
  my $label = undef;
  my $sparse = 0; # we ignore sparse features here
  foreach my $tok (split /\s+/, $scores) {
    if ($tok =~ /.+_.+:/) {
      $sparse = 1;
    } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) {
      $label = $1;
    } elsif ($tok =~ /^-?[-0-9.e]+$/) {
      if (!$sparse) {
        # a score found, remember it
        die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
          if !defined $label;
        push @order, $label;
      }
      $sparse = 0;
    } else {
      die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
    }
  }
  print STDERR "The decoder returns the scores in this order: @order\n";
  return @order;
}


# Run a command through system(), logging it to STDERR.
# Exits the whole script (status 1) if the command could not be started or was
# killed by a signal. Otherwise returns true when the exit code is 0 and false
# when it is non-zero.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  my $status = $?;
  if ($status == -1) {
    print STDERR "Failed to execute: @_\n  $!\n";
    exit(1);
  }
  elsif ($status & 127) {
    # The command goes through %s: it must never be part of the format
    # string, otherwise a '%' inside the command corrupts the message.
    printf STDERR "Execution of: %s\n  died with signal %d, %s coredump\n",
      "@_", ($status & 127), ($status & 128) ? 'with' : 'without';
    exit(1);
  }
  else {
    my $exitcode = $status >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

# Turn a path into an absolute, textually normalized one.
# The first "/nfsmnt" component is stripped; relative paths are prefixed with
# the current directory (from `pawd`, falling back to `pwd`), then "/./",
# repeated slashes, "dir/.." pairs and trailing slashes are removed.
# Paths that are already absolute are returned right after the "/nfsmnt" strip.
sub ensure_full_path {
  my $PATH = shift;
  $PATH =~ s{/nfsmnt}{};
  return $PATH if $PATH =~ m{^/};

  my $dir = `pawd 2>/dev/null`;
  $dir = `pwd` if !$dir;
  chomp($dir);

  $PATH = $dir."/".$PATH;
  $PATH =~ s{[\r\n]}{}g;
  $PATH =~ s{/\./}{/}g;
  $PATH =~ s{/+}{/}g;

  # collapse "dir/../"; bounded so a path that never stabilizes cannot loop forever
  my $sanity = 0;
  while ($PATH =~ m{/\.\./} && $sanity++ < 10) {
    $PATH =~ s{/+}{/}g;
    $PATH =~ s{/[^/]+/\.\./}{/}g;
  }
  $PATH =~ s{/[^/]+/\.\.$}{};
  $PATH =~ s{/+$}{};
  $PATH =~ s{/nfsmnt}{};
  return $PATH;
}

# Run a command locally or submit it through $qsubwrapper.
#   3 arguments ($cmd, $stdout, $stderr):          exec without submit
#   4 arguments (... , $jidfile):                  submit a new job
#   5 arguments (... , $jidfile, $prevjid):        submit, passing -prevjid
# Submission only happens when $___JOBS is defined and positive; otherwise the
# command is executed directly with stdout/stderr redirected to the given files.
# Dies on an unsupported argument count or when the command fails.
sub submit_or_exec {
  my $argvlen = @_;
  die "ERROR: submit_or_exec expects 3, 4 or 5 arguments, got $argvlen"
    if $argvlen < 3 || $argvlen > 5;
  my ($cmd, $stdout, $stderr, $jidfile, $prevjid) = @_;

  print STDERR "exec: $cmd\n";
  if (defined $___JOBS && $___JOBS > 0 && $argvlen >= 4) {
    my $submit = "$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameters=\"$queue_flags\" -stdout=$stdout -stderr=$stderr -jidfile=$jidfile";
    $submit .= " -prevjid=$prevjid" if $argvlen == 5;
    safesystem($submit)
      or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)";
  } else {
    safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'.";
  }
}

# Write exitjob<pid>.sh, a script that calls $qsubwrapper_exit, and submit it.
#   2 arguments ($stdout, $stderr):                        no submission
#   4 arguments (... , $jidfile, $pidfile):                submit a new job
#   5 arguments (... , $jidfile, $pidfile, $prevjid):      submit with
#       -hold_jid on the whitespace-separated job ids in $prevjid
# <pid> is the last line of $pidfile. When nothing is submitted ($___JOBS not
# positive, or only 2 arguments) the $stdout and $stderr files are removed.
sub exit_submit {
  my $argvlen = @_;
  die "ERROR: exit_submit expects 2, 4 or 5 arguments, got $argvlen"
    if $argvlen != 2 && $argvlen != 4 && $argvlen != 5;
  my ($stdout, $stderr, $jidfile, $pidfile, $prevjid) = @_;

  # parse prevjid ########################
  my @prevjidarray = ();
  if (defined $prevjid) {
    $prevjid =~ s/^\s+|\s+$//g;
    @prevjidarray = split(/\s+/, $prevjid);
  }
  my $prevjidarraysize = scalar(@prevjidarray);
  ########################################

  # read pid from file, and draft exit script ##################
  # Without a pidfile `tail` would read from stdin, so it is only run when a
  # pidfile was supplied.
  my $pid = "";
  if (defined $pidfile) {
    $pid = `tail -n 1 $pidfile`;
    $pid = "" if !defined $pid;
    chomp($pid);
  }
  my $jidfile_arg = defined $jidfile ? $jidfile : "";
  my $pidfile_arg = defined $pidfile ? $pidfile : "";

  my $script = "exitjob$pid.sh";
  open(my $out, '>', $script) or die "ERROR: Can't write $script: $!";

  my $scriptheader="\#\!/bin/bash\n\#\$ -S /bin/sh\n# Both lines are needed to invoke base\n#the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n";
  $scriptheader .="uname -a\n\n";
  $scriptheader .="cd $___WORKING_DIR\n\n";

  print $out $scriptheader;

  print $out "if $qsubwrapper_exit -submithost=$submithost -stdout=$stdout -stderr=$stderr -jidfile=$jidfile_arg -pidfile=$pidfile_arg > exitjob$pid.out 2> exitjob$pid.err ; then
    echo 'succeeded'
else
    echo failed with exit status \$\?
    die=1
fi
";
  print $out "\n\n";

  close($out) or die "ERROR: Can't finish writing $script: $!";
  # setting permissions of the script
  chmod(oct(755), $script);
  ##############################################################

  my $use_queue = defined $___JOBS && $___JOBS > 0;
  if ($use_queue && $argvlen == 5) {
    my $hj = "";
    if (defined $prevjid && $prevjidarraysize == 1 && $prevjid != -1) {
      $hj = "-hold_jid $prevjid";
    } elsif (defined $prevjid && $prevjidarraysize > 1) {
      $hj = "-hold_jid " . join(" -hold_jid ", @prevjidarray);
    }
    my $qsubcmd = "qsub $queue_flags -V $hj exitjob$pid.sh > exitjob$pid.log 2>&1";
    safesystem($qsubcmd) or die "ERROR: Failed to exit-submit $pid (via $qsubwrapper_exit)";
  } elsif ($use_queue && $argvlen == 4) {
    my $qsubcmd = "qsub $queue_flags -V exitjob$pid.sh > exitjob$pid.log 2>&1";
    safesystem($qsubcmd) or die "ERROR: Failed to exit-submit $pid (via $qsubwrapper_exit)";
  } else {
    safesystem("rm $stdout") or die "ERROR: Failed to remove '$stdout'.";
    safesystem("rm $stderr") or die "ERROR: Failed to remove '$stderr'.";
  }
}


# Write "<outdir>/extractor.sh", a bash script that changes into $outdir and
# runs $cmd, make it executable and return its path.
# (Declared without a prototype: the sub takes two arguments, so the former
# empty "()" prototype was wrong.)
sub create_extractor_script
{
  my ($cmd, $outdir) = @_;
  require File::Spec;
  my $script_path = File::Spec->catfile($outdir, "extractor.sh");

  open my $out, '>', $script_path
      or die "Couldn't open $script_path for writing: $!\n";
  print $out "#!/bin/bash\n";
  print $out "cd $outdir\n";
  print $out "$cmd\n";
  close($out)
      or die "Couldn't finish writing $script_path: $!\n";

  # list form: no shell involved, and a failure is reported instead of ignored
  system('chmod', '+x', $script_path) == 0
      or die "Couldn't make $script_path executable\n";

  return $script_path;
}