better support of grid engine cluster

This commit is contained in:
Philipp Koehn 2015-07-29 11:03:24 -04:00
parent b29166e2fe
commit 836ca8212a
4 changed files with 125 additions and 39 deletions

View File

@ -1098,7 +1098,7 @@ sub draw_agenda_graph {
print DOT "}\n";
close(DOT);
my $graph_file = &steps_file("graph.$VERSION",$VERSION);
`dot -Tps $graph_file.dot >$graph_file.ps`;
`dot -Tps $graph_file.dot >$graph_file.ps 2>/dev/null`;
`convert -alpha off $graph_file.ps $graph_file.png`;
}
@ -1992,6 +1992,12 @@ sub define_tuning_tune {
my $tune_inputtype = &backoff_and_get("TUNING:inputtype");
my $jobs = &backoff_and_get("TUNING:jobs");
my $decoder = &check_backoff_and_get("TUNING:decoder");
my $cache_model = &backoff_and_get("GENERAL:cache-model");
if (defined($cache_model) && !($jobs && $jobs>1 && $CLUSTER)) {
$cmd .= "MOSES_INI=`$scripts/ems/support/cache-model.perl $config $cache_model`\n";
$config = "\$MOSES_INI";
}
my $decoder_settings = &backoff_and_get("TUNING:decoder-settings");
$decoder_settings = "" unless $decoder_settings;
@ -2000,7 +2006,7 @@ sub define_tuning_tune {
my $tuning_settings = &backoff_and_get("TUNING:tuning-settings");
$tuning_settings = "" unless $tuning_settings;
$cmd = "$tuning_script $input $reference $decoder $config --nbest $nbest_size --working-dir $tmp_dir --decoder-flags \"$decoder_settings\" --rootdir $scripts $tuning_settings --no-filter-phrase-table";
$cmd .= "$tuning_script $input $reference $decoder $config --nbest $nbest_size --working-dir $tmp_dir --decoder-flags \"$decoder_settings\" --rootdir $scripts $tuning_settings --no-filter-phrase-table";
$cmd .= " --lambdas \"$lambda\"" if $lambda;
$cmd .= " --continue" if $tune_continue;
$cmd .= " --skip-decoder" if $skip_decoder;
@ -2009,6 +2015,7 @@ sub define_tuning_tune {
my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
$cmd .= " --jobs $jobs" if $CLUSTER && $jobs && $jobs>1;
$cmd .= " --cache-model $cache_model" if $cache_model && $CLUSTER && $jobs && $jobs>1;
my $tuning_dir = $tuned_config;
$tuning_dir =~ s/\/[^\/]+$//;
$cmd .= "\nmkdir -p $tuning_dir";
@ -3190,12 +3197,6 @@ sub define_evaluation_decode {
my $word_alignment = &backoff_and_get("TRAINING:include-word-alignment-in-rules");
my $post_decoding_transliteration = &get("TRAINING:post-decoding-transliteration");
# If Transliteration Module is to be used as post-decoding step ...
if (defined($post_decoding_transliteration) && $post_decoding_transliteration eq "yes"){
$settings .= " -output-unknowns $system_output.oov";
}
# specify additional output for analysis
if (defined($report_precision_by_coverage) && $report_precision_by_coverage eq "yes") {
$settings .= " -alignment-output-file $system_output.wa";
@ -3224,8 +3225,16 @@ sub define_evaluation_decode {
$input = $input_with_tags;
}
# create command
# cache model on local disk
my $cmd;
my $cache_model = &backoff_and_get("GENERAL:cache-model");
if (defined($cache_model) && !($jobs && $jobs>1 && $CLUSTER)) {
my $scripts = &check_and_get("GENERAL:moses-script-dir");
$cmd = "MOSES_INI=`$scripts/ems/support/cache-model.perl $config $cache_model`\n";
$config = "\$MOSES_INI";
}
# create command
my $nbest_size;
$nbest_size = $nbest if $nbest;
$nbest_size =~ s/[^\d]//g if $nbest;
@ -3241,6 +3250,7 @@ sub define_evaluation_decode {
$cmd .= " -queue-parameters \"$qsub_args\"" if ($CLUSTER && $qsub_args);
$cmd .= " -decoder $decoder";
$cmd .= " -config $config";
$cmd .= " -cache-model $cache_model" if defined($cache_model);
$cmd .= " -input-file $input";
$cmd .= " --jobs $jobs";
$cmd .= " -decoder-parameters \"$settings\" > $system_output";
@ -3251,6 +3261,10 @@ sub define_evaluation_decode {
$cmd .= " -n-best-list $system_output.best$nbest_size $nbest" if $nbest;
}
# If Transliteration Module is to be used as post-decoding step ...
$cmd .= " -output-unknowns $system_output.oov"
if defined($post_decoding_transliteration) && $post_decoding_transliteration eq "yes";
&create_step($step_id,$cmd);
}

View File

@ -20,6 +20,7 @@
use warnings;
use strict;
use FindBin qw($RealBin);
#######################
#Customizable parameters
@ -33,7 +34,7 @@ my $queueparameters="";
# etc.
# look for the correct pwdcmd
my $pwdcmd = getPwdCmd();
my $pwdcmd = &getPwdCmd();
my $workingdir = `$pwdcmd`; chomp $workingdir;
my $tmpdir="$workingdir/tmp$$";
@ -57,6 +58,7 @@ my $version=undef;
my $help=0;
my $dbg=0;
my $jobs=4;
my $cache_model=undef;
my $mosescmd="$ENV{MOSESBIN}/moses"; #decoder in use
my $inputlist=undef;
my $inputfile=undef;
@ -67,6 +69,9 @@ my $nbestfile=undef;
my $oldnbestfile=undef;
my $oldnbest=undef;
my $nbestflag=0;
my $oovlist=undef;
my $oovfile=undef;
my $oovflag=0;
my @wordgraphlist=();
my $wordgraphlist=undef;
my $wordgraphfile=undef;
@ -94,6 +99,7 @@ sub init(){
'help'=>\$help,
'debug'=>\$dbg,
'jobs=i'=>\$jobs,
'cache-model=s'=>\$cache_model,
'decoder=s'=> \$mosescmd,
'robust=i' => \$robust,
'decoder-parameters=s'=> \$mosesparameters,
@ -105,6 +111,7 @@ sub init(){
'n-best-size=i'=> \$oldnbest,
'output-search-graph|osg=s'=> \$searchgraphlist,
'output-word-graph|owg=s'=> \$wordgraphlist,
'output-unknowns=s'=> \$oovlist,
'alignment-output-file=s'=> \$alifile,
'translation-details|T=s'=> \$detailsfile,
'qsub-prefix=s'=> \$qsubname,
@ -120,6 +127,8 @@ sub init(){
getWordGraphParameters();
getOOVParameters();
getLogParameters();
#print_parameters();
@ -130,7 +139,7 @@ print STDERR "wordgraphflag:$wordgraphflag\n";
chomp($inputfile=`basename $inputlist`) if defined($inputlist);
$mosesparameters.="@ARGV -config $cfgfile -inputtype $inputtype";
$mosesparameters.="@ARGV -inputtype $inputtype";
}
@ -162,6 +171,7 @@ sub usage(){
print STDERR "* -decoder <file> Moses decoder to use\n";
print STDERR "* -i|inputfile|input-file <file> the input text to translate\n";
print STDERR "* -jobs <N> number of required jobs\n";
print STDERR " -cache-model <dir> local directory for copying model files\n";
print STDERR " -logfile <file> file where storing log files of all jobs\n";
print STDERR " -qsub-prefix <string> name for sumbitte jobs\n";
print STDERR " -queue-parameters <string> specific requirements for queue\n";
@ -199,9 +209,11 @@ sub print_parameters(){
print STDERR "Configuration file: $cfgfile\n";
print STDERR "Decoder in use: $mosescmd\n";
print STDERR "Number of jobs:$jobs\n";
print STDERR "Model cache directory: $cache_model\n" if ($cache_model);
print STDERR "Nbest list: $nbestlist\n" if ($nbestflag);
print STDERR "Output Search Graph: $searchgraphlist\n" if ($searchgraphflag);
print STDERR "Output Word Graph: $wordgraphlist\n" if ($wordgraphflag);
print STDERR "Output OOV: $oovlist\n" if ($oovflag);
print STDERR "LogFile:$logfile\n" if ($logflag);
print STDERR "Qsub name: $qsubname\n";
print STDERR "Queue parameters: $queueparameters\n";
@ -209,7 +221,7 @@ sub print_parameters(){
print STDERR "Inputtype: confusion network\n" if $inputtype == 1;
print STDERR "Inputtype: lattices\n" if $inputtype == 2;
print STDERR "parameters directly passed to Moses: $mosesparameters\n";
print STDERR "parameters directly passed to Moses: $mosesparameters -config $cfgfile\n";
}
#get parameters for log file
@ -310,6 +322,19 @@ sub getWordGraphParameters(){
}
}
# Derive the OOV output settings from the -output-unknowns option.
#
# Reads the file-level $oovlist (set only from the command line). When it
# holds a true value this sets:
#   $oovfile - base name used for the per-split OOV files
#              ("oov" when the requested destination is "-")
#   $oovflag - 1, marking OOV output as requested
# When $oovlist is unset or empty, nothing is changed.
sub getOOVParameters {
    # only on command line
    return unless $oovlist;

    if ($oovlist eq "-") {
        $oovfile = "oov";
    }
    else {
        # Take the base name in-process instead of running `basename $oovlist`:
        # the shell form word-splits the path and interprets metacharacters,
        # so a path containing a space or quote gave a wrong or empty name.
        require File::Basename;
        $oovfile = File::Basename::basename($oovlist);
    }
    $oovflag = 1;
}
#######################
#Script starts here
@ -436,7 +461,7 @@ grep(s/.+(\-\S+)$/$1/e,@idxlist);
safesystem("mkdir -p $tmpdir") or die;
preparing_script();
&preparing_script();
#launching process through the queue
my @sgepids =();
@ -483,18 +508,18 @@ while ($robust && scalar @idx_todo) {
if ($old_sge) {
# we need to implement our own waiting script
my $syncscript = "${jobscript}.sync_workaround_script.sh";
safesystem("echo 'date' > $syncscript") or kill_all_and_quit();
safesystem("echo 'date' > $syncscript") or &kill_all_and_quit();
my $pwd = `$pwdcmd`; chomp $pwd;
my $checkpointfile = "${jobscript}.sync_workaround_checkpoint";
# delete previous checkpoint, if left from previous runs
safesystem("\\rm -f $checkpointfile") or kill_all_and_quit();
safesystem("\\rm -f $checkpointfile") or &kill_all_and_quit();
# start the 'hold' job, i.e. the job that will wait
$cmd="qsub -cwd $queueparameters $hj -o $checkpointfile -e /dev/null -N $qsubname.W $syncscript 2> $qsubname.W.log";
safesystem($cmd) or kill_all_and_quit();
safesystem($cmd) or &kill_all_and_quit();
# and wait for checkpoint file to appear
my $nr=0;
@ -504,15 +529,15 @@ while ($robust && scalar @idx_todo) {
print STDERR "w" if $nr % 3 == 0;
}
print STDERR "End of waiting.\n";
safesystem("\\rm -f $checkpointfile $syncscript") or kill_all_and_quit();
safesystem("\\rm -f $checkpointfile $syncscript") or &kill_all_and_quit();
my $failure = 1;
my $nr = 0;
$nr = 0;
while ($nr < 60 && $failure) {
$nr ++;
$failure=&check_exit_status();
if (!$failure) {
$failure = check_translation_old_sge();
$failure = &check_translation_old_sge();
}
last if !$failure;
print STDERR "Extra wait ($nr) for possibly unfinished processes.\n";
@ -521,55 +546,65 @@ while ($robust && scalar @idx_todo) {
} else {
# use the -sync option for qsub
$cmd="qsub $queueparameters -sync y $hj -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls > $qsubname.W.log";
safesystem($cmd) or kill_all_and_quit();
safesystem($cmd) or &kill_all_and_quit();
$failure=&check_exit_status();
}
kill_all_and_quit() if $failure && !$robust;
&kill_all_and_quit() if $failure && !$robust;
# check if some translations failed
my @idx_still_todo = check_translation();
my @idx_still_todo = &check_translation();
if ($robust) {
# if robust, redo crashed jobs
if ((scalar @idx_still_todo) == (scalar @idxlist)) {
# ... but not if all crashed
print STDERR "everything crashed, not trying to resubmit jobs\n";
$robust = 0;
kill_all_and_quit();
&kill_all_and_quit();
}
@idx_todo = @idx_still_todo;
}
else {
if (scalar (@idx_still_todo)) {
print STDERR "some jobs crashed: ".join(" ",@idx_still_todo)."\n";
kill_all_and_quit();
&kill_all_and_quit();
}
}
}
#concatenating translations and removing temporary files
concatenate_1best();
concatenate_logs() if $logflag;
concatenate_ali() if defined $alifile;
concatenate_details() if defined $detailsfile;
concatenate_nbest() if $nbestflag;
&concatenate_1best();
&concatenate_logs() if $logflag;
&concatenate_ali() if defined $alifile;
&concatenate_details() if defined $detailsfile;
&concatenate_nbest() if $nbestflag;
safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-';
concatenate_searchgraph() if $searchgraphflag;
&concatenate_searchgraph() if $searchgraphflag;
safesystem("cat searchgraph$$ >> /dev/stdout") if $searchgraphlist eq '-';
concatenate_wordgraph() if $wordgraphflag;
&concatenate_wordgraph() if $wordgraphflag;
safesystem("cat wordgraph$$ >> /dev/stdout") if $wordgraphlist[0] eq '-';
remove_temporary_files();
&concatenate_oov() if $oovflag;
safesystem("cat oov$$ >> /dev/stdout") if $oovlist eq '-';
&remove_temporary_files();
#script creation
sub preparing_script(){
my $currStartTranslationId = 0;
my $possibly_modified_cfgfile = $cfgfile;
my $cache_model_cmd = "";
if ($cache_model) {
$cache_model_cmd = "MOSES_INI=`$RealBin/../ems/support/cache-model.perl $cfgfile $cache_model`\n";
$possibly_modified_cfgfile = "\$MOSES_INI";
}
foreach my $idx (@idxlist){
my $scriptheader="";
$scriptheader.="\#\! /bin/bash\n\n";
@ -594,6 +629,10 @@ sub preparing_script(){
open (OUT, "> ${jobscript}${idx}.bash");
print OUT $scriptheader;
# copy model files into local directory
print OUT $cache_model_cmd;
my $inputmethod = $feed_moses_via_stdin ? "<" : "-input-file";
my $tmpnbestlist="";
@ -623,9 +662,14 @@ sub preparing_script(){
$tmpwordgraphlist="-output-word-graph $tmpdir/$wordgraphfile.$splitpfx$idx $wordgraphlist[1]";
}
my $tmpoovlist="";
if ($oovflag){
$tmpoovlist="-output-unknowns $tmpdir/$oovfile.$splitpfx$idx";
}
my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId";
print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n";
print OUT "$mosescmd $mosesparameters -config $possibly_modified_cfgfile $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpoovlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n";
print OUT "echo exit status \$\?\n\n";
if (defined $alifile){
@ -644,11 +688,14 @@ sub preparing_script(){
print OUT "\\mv -f $tmpdir/${searchgraphfile}.$splitpfx$idx .\n\n";
print OUT "echo exit status \$\?\n\n";
}
if ($wordgraphflag){
print OUT "\\mv -f $tmpdir/${wordgraphfile}.$splitpfx$idx .\n\n";
print OUT "echo exit status \$\?\n\n";
}
if ($oovflag){
print OUT "\\mv -f $tmpdir/${oovfile}.$splitpfx$idx .\n\n";
print OUT "echo exit status \$\?\n\n";
}
print OUT "\\mv -f $tmpdir/${inputfile}.$splitpfx$idx.trans .\n\n";
print OUT "echo exit status \$\?\n\n";
@ -840,6 +887,20 @@ sub concatenate_1best(){
}
}
# Merge the per-split OOV files into one output file.
#
# Reads file-level state:
#   $oovlist            - destination; "-" selects the file "oov<pid>"
#   $oovfile, $splitpfx - with each entry of @idxlist these name the part
#                         files "<oovfile>.<splitpfx><idx>"
# Parts are appended in @idxlist order. Dies if the destination cannot be
# written; an unreadable part is reported on STDERR and skipped.
sub concatenate_oov(){
    my $outoov = ($oovlist eq '-') ? "oov$$" : $oovlist;

    open(my $out, '>', $outoov)
        or die "ERROR: cannot write OOV output $outoov: $!\n";

    foreach my $idx (@idxlist){
        my $part = "${oovfile}.${splitpfx}${idx}";
        my $in;
        unless (open($in, '<', $part)) {
            # keep going as before, but no longer silently
            print STDERR "WARNING: cannot read OOV file $part: $!\n";
            next;
        }
        # Copy line by line. The previous `print OUT "@in"` interpolated the
        # array into a string, which joins the lines with $" (a space by
        # default) and so prefixed every line after the first with a blank.
        while (my $line = <$in>) {
            print {$out} $line;
        }
        close($in);
    }

    # buffered write errors only surface at close
    close($out) or die "ERROR: cannot finish writing $outoov: $!\n";
}
sub concatenate_logs(){
open (OUT, "> ${logfile}");
foreach my $idx (@idxlist){
@ -978,6 +1039,7 @@ sub remove_temporary_files(){
if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); }
if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); }
if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); }
if ($oovfile){ unlink("${oovfile}.${splitpfx}${idx}"); }
unlink("${jobscript}${idx}.bash");
unlink("${jobscript}${idx}.log");
unlink("$qsubname.W.log");
@ -988,6 +1050,7 @@ sub remove_temporary_files(){
if ($nbestflag && $nbestlist[0] eq '-'){ unlink("${nbestfile}$$"); };
if ($searchgraphflag && $searchgraphlist eq '-'){ unlink("${searchgraphfile}$$"); };
if ($wordgraphflag && $wordgraphlist eq '-'){ unlink("${wordgraphfile}$$"); };
if ($oovflag && $oovlist eq '-'){ unlink("oov$$"); };
}
sub safesystem {

View File

@ -14,7 +14,7 @@ use strict;
my $queueparameters="";
# look for the correct pwdcmd
my $pwdcmd = getPwdCmd();
my $pwdcmd = &getPwdCmd();
my $workingdir = `$pwdcmd`; chomp $workingdir;
my $tmpdir="$workingdir/tmp$$";
@ -109,14 +109,14 @@ else
fi
";
if (defined $cmdout){
if (defined($cmdout) && $cmdout ne "/dev/null") {
print OUT "mv -f $tmpdir/cmdout$$ $cmdout || echo failed to preserve the log: $tmpdir/cmdout$$\n\n";
}
else{
print OUT "rm -f $tmpdir/cmdout$$\n\n";
}
if (defined $cmderr){
if (defined($cmderr) && $cmderr ne "/dev/null") {
print OUT "mv -f $tmpdir/cmderr$$ $cmderr || echo failed to preserve the log: $tmpdir/cmderr$$\n\n";
}
else{

View File

@ -81,6 +81,7 @@ my $___LATTICE_SAMPLES = 0;
my $queue_flags = "-hard"; # extra parameters for parallelizer
# the -l ws0ssmt was relevant only to JHU 2006 workshop
my $___JOBS = undef; # if parallel, number of jobs to use (undef or <= 0 -> serial)
my $___CACHE_MODEL = undef; # if models need to be copied to local disk from NFS
my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder
my $continue = 0; # should we try to continue from the last saved step?
my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert)
@ -183,6 +184,7 @@ GetOptions(
"lattice-samples=i" => \$___LATTICE_SAMPLES,
"queue-flags=s" => \$queue_flags,
"jobs=i" => \$___JOBS,
"cache-model=s" => \$___CACHE_MODEL,
"decoder-flags=s" => \$___DECODER_FLAGS,
"continue" => \$continue,
"skip-decoder" => \$skip_decoder,
@ -245,6 +247,7 @@ Options:
--nbest=100 ... how big nbestlist to generate
--lattice-samples ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010)
--jobs=N ... set this to anything to run moses in parallel
--cache-model=STRING ... local directory into which copy model before running decoder
--mosesparallelcmd=STR ... use a different script instead of moses-parallel
--queue-flags=STRING ... anything you with to pass to qsub, eg.
'-l ws06osssmt=true'. The default is: '-hard'
@ -1143,7 +1146,7 @@ if($___RETURN_BEST_DEV) {
}
my $cmd = "$mert_eval_cmd --reference " . join(",", @references) . " $mert_extract_args $candidate";
$cmd .= " -l $__REMOVE_SEGMENTATION" if defined( $__PROMIX_TRAINING);
safesystem("$cmd 2> /dev/null 1> $evalout");
&submit_or_exec($cmd, $evalout, "/dev/null", 1);
open my $fh, '<', $evalout or die "Can't read $evalout : $!";
my $bleu = <$fh>;
chomp $bleu;
@ -1291,6 +1294,7 @@ sub run_decoder {
die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
$decoder_cmd .= " -cache-model $___CACHE_MODEL" if defined($___CACHE_MODEL);
$decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
} else {
my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
@ -1380,7 +1384,12 @@ sub get_featlist_from_moses {
print STDERR "Using cached features list: $featlistfn\n";
} else {
print STDERR "Asking moses for feature names and values from $___CONFIG\n";
my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
my $cmd;
if ($___CACHE_MODEL) {
$cmd = "MOSES_INI=`$SCRIPTS_ROOTDIR/ems/support/cache-model.perl $configfn $___CACHE_MODEL` && ";
$configfn = "\$MOSES_INI";
}
$cmd .= "$___DECODER $___DECODER_FLAGS -config $configfn";
$cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
$cmd .= " -show-weights";
print STDERR "Executing: $cmd\n";