mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
ef028446f3
This is not pleasant to read (and much, much less pleasant to write!) but sort of necessary in an open project. Right now it's quite hard to figure out what is licensed how, which doesn't matter much to most people but can suddenly become very important when people want to know what they're being allowed to do. I kept the notices as short as I could. As far as I could see, everything without a clear license notice is LGPL v2.1 or later.
1023 lines
30 KiB
Perl
Executable File
1023 lines
30 KiB
Perl
Executable File
#!/usr/bin/env perl
|
|
#
|
|
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
|
# Public License version 2.1 or, at your option, any later version.
|
|
|
|
# $Id$
|
|
#######################
|
|
# Revision history
|
|
#
|
|
# 02 Aug 2006 added strict requirement
|
|
# 01 Aug 2006 fix bug about inputfile parameter
|
|
# fix bug about suffix index generation
|
|
# 31 Jul 2006 added parameter for reading queue parameters
|
|
# 29 Jul 2006 added code to handling consfusion networks
|
|
# 28 Jul 2006 added a better policy for removing jobs from the queue in case of killing signal (CTRL-C)
|
|
# added the parameter -qsub-prefix which sets the prefix for the name of submitted jobs
|
|
# 27 Jul 2006 added safesystem() function and other checks to handle with process failure
|
|
# added checks for existence of decoder and configuration file
|
|
# 26 Jul 2006 fix a bug related to the use of absolute path for srcfile and nbestfile
|
|
|
|
use warnings;
|
|
use strict;
|
|
|
|
#######################
|
|
#Customizable parameters
|
|
|
|
#parameters for submiiting processes through Sun GridEngine
|
|
my $queueparameters="";
|
|
|
|
#SGE-make-it-work:
|
|
# my $queueparameters="-l q1w -l mem_free=0.5G -hard";
|
|
# my $queueparameters="-l q1dm -pe pe_mth 2 -hard";
|
|
# etc.
|
|
|
|
# look for the correct pwdcmd
|
|
my $pwdcmd = getPwdCmd();
|
|
|
|
my $workingdir = `$pwdcmd`; chomp $workingdir;
|
|
my $tmpdir="$workingdir/tmp$$";
|
|
my $splitpfx="split$$";
|
|
|
|
$SIG{'INT'} = \&kill_all_and_quit; # catch exception for CTRL-C
|
|
|
|
#######################
|
|
#Default parameters
|
|
my $jobscript="$workingdir/job$$";
|
|
my $qsubout="$workingdir/out.job$$";
|
|
my $qsuberr="$workingdir/err.job$$";
|
|
|
|
my $mosesparameters="";
|
|
my $feed_moses_via_stdin = 0;
|
|
# a workaround, for a reason, the default "-input-file X" blocks
|
|
# my moses, while "< X" works fine.
|
|
my $cfgfile=""; #configuration file
|
|
|
|
my $version=undef;
|
|
my $help=0;
|
|
my $dbg=0;
|
|
my $jobs=4;
|
|
my $mosescmd="$ENV{MOSESBIN}/moses"; #decoder in use
|
|
my $inputlist=undef;
|
|
my $inputfile=undef;
|
|
my $inputtype=0;
|
|
my @nbestlist=();
|
|
my $nbestlist=undef;
|
|
my $nbestfile=undef;
|
|
my $oldnbestfile=undef;
|
|
my $oldnbest=undef;
|
|
my $nbestflag=0;
|
|
my @wordgraphlist=();
|
|
my $wordgraphlist=undef;
|
|
my $wordgraphfile=undef;
|
|
my $wordgraphflag=0;
|
|
my $robust=5; # resubmit crashed jobs robust-times
|
|
my $alifile=undef;
|
|
my $detailsfile=undef;
|
|
my $logfile="";
|
|
my $logflag="";
|
|
my $searchgraphlist="";
|
|
my $searchgraphfile="";
|
|
my $searchgraphflag=0;
|
|
my $qsubname="MOSES";
|
|
my $old_sge = 0; # assume old Sun Grid Engine (<6.0) where qsub does not
|
|
# implement -sync and -b
|
|
|
|
#SGE-make-it-work:
|
|
# my $old_sge = "1";
|
|
|
|
#######################
|
|
# Command line options processing
|
|
sub init(){
|
|
use Getopt::Long qw(:config pass_through no_ignore_case permute);
|
|
GetOptions('version'=>\$version,
|
|
'help'=>\$help,
|
|
'debug'=>\$dbg,
|
|
'jobs=i'=>\$jobs,
|
|
'decoder=s'=> \$mosescmd,
|
|
'robust=i' => \$robust,
|
|
'decoder-parameters=s'=> \$mosesparameters,
|
|
'feed-decoder-via-stdin'=> \$feed_moses_via_stdin,
|
|
'logfile=s'=> \$logfile,
|
|
'i|inputfile|input-file=s'=> \$inputlist,
|
|
'n-best-list=s'=> \$nbestlist,
|
|
'n-best-file=s'=> \$oldnbestfile,
|
|
'n-best-size=i'=> \$oldnbest,
|
|
'output-search-graph|osg=s'=> \$searchgraphlist,
|
|
'output-word-graph|owg=s'=> \$wordgraphlist,
|
|
'alignment-output-file=s'=> \$alifile,
|
|
'translation-details|T=s'=> \$detailsfile,
|
|
'qsub-prefix=s'=> \$qsubname,
|
|
'queue-parameters=s'=> \$queueparameters,
|
|
'inputtype=i'=> \$inputtype,
|
|
'config|f=s'=>\$cfgfile,
|
|
'old-sge' => \$old_sge,
|
|
) or exit(1);
|
|
|
|
getNbestParameters();
|
|
|
|
getSearchGraphParameters();
|
|
|
|
getWordGraphParameters();
|
|
|
|
getLogParameters();
|
|
|
|
#print_parameters();
|
|
#print STDERR "nbestflag:$nbestflag\n";
|
|
#print STDERR "searchgraphflag:$searchgraphflag\n";
|
|
print STDERR "wordgraphflag:$wordgraphflag\n";
|
|
#print STDERR "inputlist:$inputlist\n";
|
|
|
|
chomp($inputfile=`basename $inputlist`) if defined($inputlist);
|
|
|
|
$mosesparameters.="@ARGV -config $cfgfile -inputtype $inputtype";
|
|
}
|
|
|
|
|
|
#######################
|
|
##print version
|
|
sub version(){
|
|
# print STDERR "version 1.0 (15-07-2006)\n";
|
|
# print STDERR "version 1.1 (17-07-2006)\n";
|
|
# print STDERR "version 1.2 (18-07-2006)\n";
|
|
# print STDERR "version 1.3 (21-07-2006)\n";
|
|
# print STDERR "version 1.4 (26-07-2006)\n";
|
|
# print STDERR "version 1.5 (27-07-2006)\n";
|
|
# print STDERR "version 1.6 (28-07-2006)\n";
|
|
# print STDERR "version 1.7 (29-07-2006)\n";
|
|
# print STDERR "version 1.8 (31-07-2006)\n";
|
|
# print STDERR "version 1.9 (01-08-2006)\n";
|
|
# print STDERR "version 1.10 (02-08-2006)\n";
|
|
# print STDERR "version 1.11 (10-10-2006)\n";
|
|
# print STDERR "version 1.12 (27-12-2006)\n";
|
|
print STDERR "version 1.13 (29-12-2006)\n";
|
|
exit(1);
|
|
}
|
|
|
|
#usage
|
|
sub usage(){
|
|
print STDERR "moses-parallel.pl [parallel-options] [moses-options]\n";
|
|
print STDERR "Options marked (*) are required.\n";
|
|
print STDERR "Parallel options:\n";
|
|
print STDERR "* -decoder <file> Moses decoder to use\n";
|
|
print STDERR "* -i|inputfile|input-file <file> the input text to translate\n";
|
|
print STDERR "* -jobs <N> number of required jobs\n";
|
|
print STDERR " -logfile <file> file where storing log files of all jobs\n";
|
|
print STDERR " -qsub-prefix <string> name for sumbitte jobs\n";
|
|
print STDERR " -queue-parameters <string> specific requirements for queue\n";
|
|
print STDERR " -old-sge Assume Sun Grid Engine < 6.0\n";
|
|
print STDERR " -debug debug\n";
|
|
print STDERR " -version print version of the script\n";
|
|
print STDERR " -help this help\n";
|
|
print STDERR "Moses options:\n";
|
|
print STDERR " -inputtype <0|1|2> 0 for text, 1 for confusion networks, 2 for lattices\n";
|
|
print STDERR " -output-search-graph (osg) <file>: Output connected hypotheses of search into specified filename\n";
|
|
print STDERR " -output-word-graph (osg) '<file> <0|1>': Output stack info as word graph. Takes filename, 0=only hypos in stack, 1=stack + nbest hypos\n";
|
|
print STDERR " IMPORTANT NOTE: use single quote to group parameters of -output-word-graph\n";
|
|
print STDERR " This is different from standard moses\n";
|
|
print STDERR " -n-best-list '<file> <N> [distinct]' where\n";
|
|
print STDERR " <file>: file where storing nbest lists\n";
|
|
print STDERR " <N>: size of nbest lists\n";
|
|
print STDERR " distinct: (optional) to activate generation of distinct nbest alternatives\n";
|
|
print STDERR " IMPORTANT NOTE: use single quote to group parameters of -n-best-list\n";
|
|
print STDERR " This is different from standard moses\n";
|
|
print STDERR " IMPORTANT NOTE: The following two parameters are now OBSOLETE, and they are no more supported\n";
|
|
print STDERR " -n-best-file <file> file where storing nbet lists\n";
|
|
print STDERR " -n-best-size <N> size of nbest lists\n";
|
|
print STDERR " NOTE: -n-best-file-n-best-size are passed to the decoder as \"-n-best-list <file> <N>\"\n";
|
|
print STDERR "* -config (f) <cfgfile> configuration file\n";
|
|
print STDERR " -decoder-parameters <string> specific parameters for the decoder\n";
|
|
print STDERR "All other options are passed to Moses\n";
|
|
print STDERR " (This way to pass parameters is maintained for back compatibility\n";
|
|
print STDERR " but preferably use -decoder-parameters)\n";
|
|
exit(1);
|
|
}
|
|
|
|
#printparameters
|
|
sub print_parameters(){
|
|
print STDERR "Inputfile: $inputlist\n";
|
|
print STDERR "Configuration file: $cfgfile\n";
|
|
print STDERR "Decoder in use: $mosescmd\n";
|
|
print STDERR "Number of jobs:$jobs\n";
|
|
print STDERR "Nbest list: $nbestlist\n" if ($nbestflag);
|
|
print STDERR "Output Search Graph: $searchgraphlist\n" if ($searchgraphflag);
|
|
print STDERR "Output Word Graph: $wordgraphlist\n" if ($wordgraphflag);
|
|
print STDERR "LogFile:$logfile\n" if ($logflag);
|
|
print STDERR "Qsub name: $qsubname\n";
|
|
print STDERR "Queue parameters: $queueparameters\n";
|
|
print STDERR "Inputtype: text\n" if $inputtype == 0;
|
|
print STDERR "Inputtype: confusion network\n" if $inputtype == 1;
|
|
print STDERR "Inputtype: lattices\n" if $inputtype == 2;
|
|
|
|
print STDERR "parameters directly passed to Moses: $mosesparameters\n";
|
|
}
|
|
|
|
#get parameters for log file
|
|
sub getLogParameters(){
|
|
if ($logfile){ $logflag=1; }
|
|
}
|
|
|
|
#get parameters for nbest computation (possibly from configuration file)
|
|
sub getNbestParameters(){
|
|
if (!$nbestlist){
|
|
open (CFG, "$cfgfile");
|
|
while (chomp($_=<CFG>)){
|
|
if (/^\[n-best-list\]/){
|
|
my $tmp;
|
|
while (chomp($tmp=<CFG>)){
|
|
last if $tmp eq "" || $tmp=~/^\[/;
|
|
$nbestlist .= "$tmp ";
|
|
}
|
|
last;
|
|
}
|
|
}
|
|
close(CFG);
|
|
}
|
|
|
|
if ($nbestlist){
|
|
if ($oldnbestfile){
|
|
print STDERR "There is a conflict between NEW parameter -n-best-list and OBSOLETE parameter -n-best-file\n";
|
|
print STDERR "Please use only -nbest-list '<file> <N> [distinct]\n";
|
|
exit;
|
|
}
|
|
}
|
|
else{
|
|
if ($oldnbestfile){
|
|
print STDERR "You are using the OBSOLETE parameter -n-best-file\n";
|
|
print STDERR "Next time please use only -n-best-list '<file> <N> [distinct]\n";
|
|
$nbestlist="$oldnbestfile";
|
|
if ($oldnbest){ $nbestlist.=" $oldnbest"; }
|
|
else { $nbestlist.=" 1"; }
|
|
}
|
|
}
|
|
|
|
if ($nbestlist){
|
|
my @tmp=split(/[ \t]+/,$nbestlist);
|
|
@nbestlist = @tmp;
|
|
|
|
if ($nbestlist[0] eq '-'){ $nbestfile="nbest"; }
|
|
else{ chomp($nbestfile=`basename $nbestlist[0]`); }
|
|
$nbestflag=1;
|
|
}
|
|
}
|
|
|
|
#get parameters for search graph computation (possibly from configuration file)
|
|
sub getSearchGraphParameters(){
|
|
if (!$searchgraphlist){
|
|
open (CFG, "$cfgfile");
|
|
while (chomp($_=<CFG>)){
|
|
if (/^\[output-search-graph\]/ || /^\[osg\]/){
|
|
my $tmp;
|
|
while (chomp($tmp=<CFG>)){
|
|
last if $tmp eq "" || $tmp=~/^\[/;
|
|
$searchgraphlist = "$tmp";
|
|
}
|
|
last;
|
|
}
|
|
}
|
|
close(CFG);
|
|
}
|
|
if ($searchgraphlist){
|
|
if ($searchgraphlist eq '-'){ $searchgraphfile="searchgraph"; }
|
|
else{ chomp($searchgraphfile=`basename $searchgraphlist`); }
|
|
$searchgraphflag=1;
|
|
}
|
|
}
|
|
|
|
#get parameters for word graph computation (possibly from configuration file)
|
|
sub getWordGraphParameters(){
|
|
if (!$wordgraphlist){
|
|
open (CFG, "$cfgfile");
|
|
while (chomp($_=<CFG>)){
|
|
if (/^\[output-word-graph\]/ || /^\[owg\]/){
|
|
my $tmp;
|
|
while (chomp($tmp=<CFG>)){
|
|
last if $tmp eq "" || $tmp=~/^\[/;
|
|
$wordgraphlist .= "$tmp ";
|
|
}
|
|
last;
|
|
}
|
|
}
|
|
close(CFG);
|
|
}
|
|
if ($wordgraphlist){
|
|
my @tmp=split(/[ \t]+/,$wordgraphlist);
|
|
@wordgraphlist = @tmp;
|
|
|
|
if ($wordgraphlist[0] eq '-'){ $wordgraphfile="wordgraph"; }
|
|
else{ chomp($wordgraphfile=`basename $wordgraphlist[0]`); }
|
|
$wordgraphflag=1;
|
|
}
|
|
}
|
|
|
|
#######################
|
|
#Script starts here
|
|
|
|
init();
|
|
|
|
version() if $version;
|
|
usage() if $help;
|
|
|
|
|
|
if (!defined $inputlist || !defined $mosescmd || ! defined $cfgfile) {
|
|
print STDERR "Please specify -input-file, -decoder and -config\n";
|
|
usage();
|
|
}
|
|
|
|
#checking if inputfile exists
|
|
if (! -e ${inputlist} ){
|
|
print STDERR "Inputfile ($inputlist) does not exists\n";
|
|
usage();
|
|
}
|
|
|
|
#checking if decoder exists
|
|
if (! -e $mosescmd) {
|
|
print STDERR "Decoder ($mosescmd) does not exists\n";
|
|
usage();
|
|
}
|
|
|
|
#checking if configfile exists
|
|
if (! -e $cfgfile) {
|
|
print STDERR "Configuration file ($cfgfile) does not exists\n";
|
|
usage();
|
|
}
|
|
|
|
|
|
print_parameters(); # so that people know
|
|
exit(1) if $dbg; # debug mode: just print and do not run
|
|
|
|
|
|
#splitting test file in several parts
|
|
#$decimal="-d"; #split does not accept this options (on MAC OS)
|
|
my $decimal="";
|
|
|
|
my $cmd;
|
|
my $sentenceN;
|
|
my $splitN;
|
|
|
|
my @idxlist=();
|
|
|
|
if ($inputtype==0){ #text input
|
|
#getting the number of input sentences (one sentence per line)
|
|
chomp($sentenceN=`wc -l ${inputlist} | awk '{print \$1}' `);
|
|
|
|
#Reducing the number of jobs if less sentences to translate
|
|
if ($jobs>$sentenceN){ $jobs=$sentenceN; }
|
|
|
|
#Computing the number of sentences for each files
|
|
if ($sentenceN % $jobs == 0){ $splitN=int($sentenceN / $jobs); }
|
|
else{ $splitN=int($sentenceN /$jobs) + 1; }
|
|
|
|
if ($dbg){
|
|
print STDERR "There are $sentenceN sentences to translate\n";
|
|
print STDERR "There are at most $splitN sentences per job\n";
|
|
}
|
|
|
|
$cmd="split $decimal -a 2 -l $splitN $inputlist ${inputfile}.$splitpfx-";
|
|
safesystem("$cmd") or die;
|
|
}
|
|
elsif ($inputtype==1){ #confusion network input
|
|
my $tmpfile="/tmp/cnsplit$$";
|
|
$cmd="cat $inputlist | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' > $tmpfile";
|
|
safesystem("$cmd") or die;
|
|
|
|
#getting the number of input CNs
|
|
chomp($sentenceN=`wc -l $tmpfile | awk '{print \$1}' `);
|
|
|
|
#Reducing the number of jobs if less CNs to translate
|
|
if ($jobs>$sentenceN){ $jobs=$sentenceN; }
|
|
|
|
#Computing the number of CNs for each files
|
|
if ($sentenceN % $jobs == 0){ $splitN=int($sentenceN / $jobs); }
|
|
else{ $splitN=int($sentenceN /$jobs) + 1; }
|
|
|
|
if ($dbg){
|
|
print STDERR "There are $sentenceN confusion networks to translate\n";
|
|
print STDERR "There are at most $splitN sentences per job\n";
|
|
}
|
|
|
|
$cmd="split $decimal -a 2 -l $splitN $tmpfile $tmpfile-";
|
|
safesystem("$cmd") or die;
|
|
|
|
my @idxlist=();
|
|
chomp(@idxlist=`ls $tmpfile-*`);
|
|
grep(s/.+(\-\S+)$/$1/e,@idxlist);
|
|
|
|
foreach my $idx (@idxlist){
|
|
$cmd="perl -pe 's/ _CNendline_ /\\n/g;s/ _CNendline_/\\n/g;'";
|
|
safesystem("cat $tmpfile$idx | $cmd > ${inputfile}.$splitpfx$idx ; \\rm -f $tmpfile$idx;");
|
|
}
|
|
}
|
|
elsif ($inputtype==2){ #confusion network input
|
|
#getting the number of input lattices (one lattice per line)
|
|
chomp($sentenceN=`wc -l ${inputlist} | awk '{print \$1}' `);
|
|
|
|
#Reducing the number of jobs if less lattices to translate
|
|
if ($jobs>$sentenceN){ $jobs=$sentenceN; }
|
|
|
|
#Computing the number of sentences for each files
|
|
if ($sentenceN % $jobs == 0){ $splitN=int($sentenceN / $jobs); }
|
|
else{ $splitN=int($sentenceN /$jobs) + 1; }
|
|
|
|
if ($dbg){
|
|
print STDERR "There are $sentenceN lattices to translate\n";
|
|
print STDERR "There are at most $splitN lattices per job\n";
|
|
}
|
|
|
|
$cmd="split $decimal -a 2 -l $splitN $inputlist ${inputfile}.$splitpfx-";
|
|
safesystem("$cmd") or die;
|
|
}
|
|
else{ #unknown input type
|
|
die "INPUTTYPE:$inputtype is unknown!\n";
|
|
}
|
|
|
|
chomp(@idxlist=`ls ${inputfile}.$splitpfx-*`);
|
|
grep(s/.+(\-\S+)$/$1/e,@idxlist);
|
|
|
|
safesystem("mkdir -p $tmpdir") or die;
|
|
|
|
preparing_script();
|
|
|
|
#launching process through the queue
|
|
my @sgepids =();
|
|
|
|
my @idx_todo = ();
|
|
foreach (@idxlist) { push @idx_todo,$_; }
|
|
|
|
# loop up to --robust times
|
|
while ($robust && scalar @idx_todo) {
|
|
$robust--;
|
|
|
|
my $failure=0;
|
|
foreach my $idx (@idx_todo){
|
|
|
|
my $batch_and_join = undef;
|
|
if ($old_sge) {
|
|
# old SGE understands -b no as the default and does not understand 'yes'
|
|
$batch_and_join = "-j y";
|
|
} else {
|
|
$batch_and_join = "-b no -j yes";
|
|
}
|
|
$cmd="qsub $queueparameters $batch_and_join -o $qsubout$idx -e $qsuberr$idx -N $qsubname$idx ${jobscript}${idx}.bash > ${jobscript}${idx}.log 2>&1";
|
|
print STDERR "$cmd\n" if $dbg;
|
|
|
|
safesystem($cmd) or die;
|
|
|
|
my ($res,$id);
|
|
|
|
open (IN,"${jobscript}${idx}.log")
|
|
or die "Can't read id of job ${jobscript}${idx}.log";
|
|
chomp($res=<IN>);
|
|
my @arrayStr = split(/\s+/,$res);
|
|
$id=$arrayStr[2];
|
|
die "Failed to guess job id from $jobscript$idx.log, got: $res"
|
|
if $id !~ /^[0-9]+$/;
|
|
close(IN);
|
|
|
|
push @sgepids, $id;
|
|
}
|
|
|
|
#waiting until all jobs have finished
|
|
my $hj = "-hold_jid " . join(" -hold_jid ", @sgepids);
|
|
|
|
if ($old_sge) {
|
|
# we need to implement our own waiting script
|
|
my $syncscript = "${jobscript}.sync_workaround_script.sh";
|
|
safesystem("echo 'date' > $syncscript") or kill_all_and_quit();
|
|
|
|
my $pwd = `$pwdcmd`; chomp $pwd;
|
|
|
|
my $checkpointfile = "${jobscript}.sync_workaround_checkpoint";
|
|
|
|
# delete previous checkpoint, if left from previous runs
|
|
safesystem("\\rm -f $checkpointfile") or kill_all_and_quit();
|
|
|
|
# start the 'hold' job, i.e. the job that will wait
|
|
$cmd="qsub -cwd $queueparameters $hj -o $checkpointfile -e /dev/null -N $qsubname.W $syncscript 2> $qsubname.W.log";
|
|
safesystem($cmd) or kill_all_and_quit();
|
|
|
|
# and wait for checkpoint file to appear
|
|
my $nr=0;
|
|
while (!-e $checkpointfile) {
|
|
sleep(10);
|
|
$nr++;
|
|
print STDERR "w" if $nr % 3 == 0;
|
|
}
|
|
print STDERR "End of waiting.\n";
|
|
safesystem("\\rm -f $checkpointfile $syncscript") or kill_all_and_quit();
|
|
|
|
my $failure = 1;
|
|
my $nr = 0;
|
|
while ($nr < 60 && $failure) {
|
|
$nr ++;
|
|
$failure=&check_exit_status();
|
|
if (!$failure) {
|
|
$failure = check_translation_old_sge();
|
|
}
|
|
last if !$failure;
|
|
print STDERR "Extra wait ($nr) for possibly unfinished processes.\n";
|
|
sleep 10;
|
|
}
|
|
} else {
|
|
# use the -sync option for qsub
|
|
$cmd="qsub $queueparameters -sync y $hj -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls > $qsubname.W.log";
|
|
safesystem($cmd) or kill_all_and_quit();
|
|
|
|
$failure=&check_exit_status();
|
|
}
|
|
|
|
kill_all_and_quit() if $failure && !$robust;
|
|
|
|
# check if some translations failed
|
|
my @idx_still_todo = check_translation();
|
|
if ($robust) {
|
|
# if robust, redo crashed jobs
|
|
if ((scalar @idx_still_todo) == (scalar @idxlist)) {
|
|
# ... but not if all crashed
|
|
print STDERR "everything crashed, not trying to resubmit jobs\n";
|
|
$robust = 0;
|
|
kill_all_and_quit();
|
|
}
|
|
@idx_todo = @idx_still_todo;
|
|
}
|
|
else {
|
|
if (scalar (@idx_still_todo)) {
|
|
print STDERR "some jobs crashed: ".join(" ",@idx_still_todo)."\n";
|
|
kill_all_and_quit();
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
#concatenating translations and removing temporary files
|
|
concatenate_1best();
|
|
concatenate_logs() if $logflag;
|
|
concatenate_ali() if defined $alifile;
|
|
concatenate_details() if defined $detailsfile;
|
|
concatenate_nbest() if $nbestflag;
|
|
safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-';
|
|
|
|
concatenate_searchgraph() if $searchgraphflag;
|
|
safesystem("cat searchgraph$$ >> /dev/stdout") if $searchgraphlist eq '-';
|
|
|
|
concatenate_wordgraph() if $wordgraphflag;
|
|
safesystem("cat wordgraph$$ >> /dev/stdout") if $wordgraphlist[0] eq '-';
|
|
|
|
remove_temporary_files();
|
|
|
|
|
|
#script creation
|
|
sub preparing_script(){
|
|
my $currStartTranslationId = 0;
|
|
|
|
foreach my $idx (@idxlist){
|
|
my $scriptheader="";
|
|
$scriptheader.="\#\! /bin/bash\n\n";
|
|
# !!! this is useless. qsub ignores the first line of the script.
|
|
# Pass '-S /bin/bash' to qsub instead.
|
|
|
|
#SGE-make-it-work:
|
|
# $scriptheader.="## Sun's Grid Engine parameters\n";
|
|
# $scriptheader.="# ... (see 'man qsub' for complete documentation)\n";
|
|
# $scriptheader.="# ... job name\n";
|
|
# $scriptheader.="#\$ -N mert-$qsubname\n";
|
|
# $scriptheader.="# ... make sure to use proper shell\n";
|
|
# $scriptheader.="#\$ -S /bin/bash\n";
|
|
# $scriptheader.="# ... e-mail address to send notifications\n";
|
|
# $scriptheader.="#\$ -M ww.xx\@yy.zz\n\n";
|
|
|
|
$scriptheader.="uname -a\n\n";
|
|
$scriptheader.="ulimit -c 0\n\n"; # avoid coredumps
|
|
$scriptheader.="cd $workingdir\n\n";
|
|
|
|
|
|
|
|
open (OUT, "> ${jobscript}${idx}.bash");
|
|
print OUT $scriptheader;
|
|
my $inputmethod = $feed_moses_via_stdin ? "<" : "-input-file";
|
|
|
|
my $tmpnbestlist="";
|
|
if ($nbestflag){
|
|
$tmpnbestlist="$tmpdir/$nbestfile.$splitpfx$idx $nbestlist[1]";
|
|
$tmpnbestlist = "$tmpnbestlist $nbestlist[2]" if scalar(@nbestlist)==3;
|
|
$tmpnbestlist = "-n-best-list $tmpnbestlist";
|
|
}
|
|
|
|
my $tmpalioutfile = "";
|
|
if (defined $alifile){
|
|
$tmpalioutfile="-alignment-output-file $tmpdir/$alifile.$splitpfx$idx";
|
|
}
|
|
|
|
my $tmpdetailsoutfile = "";
|
|
if (defined $detailsfile){
|
|
$tmpdetailsoutfile="-translation-details $tmpdir/$detailsfile.$splitpfx$idx";
|
|
}
|
|
|
|
my $tmpsearchgraphlist="";
|
|
if ($searchgraphflag){
|
|
$tmpsearchgraphlist="-output-search-graph $tmpdir/$searchgraphfile.$splitpfx$idx";
|
|
}
|
|
|
|
my $tmpwordgraphlist="";
|
|
if ($wordgraphflag){
|
|
$tmpwordgraphlist="-output-word-graph $tmpdir/$wordgraphfile.$splitpfx$idx $wordgraphlist[1]";
|
|
}
|
|
|
|
my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId";
|
|
|
|
print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n";
|
|
print OUT "echo exit status \$\?\n\n";
|
|
|
|
if (defined $alifile){
|
|
print OUT "\\mv -f $tmpdir/${alifile}.$splitpfx$idx .\n\n";
|
|
print OUT "echo exit status \$\?\n\n";
|
|
}
|
|
if (defined $detailsfile){
|
|
print OUT "\\mv -f $tmpdir/${detailsfile}.$splitpfx$idx .\n\n";
|
|
print OUT "echo exit status \$\?\n\n";
|
|
}
|
|
if ($nbestflag){
|
|
print OUT "\\mv -f $tmpdir/${nbestfile}.$splitpfx$idx .\n\n";
|
|
print OUT "echo exit status \$\?\n\n";
|
|
}
|
|
if ($searchgraphflag){
|
|
print OUT "\\mv -f $tmpdir/${searchgraphfile}.$splitpfx$idx .\n\n";
|
|
print OUT "echo exit status \$\?\n\n";
|
|
}
|
|
|
|
if ($wordgraphflag){
|
|
print OUT "\\mv -f $tmpdir/${wordgraphfile}.$splitpfx$idx .\n\n";
|
|
print OUT "echo exit status \$\?\n\n";
|
|
}
|
|
|
|
print OUT "\\mv -f $tmpdir/${inputfile}.$splitpfx$idx.trans .\n\n";
|
|
print OUT "echo exit status \$\?\n\n";
|
|
close(OUT);
|
|
|
|
#setting permissions of each script
|
|
chmod(oct(755),"${jobscript}${idx}.bash");
|
|
|
|
$currStartTranslationId += $splitN;
|
|
}
|
|
}
|
|
|
|
sub concatenate_wordgraph(){
|
|
my $oldcode="";
|
|
my $newcode=-1;
|
|
my %inplength = ();
|
|
my $offset = 0;
|
|
|
|
my $outwordgraph=$wordgraphlist[0];
|
|
if ($wordgraphlist[0] eq '-'){ $outwordgraph="wordgraph$$"; }
|
|
|
|
open (OUT, "> $outwordgraph");
|
|
foreach my $idx (@idxlist){
|
|
|
|
#computing the length of each input file
|
|
my @in=();
|
|
open (IN, "${inputfile}.${splitpfx}${idx}.trans");
|
|
@in=<IN>;
|
|
close(IN);
|
|
$inplength{$idx} = scalar(@in);
|
|
|
|
open (IN, "${wordgraphfile}.${splitpfx}${idx}");
|
|
while (<IN>){
|
|
|
|
my $code="";
|
|
if (/^UTTERANCE=/){
|
|
($code)=($_=~/^UTTERANCE=(\d+)/);
|
|
|
|
print STDERR "code:$code offset:$offset\n";
|
|
$code += $offset;
|
|
if ($code ne $oldcode){
|
|
|
|
# if there is a jump between two consecutive codes
|
|
# it means that an input sentence is not translated
|
|
# fill this hole with a "fictitious" list of wordgraphs
|
|
# comprising just one "_EMPTYSEARCHGRAPH_
|
|
while ($code - $oldcode > 1){
|
|
$oldcode++;
|
|
print OUT "UTTERANCE=$oldcode\n";
|
|
print STDERR " to OUT -> code:$oldcode\n";
|
|
print OUT "_EMPTYWORDGRAPH_\n";
|
|
}
|
|
}
|
|
|
|
$oldcode=$code;
|
|
print OUT "UTTERANCE=$oldcode\n";
|
|
next;
|
|
}
|
|
print OUT "$_";
|
|
}
|
|
close(IN);
|
|
$offset += $inplength{$idx};
|
|
|
|
while ($offset - $oldcode > 1){
|
|
$oldcode++;
|
|
print OUT "UTTERANCE=$oldcode\n";
|
|
print OUT "_EMPTYWORDGRAPH_\n";
|
|
}
|
|
}
|
|
close(OUT);
|
|
}
|
|
|
|
|
|
sub concatenate_searchgraph(){
|
|
my $oldcode="";
|
|
my $newcode=-1;
|
|
my %inplength = ();
|
|
my $offset = 0;
|
|
|
|
my $outsearchgraph=$searchgraphlist;
|
|
if ($searchgraphlist eq '-'){ $outsearchgraph="searchgraph$$"; }
|
|
|
|
open (OUT, "> $outsearchgraph");
|
|
foreach my $idx (@idxlist){
|
|
|
|
#computing the length of each input file
|
|
my @in=();
|
|
open (IN, "${inputfile}.${splitpfx}${idx}.trans");
|
|
@in=<IN>;
|
|
close(IN);
|
|
$inplength{$idx} = scalar(@in);
|
|
|
|
open (IN, "${searchgraphfile}.${splitpfx}${idx}");
|
|
while (<IN>){
|
|
my ($code,@extra)=split(/[ \t]+/,$_);
|
|
$code += $offset;
|
|
if ($code ne $oldcode){
|
|
|
|
# if there is a jump between two consecutive codes
|
|
# it means that an input sentence is not translated
|
|
# fill this hole with a "fictitious" list of searchgraphs
|
|
# comprising just one "_EMPTYSEARCHGRAPH_
|
|
while ($code - $oldcode > 1){
|
|
$oldcode++;
|
|
print OUT "$oldcode _EMPTYSEARCHGRAPH_\n";
|
|
}
|
|
}
|
|
$oldcode=$code;
|
|
print OUT join(" ",($oldcode,@extra));
|
|
}
|
|
close(IN);
|
|
$offset += $inplength{$idx};
|
|
|
|
while ($offset - $oldcode > 1){
|
|
$oldcode++;
|
|
print OUT "$oldcode _EMPTYSEARCHGRAPH_\n";
|
|
}
|
|
}
|
|
close(OUT);
|
|
}
|
|
|
|
sub concatenate_nbest(){
|
|
my $oldcode="";
|
|
my $newcode=-1;
|
|
my %inplength = ();
|
|
my $offset = 0;
|
|
|
|
# get the list of feature and set a fictitious string with zero scores
|
|
open (IN, "${nbestfile}.${splitpfx}$idxlist[0]");
|
|
my $str = <IN>;
|
|
chomp($str);
|
|
close(IN);
|
|
my ($code,$trans,$featurescores,$globalscore)=split(/\|\|\|/,$str);
|
|
|
|
my $emptytrans = " ";
|
|
my $emptyglobalscore = " 0.0";
|
|
my $emptyfeaturescores = $featurescores;
|
|
$emptyfeaturescores =~ s/[-0-9\.]+/0/g;
|
|
|
|
my $outnbest=$nbestlist[0];
|
|
if ($nbestlist[0] eq '-'){ $outnbest="nbest$$"; }
|
|
|
|
open (OUT, "> $outnbest");
|
|
foreach my $idx (@idxlist){
|
|
|
|
#computing the length of each input file
|
|
my @in=();
|
|
open (IN, "${inputfile}.${splitpfx}${idx}.trans");
|
|
@in=<IN>;
|
|
close(IN);
|
|
$inplength{$idx} = scalar(@in);
|
|
|
|
open (IN, "${nbestfile}.${splitpfx}${idx}");
|
|
while (<IN>){
|
|
my ($code,@extra)=split(/\|\|\|/,$_);
|
|
$code += $offset;
|
|
if ($code ne $oldcode){
|
|
|
|
# if there is a jump between two consecutive codes
|
|
# it means that an input sentence is not translated
|
|
# fill this hole with a "fictitious" list of translation
|
|
# comprising just one "emtpy translation" with zero scores
|
|
while ($code - $oldcode > 1){
|
|
$oldcode++;
|
|
print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
|
|
}
|
|
}
|
|
$oldcode=$code;
|
|
print OUT join("\|\|\|",($oldcode,@extra));
|
|
}
|
|
close(IN);
|
|
$offset += $inplength{$idx};
|
|
|
|
while ($offset - $oldcode > 1){
|
|
$oldcode++;
|
|
print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
|
|
}
|
|
}
|
|
close(OUT);
|
|
}
|
|
|
|
sub concatenate_1best(){
|
|
foreach my $idx (@idxlist){
|
|
my @in=();
|
|
open (IN, "${inputfile}.${splitpfx}${idx}.trans");
|
|
@in=<IN>;
|
|
print STDOUT "@in";
|
|
close(IN);
|
|
}
|
|
}
|
|
|
|
sub concatenate_logs(){
|
|
open (OUT, "> ${logfile}");
|
|
foreach my $idx (@idxlist){
|
|
my @in=();
|
|
open (IN, "$qsubout$idx");
|
|
@in=<IN>;
|
|
print OUT "@in";
|
|
close(IN);
|
|
}
|
|
close(OUT);
|
|
}
|
|
|
|
sub concatenate_ali(){
|
|
open (OUT, "> ${alifile}");
|
|
foreach my $idx (@idxlist){
|
|
my @in=();
|
|
open (IN, "$alifile.$splitpfx$idx");
|
|
@in=<IN>;
|
|
print OUT "@in";
|
|
close(IN);
|
|
}
|
|
close(OUT);
|
|
}
|
|
|
|
sub concatenate_details(){
|
|
open (OUT, "> ${detailsfile}");
|
|
foreach my $idx (@idxlist){
|
|
my @in=();
|
|
open (IN, "$detailsfile.$splitpfx$idx");
|
|
@in=<IN>;
|
|
print OUT "@in";
|
|
close(IN);
|
|
}
|
|
close(OUT);
|
|
}
|
|
|
|
|
|
sub check_exit_status(){
|
|
print STDERR "check_exit_status\n";
|
|
my $failure=0;
|
|
foreach my $idx (@idxlist){
|
|
print STDERR "check_exit_status of job $idx\n";
|
|
open(IN,"$qsubout$idx");
|
|
while (<IN>){
|
|
$failure=1 if (/exit status 1/);
|
|
}
|
|
close(IN);
|
|
}
|
|
return $failure;
|
|
}
|
|
|
|
sub kill_all_and_quit(){
|
|
print STDERR "Got interrupt or something failed.\n";
|
|
print STDERR "kill_all_and_quit\n";
|
|
foreach my $id (@sgepids){
|
|
print STDERR "qdel $id\n";
|
|
safesystem("qdel $id");
|
|
}
|
|
|
|
print STDERR "Translation was not performed correctly\n";
|
|
print STDERR "or some of the submitted jobs died.\n";
|
|
print STDERR "qdel function was called for all submitted jobs\n";
|
|
|
|
exit(1);
|
|
}
|
|
|
|
|
|
sub check_translation(){
|
|
#checking if all sentences were translated
|
|
my $inputN;
|
|
my $outputN;
|
|
my @failed = ();
|
|
foreach my $idx (@idx_todo){
|
|
if ($inputtype==0){#text input
|
|
chomp($inputN=`wc -l ${inputfile}.$splitpfx$idx | cut -d' ' -f1`);
|
|
}
|
|
elsif ($inputtype==1){#confusion network input
|
|
chomp($inputN=`cat ${inputfile}.$splitpfx$idx | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' | wc -l | cut -d' ' -f1 `);
|
|
}
|
|
elsif ($inputtype==2){#lattice input
|
|
chomp($inputN=`wc -l ${inputfile}.$splitpfx$idx | cut -d' ' -f1`);
|
|
}
|
|
else{#unknown input
|
|
die "INPUTTYPE:$inputtype is unknown!\n";
|
|
}
|
|
chomp($outputN=`wc -l ${inputfile}.$splitpfx$idx.trans | cut -d' ' -f1`);
|
|
|
|
if ($inputN != $outputN){
|
|
print STDERR "Split ($idx) were not entirely translated\n";
|
|
print STDERR "outputN=$outputN inputN=$inputN\n";
|
|
print STDERR "outputfile=${inputfile}.$splitpfx$idx.trans inputfile=${inputfile}.$splitpfx$idx\n";
|
|
push @failed,$idx;
|
|
}
|
|
}
|
|
return @failed;
|
|
}
|
|
|
|
sub check_translation_old_sge(){
|
|
#checking if all sentences were translated
|
|
my $inputN;
|
|
my $outputN;
|
|
foreach my $idx (@idx_todo){
|
|
if ($inputtype==0){#text input
|
|
chomp($inputN=`wc -l ${inputfile}.$splitpfx$idx | cut -d' ' -f1`);
|
|
}
|
|
elsif ($inputtype==1){#confusion network input
|
|
chomp($inputN=`cat ${inputfile}.$splitpfx$idx | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' | wc -l |
|
|
cut -d' ' -f1 `);
|
|
}
|
|
elsif ($inputtype==2){#lattice input
|
|
chomp($inputN=`wc -l ${inputfile}.$splitpfx$idx | cut -d' ' -f1`);
|
|
}
|
|
else{#unknown input
|
|
die "INPUTTYPE:$inputtype is unknown!\n";
|
|
}
|
|
chomp($outputN=`wc -l ${inputfile}.$splitpfx$idx.trans | cut -d' ' -f1`);
|
|
|
|
if ($inputN != $outputN){
|
|
print STDERR "Split ($idx) were not entirely translated\n";
|
|
print STDERR "outputN=$outputN inputN=$inputN\n";
|
|
print STDERR "outputfile=${inputfile}.$splitpfx$idx.trans inputfile=${inputfile}.$splitpfx$idx\n";
|
|
return 1;
|
|
}
|
|
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
sub remove_temporary_files(){
|
|
#removing temporary files
|
|
foreach my $idx (@idxlist){
|
|
unlink("${inputfile}.${splitpfx}${idx}.trans");
|
|
unlink("${inputfile}.${splitpfx}${idx}");
|
|
if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); }
|
|
if (defined $detailsfile){ unlink("${detailsfile}.${splitpfx}${idx}"); }
|
|
if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); }
|
|
if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); }
|
|
if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); }
|
|
unlink("${jobscript}${idx}.bash");
|
|
unlink("${jobscript}${idx}.log");
|
|
unlink("$qsubname.W.log");
|
|
unlink("$qsubout$idx");
|
|
unlink("$qsuberr$idx");
|
|
rmdir("$tmpdir");
|
|
}
|
|
if ($nbestflag && $nbestlist[0] eq '-'){ unlink("${nbestfile}$$"); };
|
|
if ($searchgraphflag && $searchgraphlist eq '-'){ unlink("${searchgraphfile}$$"); };
|
|
if ($wordgraphflag && $wordgraphlist eq '-'){ unlink("${wordgraphfile}$$"); };
|
|
}
|
|
|
|
sub safesystem {
|
|
print STDERR "Executing: @_\n";
|
|
system(@_);
|
|
if ($? == -1) {
|
|
print STDERR "Failed to execute: @_\n $!\n";
|
|
exit(1);
|
|
}
|
|
elsif ($? & 127) {
|
|
printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
|
|
($? & 127), ($? & 128) ? 'with' : 'without';
|
|
exit 1;
|
|
}
|
|
else {
|
|
my $exitcode = $? >> 8;
|
|
print STDERR "Exit code: $exitcode\n" if $exitcode;
|
|
return ! $exitcode;
|
|
}
|
|
}
|
|
|
|
|
|
# look for the correct pwdcmd (pwd by default, pawd if it exists)
|
|
# I assume that pwd always exists
|
|
sub getPwdCmd(){
|
|
my $pwdcmd="pwd";
|
|
my $a;
|
|
chomp($a=`which pawd 2> /dev/null | head -1 | awk '{print $1}'`);
|
|
if ($a && -e $a){ $pwdcmd=$a; }
|
|
return $pwdcmd;
|
|
}
|
|
|