From 57bcad0c5fa0d1461e6aab68c81f07d2b7f2b977 Mon Sep 17 00:00:00 2001 From: bojar Date: Mon, 31 Jul 2006 14:17:43 +0000 Subject: [PATCH] the cleanup of mert-moses seems to be finished added first simple 'make release' goal git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@405 1f5c12ca-751b-0410-a591-d2e778427230 --- scripts/Makefile | 18 ++++++ scripts/training/mert-moses.pl | 108 ++++++++++++++++++++++----------- 2 files changed, 91 insertions(+), 35 deletions(-) create mode 100644 scripts/Makefile diff --git a/scripts/Makefile b/scripts/Makefile new file mode 100644 index 000000000..94c6b9fe9 --- /dev/null +++ b/scripts/Makefile @@ -0,0 +1,18 @@ +# This makefile is here to simplify the automatic releases (and tests!!!) +# of the scripts + +TS?=$(shell date '+%Y%m%d-%H%M') +RELEASEDIR=/export/ws06osmt/bin/scripts-$(TS) + + +VALID_TRAINING_SCRIPTS_NAMES=filter-model-given-input.pl mert-moses.pl train-factored-phrase-model.perl +# Make trick to add directory name to all of them: +VALID_TRAINING_SCRIPTS=$(VALID_TRAINING_SCRIPTS_NAMES:%=training/%) + +# the list of all scripts that should be released +VALID_SCRIPTS= $(VALID_TRAINING_SCRIPTS) + +release: + if [ -e $(RELEASEDIR) ]; then echo "Targetdir exists! Not touching it! $(RELEASEDIR)"; exit 1; fi + mkdir -p $(RELEASEDIR) + cp $(VALID_SCRIPTS) $(RELEASEDIR) diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 7e312ddb6..29ec8e1e3 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -9,6 +9,7 @@ # Revision history +# 29 Jul 2006 run-filter, score-nbest and mert run on the queue (Nicola; Ondrej had to type it in again) # 28 Jul 2006 attempt at foolproof usage, strong checking of input validity, merged the parallel and nonparallel version (Ondrej Bojar) # 27 Jul 2006 adding the safesystem() function to handle with process failure # 22 Jul 2006 fixed a bug about handling relative path of configuration file (Nicola Bertoldi) @@ -84,10 +85,11 @@ my $___START_STEP = undef; # which iteration step to start with my $___AVERAGE = 0; my $bindir = undef; # path to all tools (overriden by specific options) -my $CMERTDIR = undef; # path to cmert directory +my $cmertdir = undef; # path to cmert directory my $pythoncmd = undef; # path to python executable my $filtercmd = undef; # path to filter-model-given-input.pl my $SCORENBESTCMD = undef; +my $qsubwrapper = undef; use strict; @@ -107,10 +109,11 @@ GetOptions( "average" => \$___AVERAGE, "help" => \$usage, "bindir=s" => \$bindir, - "cmertdir=s" => \$CMERTDIR, + "cmertdir=s" => \$cmertdir, "pythoncmd=s" => \$pythoncmd, "filtercmd=s" => \$filtercmd, # allow to override the default location "scorenbestcmd=s" => \$SCORENBESTCMD, # path to score-nbest.py + "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location ); # the 4 required parameters can be supplied on the command line directly @@ -154,19 +157,21 @@ $bindir = $ENV{"MOSESBIN"} if !defined $bindir; # path of script for filtering phrase tables and running the decoder $filtercmd="$bindir/filter-model-given-input.pl" if !defined $filtercmd; +$qsubwrapper="$bindir/qsub-wrapper.pl" if !defined $qsubwrapper; -$CMERTDIR = "$bindir/cmert-0.5" if !defined $CMERTDIR; -my $CMERT="$CMERTDIR/mert"; -$SCORENBESTCMD = "$CMERTDIR/score-nbest.py" if ! defined $SCORENBESTCMD; +$cmertdir = "$bindir/cmert-0.5" if !defined $cmertdir; +my $cmertcmd="$cmertdir/mert"; -$pythoncmd = "$CMERTDIR/python" if !defined $pythoncmd; +$SCORENBESTCMD = "$cmertdir/score-nbest.py" if ! defined $SCORENBESTCMD; + +$pythoncmd = "$cmertdir/python" if !defined $pythoncmd; $ENV{PYTHONPATH} = $pythoncmd; # other scripts need to know die "Not executable: $filtercmd" if ! -x $filtercmd; -die "Not executable: $CMERT" if ! -x $CMERT; +die "Not executable: $cmertcmd" if ! -x $cmertcmd; die "Not executable: $pythoncmd" if ! -x $pythoncmd; die "Not executable: $___DECODER" if ! -x $___DECODER; @@ -334,7 +339,8 @@ close(RANGES); # filter the phrase tables, use --decoder-flags print "filtering the phrase tables... ".`date`; -safesystem("$filtercmd ./filtered $___CONFIG $___DEV_F") or die "Failed to filter the tables"; +my $cmd = "$filtercmd ./filtered $___CONFIG $___DEV_F"; +safesystem("$qsubwrapper -command='$cmd'") or die "Failed to submit filtering of tables to the queue (via $qsubwrapper)"; # the decoder should now use the filtered model @@ -359,30 +365,38 @@ while(1) { } close(WEIGHTS); + # In case something dies later, we might wish to have a copy + create_config($___CONFIG, "./run$run.moses.ini", \@LAMBDA, \@NAME, $run, (defined$devbleu?$devbleu:"--not-estimated--")); + # skip if restarted if (!$skip_decoder) { print "($run) run decoder to produce n-best lists\n"; print "LAMBDAS are @LAMBDA\n"; run_decoder(\@LAMBDA); + safesystem("gzip -f run*out") or die "Failed to gzip run*out"; } else { print "skipped decoder run\n"; $skip_decoder = 0; } - safesystem("gzip -f run*out") or die "Failed to gzip run*out"; my $EFF_REF_LEN = ""; if ($___AVERAGE) { $EFF_REF_LEN = "-a"; } + # To be sure that scoring script produses these fresh: + safesystem("rm -f cands.opt feats.opt") or die; + # convert n-best list into a numberized format with error scores - safesystem("gunzip run*.best*.out.gz") or die "Failed to gunzip run*.best*.out.gz"; - print STDERR "Scoring the nbestlist or whatever.\n"; - my $cmd = "sort -mn -t \"|\" -k 1,1 run*.best*.out | $SCORENBESTCMD $EFF_REF_LEN ".join(" ", @references)." ./"; - safesystem("$cmd") or die "Failed to score-nbest list or whatever."; - safesystem("gzip -f run*.best*.out") or die; + + print STDERR "Scoring the nbestlist.\n"; + my $cmd = "export PYTHONPATH=$pythoncmd ; gunzip -dc run*.best*.out.gz | sort -n -t \"|\" -k 1,1 | $SCORENBESTCMD $EFF_REF_LEN ".join(" ", @references)." ./"; + safesystem("$qsubwrapper -command='$cmd'") or die "Failed to submit scoring nbestlist to queue (via $qsubwrapper)"; + + + print STDERR "Hoping that scoring succeeded. Don't know how to check for it! XXX.\n"; # keep a count of lines in nbests lists (alltogether) @@ -406,15 +420,16 @@ while(1) { # run cmert safesystem("cat ranges.txt weights.txt > init.opt") or die; - safesystem("rm -f weights.txt") or die; + safesystem("mv weights.txt run$run.input_weights.txt") or die; # keep a copy of the weights #store actual values safesystem("cp init.opt run$run.init.opt") or die; my $DIM = scalar(@LAMBDA); # number of lambdas - - print STDERR "Running cmert.\n"; - safesystem("$CMERT -d $DIM 2> cmert.log") or die; + $cmd="$cmertcmd -d $DIM"; + + print STDERR "Starting cmert.\n"; + safesystem("$qsubwrapper -command='$cmd' -stderr=cmert.log") or die "Failed to start cmert (via qsubwrapper $qsubwrapper)"; my $bestpoint = undef; my $devbleu = undef; @@ -447,9 +462,9 @@ safesystem ("cp cmert.log run$run.cmert.log") or die; # This is fine, because the new attempt did not bring any improvement, # so we do not want to use it. # @NAME are the names of models the lambdas belong to -create_config(@LAMBDA, @NAME); +create_config($___CONFIG, "./moses.ini", \@LAMBDA, \@NAME, $run, $devbleu); -#chdir back to the original directory +#chdir back to the original directory # useless, just to remind we were not there chdir($cwd); sub run_decoder { @@ -473,38 +488,58 @@ sub run_decoder { } sub create_config { + my $infn = shift; # source config + my $outfn = shift; # where to save the config my $lambdas = shift; # the lambdas we should write - my @lambdas = @$lambdas; + my @lambdas = @$lambdas; # my own copy of the array my $names = shift; # the names of the lambdas - my @names = @$names; + my @names = @$names; # my own copy of the array + my $run = shift; # just for verbosity + my $devbleu = shift; # just for verbosity - my %P; - # parameters specified at the command line - { - my $parameter; - print "PARAM IS |$___DECODER_FLAGS|\n"; + my %P; # the hash of all parameters we wish to override + + # first convert the command line parameters to the hash + { # ensure local scope of vars + my $parameter=undef; + print "Parsing --decoder-flags: |$___DECODER_FLAGS|\n"; + $___DECODER_FLAGS =~ s/^\s*|\s*$//; + $___DECODER_FLAGS =~ s/\s+/ /; foreach (split(/ /,$___DECODER_FLAGS)) { - print "$_ :::\n"; if (/^\-([^\d].*)$/) { $parameter = $1; $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter}); - print "\tis parameter $parameter\n"; } else { + die "Found value with no -paramname before it: $_" + if !defined $parameter; push @{$P{$parameter}},$_; } } } + # Convert weights to elements in P + # First delete all weights params from the input + foreach my $abbr (@names) { + my $name = defined $ABBR2FULL{$abbr} ? $ABBR2FULL{$abbr} : $abbr; + delete($P{$name}); + } + while (my $abbr = shift @names) { + my $w = shift @lambdas; + die "Lambdas and names do not have equal length!" if !defined $w; + my $name = defined $ABBR2FULL{$abbr} ? $ABBR2FULL{$abbr} : $abbr; + push @{$P{$name}}, $w; + } - # create new moses.ini decoder config file - open(INI,$P{"config"}[0]); - delete($P{"config"}); - print "OUT: > moses.ini\n"; - open(OUT,"> moses.ini"); + + # create new moses.ini decoder config file by cloning and overriding the original one + open(INI,$infn) or die "Can't read $infn"; + delete($P{"config"}); # never output + print "Saving new config to: $outfn"; + open(OUT,"> $outfn") or die "Can't write $outfn"; print OUT "# MERT optimized configuration\n"; print OUT "# decoder $___DECODER\n"; - print OUT "# $devbleu on dev $___DEV_F\n"; + print OUT "# BLEU $devbleu on dev $___DEV_F\n"; print OUT "# $run iterations\n"; print OUT "# finished ".`date`; my $line = ; @@ -546,6 +581,7 @@ sub create_config { } } + # write all additional parameters foreach my $parameter (keys %P) { print OUT "\n[$parameter]\n"; foreach (@{$P{$parameter}}) { @@ -555,6 +591,7 @@ sub create_config { close(INI); close(OUT); + print STDERR "Saved: $outfn\n"; } sub safesystem { @@ -632,6 +669,7 @@ sub scan_config { next; } if (defined $section && $section eq "mapping") { + # keep track of mapping steps used $defined_steps{$1}++ if /^([TG])/; } if (defined $section && defined $where_is_filename{$section}) {