From 30febce3e853d46d6f4f896ab511fad6ec652fd3 Mon Sep 17 00:00:00 2001 From: Jehan Date: Fri, 25 Nov 2011 17:21:55 +0000 Subject: [PATCH 1/3] - Help output for train-recaser script. --- scripts/recaser/train-recaser.perl | 64 +++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl index 8a2b17ede..d5e6c4ef1 100755 --- a/scripts/recaser/train-recaser.perl +++ b/scripts/recaser/train-recaser.perl @@ -16,24 +16,68 @@ my $TRAIN_SCRIPT = "train-factored-phrase-model.perl"; my $MAX_LEN = 1; my $FIRST_STEP = 1; my $LAST_STEP = 11; -die("train-recaser.perl --dir recaser --corpus cased") +my $HELP = 0; +my $ERROR = 0; +$ERROR = "training Aborted." unless &GetOptions('first-step=i' => \$FIRST_STEP, 'last-step=i' => \$LAST_STEP, 'corpus=s' => \$CORPUS, 'config=s' => \$CONFIG, - 'dir=s' => \$DIR, - 'ngram-count=s' => \$NGRAM_COUNT, - 'build-lm=s' => \$BUILD_LM, - 'lm=s' => \$LM, - 'train-script=s' => \$TRAIN_SCRIPT, - 'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR, - 'max-len=i' => \$MAX_LEN); + 'dir=s' => \$DIR, + 'ngram-count=s' => \$NGRAM_COUNT, + 'build-lm=s' => \$BUILD_LM, + 'lm=s' => \$LM, + 'train-script=s' => \$TRAIN_SCRIPT, + 'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR, + 'max-len=i' => \$MAX_LEN, + 'help' => \$HELP); # check and set default to unset parameters -die("please specify working dir --dir") unless defined($DIR); -die("please specify --corpus") if !defined($CORPUS) +$ERROR = "please specify working dir --dir" unless defined($DIR); +$ERROR = "please specify --corpus" if !defined($CORPUS) && $FIRST_STEP <= 2 && $LAST_STEP >= 1; +if ($HELP || $ERROR) { + if ($ERROR) { + print STDERR "ERROR: " . $ERROR . "\n"; + } + print STDERR "Usage: $0 --dir /output/recaser --corpus /Cased/corpus/files [options ...]"; + + print STDERR "\n\nOptions: + == MANDATORY == + --dir=dir ... outputted recaser directory. + --corpus=file ... inputted cased corpus. 
+ + == OPTIONAL == + = Recaser Training configuration = + --train-script=file ... path to the train script (default: train-factored-phrase-model.perl in \$PATH). + --config=config ... training script configuration. + --scripts-root-dir=dir ... scripts directory. + --max-len=int ... max phrase length (default: 1). + + = Language Model Training configuration = + --lm=[IRSTLM,SRILM] ... language model (default: SRILM). + --build-lm=file ... path to build-lm.sh if not in \$PATH (used only with --lm=IRSTLM). + --ngram-count=file ... path to ngram-count.sh if not in \$PATH (used only with --lm=SRILM). + + = Steps this script will perform = + (1) Truecasing (disabled); + (2) Language Model Training; + (3) Data Preparation + (4-10) Recaser Model Training; + (11) Cleanup. + --first-step=[1-11] ... step where script starts (default: 1). + --last-step=[1-11] ... step where script ends (default: 11). + + --help ... this usage output.\n"; + if ($ERROR) { + exit(1); + } + else { + exit(0); + } +} + # main loop `mkdir -p $DIR`; &truecase() if 0 && $FIRST_STEP == 1; From d875b0774bcff8966ec9261ccdd29afade67b55c Mon Sep 17 00:00:00 2001 From: Jehan Date: Sun, 27 Nov 2011 09:55:30 +0000 Subject: [PATCH 2/3] - Exit with failure when a step of train-recaser.perl fails. It is kind of hard to identify the cause of a problem (or even to see there is a problem) if a script continues when a main step failed. Better to exit when the error occurs with relevant logs. --- scripts/recaser/train-recaser.perl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl index d5e6c4ef1..5d1ef0b4a 100755 --- a/scripts/recaser/train-recaser.perl +++ b/scripts/recaser/train-recaser.perl @@ -104,7 +104,7 @@ sub train_lm { } print STDERR "** Using $LM **" . "\n"; print STDERR $cmd."\n"; - print STDERR `$cmd`; + system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . 
"\n"); } sub prepare_data { @@ -154,12 +154,18 @@ sub train_recase_model { $cmd .= " -scripts-root-dir $SCRIPTS_ROOT_DIR" if $SCRIPTS_ROOT_DIR; $cmd .= " -config $CONFIG" if $CONFIG; print STDERR $cmd."\n"; - print STDERR `$cmd`; + system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n"); } sub cleanup { print STDERR "\n(11) Cleaning up @ ".`date`; `rm -f $DIR/extract*`; + my $clean_1 = $?; `rm -f $DIR/aligned*`; + my $clean_2 = $?; `rm -f $DIR/lex*`; + my $clean_3 = $?; + if ($clean_1 + $clean_2 + $clean_3 != 0) { + print STDERR "Training successful but some files could not be cleaned.\n"; + } } From f3cb3ad78971bb9f86b692345fef169a851d7651 Mon Sep 17 00:00:00 2001 From: Jehan Date: Sun, 27 Nov 2011 10:14:39 +0000 Subject: [PATCH 3/3] - Bug fix: when --help set, errors on absence of --corpus or --dir must not be displayed. - Unset variables must not be set as 0. --- scripts/recaser/train-recaser.perl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl index 5d1ef0b4a..a5a707554 100755 --- a/scripts/recaser/train-recaser.perl +++ b/scripts/recaser/train-recaser.perl @@ -8,7 +8,7 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); # apply switches -my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG); +my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG,$HELP,$ERROR); my $LM = "SRILM"; # SRILM is default. my $BUILD_LM = "build-lm.sh"; my $NGRAM_COUNT = "ngram-count"; @@ -16,8 +16,6 @@ my $TRAIN_SCRIPT = "train-factored-phrase-model.perl"; my $MAX_LEN = 1; my $FIRST_STEP = 1; my $LAST_STEP = 11; -my $HELP = 0; -my $ERROR = 0; $ERROR = "training Aborted." unless &GetOptions('first-step=i' => \$FIRST_STEP, 'last-step=i' => \$LAST_STEP, @@ -33,8 +31,8 @@ $ERROR = "training Aborted." 
'help' => \$HELP); # check and set default to unset parameters -$ERROR = "please specify working dir --dir" unless defined($DIR); -$ERROR = "please specify --corpus" if !defined($CORPUS) +$ERROR = "please specify working dir --dir" unless defined($DIR) || defined($HELP); +$ERROR = "please specify --corpus" if !defined($CORPUS) && !defined($HELP) && $FIRST_STEP <= 2 && $LAST_STEP >= 1; if ($HELP || $ERROR) {