Merge branch 'master' of github.com:moses-smt/mosesdecoder

2024-12-29 06:52:34 +03:00 · 2011-11-30 08:53:45 +00:00 · 2011-11-30 08:53:45 +00:00 · 6aede90599
commit 6aede90599
parent d4099a7455 55e1c94dee
8 changed files with 84 additions and 34 deletions
--- a/BUILD-INSTRUCTIONS.txt
+++ b/BUILD-INSTRUCTIONS.txt
@@ -62,4 +62,3 @@ Binaries for all external libraries needed can be downloaded from
 Only the decoder is developed and tested under Windows. There are difficulties using the training scripts under Windows, even with Cygwin.


-
--- a/cruise-control/config.ems
+++ b/cruise-control/config.ems
@@ -30,10 +30,10 @@ toy-data = $moses-script-dir/ems/example/data
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/moses-cmd/src/moses
+decoder = $moses-src-dir/dist/bin/moses

 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/misc/processPhraseTable
+ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable

 # conversion of rule table into binary on-disk format
 #ttable-binarizer = "$moses-src-dir/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2"
--- a/cruise-control/test_all_new_commits.sh
+++ b/cruise-control/test_all_new_commits.sh
@@ -86,15 +86,7 @@ function run_single_test () {

  err=""

-  echo "## ./bjam clean" >> $longlog
-  ./bjam clean >> $longlog 2>&1 || warn "bjam clean failed, suspicious"
-
-  echo "## ./bjam $MCC_CONFIGURE_ARGS" >> $longlog
-  if [ -z "$err" ]; then
-    ./bjam $MCC_CONFIGURE_ARGS >> $longlog 2>&1 || err="bjam"
-  fi
-
-  cd regression-testing
+   cd regression-testing
  regtest_file=$(echo "$REGTEST_ARCHIVE" | sed 's/^.*\///')

  # download data for regression tests if necessary
@@ -104,15 +96,22 @@ function run_single_test () {
    tar xzf $regtest_file
    touch $regtest_file.ok
  fi
+  regtest_dir=$PWD/$(basename $regtest_file .tgz)
+  cd ..

+
+  echo "## ./bjam clean" >> $longlog
+  ./bjam clean $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || warn "bjam clean failed, suspicious"
+
+  echo "## ./bjam $MCC_CONFIGURE_ARGS" >> $longlog
+  if [ -z "$err" ]; then
+    ./bjam $MCC_CONFIGURE_ARGS >> $longlog 2>&1 || err="bjam"
+  fi
+  
  echo "## regression tests" >> $longlog
  if [ -z "$err" ]; then
-    ./run-test-suite.perl &>> $longlog
-    regtest_status=$?
-    [ $regtest_status -eq 1 ] && die "Failed to run regression tests"
-    [ $regtest_status -eq 2 ] && err="regression tests"
+    ./bjam $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || err="regression tests"
  fi
-  cd ..

  if [ -z "$err" ] && [ "$MCC_RUN_EMS" = "yes" ]; then
    echo "## EMS" >> $longlog
--- a/lm/build_binary.cc
+++ b/lm/build_binary.cc
@@ -194,10 +194,9 @@ int main(int argc, char *argv[]) {
  }
  catch (const std::exception &e) {
    std::cerr << e.what() << std::endl;
-    std::cerr << "ERROR" << std::endl;
    return 1;
  }

-  std::cerr << "SUCCESS" << std::endl;
+  std::cerr << "KenLM build_binary SUCCESS" << std::endl;
  return 0;
 }
--- a/mert/BleuScorer.h
+++ b/mert/BleuScorer.h
@@ -56,7 +56,7 @@ private:

  typedef map<vector<int>,int,CompareNgrams> counts_t;
  typedef map<vector<int>,int,CompareNgrams>::iterator counts_iterator;
-  typedef map<vector<int>,int,CompareNgrams>::iterator counts_const_iterator;
+  typedef map<vector<int>,int,CompareNgrams>::const_iterator counts_const_iterator;
  typedef ScopedVector<counts_t> refcounts_t;

  /**
--- a/scripts/Jamfile
+++ b/scripts/Jamfile
@@ -39,7 +39,7 @@ if $(location) {

  install phrase-extract : training/phrase-extract//released-programs : <location>$(location)/training/phrase-extract ;
  install lexical-reordering : training/lexical-reordering//score : <location>$(location)/training/lexical-reordering ;
-  install symal : training/symal//symal : <location>$(location)/symal ;
+  install symal : training/symal//symal : <location>$(location)/training/symal ;

  if $(WITH-GIZA) != no {
    install train-model : training//train-model.perl : <location>$(location)/training ;
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@@ -8,7 +8,7 @@ binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");

 # apply switches
-my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG);
+my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG,$HELP,$ERROR);
 my $LM = "SRILM"; # SRILM is default.
 my $BUILD_LM = "build-lm.sh";
 my $NGRAM_COUNT = "ngram-count";
@@ -16,24 +16,66 @@ my $TRAIN_SCRIPT = "train-factored-phrase-model.perl";
 my $MAX_LEN = 1;
 my $FIRST_STEP = 1;
 my $LAST_STEP = 11;
-die("train-recaser.perl --dir recaser --corpus cased")
+$ERROR = "training Aborted."
    unless &GetOptions('first-step=i' => \$FIRST_STEP,
                       'last-step=i' => \$LAST_STEP,
                       'corpus=s' => \$CORPUS,
                       'config=s' => \$CONFIG,
-		       'dir=s' => \$DIR,
-		       'ngram-count=s' => \$NGRAM_COUNT,
-		       'build-lm=s' => \$BUILD_LM,
-		       'lm=s' => \$LM,
-		       'train-script=s' => \$TRAIN_SCRIPT,
-		       'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
-		       'max-len=i' => \$MAX_LEN);
+                       'dir=s' => \$DIR,
+                       'ngram-count=s' => \$NGRAM_COUNT,
+                       'build-lm=s' => \$BUILD_LM,
+                       'lm=s' => \$LM,
+                       'train-script=s' => \$TRAIN_SCRIPT,
+                       'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
+                       'max-len=i' => \$MAX_LEN,
+                       'help' => \$HELP);

 # check and set default to unset parameters
-die("please specify working dir --dir") unless defined($DIR);
-die("please specify --corpus") if !defined($CORPUS) 
+$ERROR = "please specify working dir --dir" unless defined($DIR) || defined($HELP);
+$ERROR = "please specify --corpus" if !defined($CORPUS) && !defined($HELP) 
                                  && $FIRST_STEP <= 2 && $LAST_STEP >= 1;

+if ($HELP || $ERROR) {
+    if ($ERROR) {
+        print STDERR "ERROR: " . $ERROR . "\n";
+    }
+    print STDERR "Usage: $0 --dir /output/recaser --corpus /Cased/corpus/files [options ...]";
+
+    print STDERR "\n\nOptions:
+  == MANDATORY ==
+  --dir=dir                 ... outputted recaser directory.
+  --corpus=file             ... inputted cased corpus.
+
+  == OPTIONAL ==
+  = Recaser Training configuration =
+  --train-script=file       ... path to the train script (default: train-factored-phrase-model.perl in \$PATH).
+  --config=config           ... training script configuration.
+  --scripts-root-dir=dir    ... scripts directory.
+  --max-len=int             ... max phrase length (default: 1).
+
+  = Language Model Training configuration =
+  --lm=[IRSTLM,SRILM]       ... language model (default: SRILM).
+  --build-lm=file           ... path to build-lm.sh if not in \$PATH (used only with --lm=IRSTLM).
+  --ngram-count=file        ... path to ngram-count.sh if not in \$PATH (used only with --lm=SRILM).
+
+  = Steps this script will perform =
+  (1) Truecasing (disabled);
+  (2) Language Model Training;
+  (3) Data Preparation
+  (4-10) Recaser Model Training; 
+  (11) Cleanup.
+  --first-step=[1-11]       ... step where script starts (default: 1).
+  --last-step=[1-11]        ... step where script ends (default: 11).
+
+  --help                    ... this usage output.\n";
+  if ($ERROR) {
+    exit(1);
+  }
+  else {
+    exit(0);
+  }
+}
+
 # main loop
 `mkdir -p $DIR`;
 &truecase()           if 0 && $FIRST_STEP == 1;
@@ -60,7 +102,7 @@ sub train_lm {
    }
    print STDERR "** Using $LM **" . "\n";
    print STDERR $cmd."\n";
-    print STDERR `$cmd`;
+    system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . "\n");
 }

 sub prepare_data {
@@ -110,12 +152,18 @@ sub train_recase_model {
    $cmd .= " -scripts-root-dir $SCRIPTS_ROOT_DIR" if $SCRIPTS_ROOT_DIR;
    $cmd .= " -config $CONFIG" if $CONFIG;
    print STDERR $cmd."\n";
-    print STDERR `$cmd`;
+    system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n");
 }

 sub cleanup {
    print STDERR "\n(11) Cleaning up @ ".`date`;
    `rm -f $DIR/extract*`;
+    my $clean_1 = $?;
    `rm -f $DIR/aligned*`;
+    my $clean_2 = $?;
    `rm -f $DIR/lex*`;
+    my $clean_3 = $?;
+    if ($clean_1 + $clean_2 + $clean_3 != 0) {
+        print STDERR "Training successful but some files could not be cleaned.\n";
+    }
 }
--- a/scripts/tokenizer/nonbreaking_prefixes/README.txt
+++ b/scripts/tokenizer/nonbreaking_prefixes/README.txt
@@ -0,0 +1,5 @@
+The language suffix can be found here:
+
+http://www.loc.gov/standards/iso639-2/php/code_list.php
+
+
				`@ -62,4 +62,3 @@ Binaries for all external libraries needed can be downloaded from`
				`Only the decoder is developed and tested under Windows. There are difficulties using the training scripts under Windows, even with Cygwin.`