Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Kenneth Heafield 2011-11-30 08:53:45 +00:00
commit 6aede90599
8 changed files with 84 additions and 34 deletions

View File

@ -62,4 +62,3 @@ Binaries for all external libraries needed can be downloaded from
Only the decoder is developed and tested under Windows. There are difficulties using the training scripts under Windows, even with Cygwin.

View File

@ -30,10 +30,10 @@ toy-data = $moses-script-dir/ems/example/data
### basic tools
#
# moses decoder
decoder = $moses-src-dir/moses-cmd/src/moses
decoder = $moses-src-dir/dist/bin/moses
# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/misc/processPhraseTable
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2"

View File

@ -86,15 +86,7 @@ function run_single_test () {
err=""
echo "## ./bjam clean" >> $longlog
./bjam clean >> $longlog 2>&1 || warn "bjam clean failed, suspicious"
echo "## ./bjam $MCC_CONFIGURE_ARGS" >> $longlog
if [ -z "$err" ]; then
./bjam $MCC_CONFIGURE_ARGS >> $longlog 2>&1 || err="bjam"
fi
cd regression-testing
cd regression-testing
regtest_file=$(echo "$REGTEST_ARCHIVE" | sed 's/^.*\///')
# download data for regression tests if necessary
@ -104,15 +96,22 @@ function run_single_test () {
tar xzf $regtest_file
touch $regtest_file.ok
fi
regtest_dir=$PWD/$(basename $regtest_file .tgz)
cd ..
echo "## ./bjam clean" >> $longlog
./bjam clean $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || warn "bjam clean failed, suspicious"
echo "## ./bjam $MCC_CONFIGURE_ARGS" >> $longlog
if [ -z "$err" ]; then
./bjam $MCC_CONFIGURE_ARGS >> $longlog 2>&1 || err="bjam"
fi
echo "## regression tests" >> $longlog
if [ -z "$err" ]; then
./run-test-suite.perl &>> $longlog
regtest_status=$?
[ $regtest_status -eq 1 ] && die "Failed to run regression tests"
[ $regtest_status -eq 2 ] && err="regression tests"
./bjam $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || err="regression tests"
fi
cd ..
if [ -z "$err" ] && [ "$MCC_RUN_EMS" = "yes" ]; then
echo "## EMS" >> $longlog

View File

@ -194,10 +194,9 @@ int main(int argc, char *argv[]) {
}
catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
std::cerr << "ERROR" << std::endl;
return 1;
}
std::cerr << "SUCCESS" << std::endl;
std::cerr << "KenLM build_binary SUCCESS" << std::endl;
return 0;
}

View File

@ -56,7 +56,7 @@ private:
typedef map<vector<int>,int,CompareNgrams> counts_t;
typedef map<vector<int>,int,CompareNgrams>::iterator counts_iterator;
typedef map<vector<int>,int,CompareNgrams>::iterator counts_const_iterator;
typedef map<vector<int>,int,CompareNgrams>::const_iterator counts_const_iterator;
typedef ScopedVector<counts_t> refcounts_t;
/**

View File

@ -39,7 +39,7 @@ if $(location) {
install phrase-extract : training/phrase-extract//released-programs : <location>$(location)/training/phrase-extract ;
install lexical-reordering : training/lexical-reordering//score : <location>$(location)/training/lexical-reordering ;
install symal : training/symal//symal : <location>$(location)/symal ;
install symal : training/symal//symal : <location>$(location)/training/symal ;
if $(WITH-GIZA) != no {
install train-model : training//train-model.perl : <location>$(location)/training ;

View File

@ -8,7 +8,7 @@ binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
# apply switches
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG);
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG,$HELP,$ERROR);
my $LM = "SRILM"; # SRILM is default.
my $BUILD_LM = "build-lm.sh";
my $NGRAM_COUNT = "ngram-count";
@ -16,24 +16,66 @@ my $TRAIN_SCRIPT = "train-factored-phrase-model.perl";
my $MAX_LEN = 1;
my $FIRST_STEP = 1;
my $LAST_STEP = 11;
die("train-recaser.perl --dir recaser --corpus cased")
$ERROR = "training Aborted."
unless &GetOptions('first-step=i' => \$FIRST_STEP,
'last-step=i' => \$LAST_STEP,
'corpus=s' => \$CORPUS,
'config=s' => \$CONFIG,
'dir=s' => \$DIR,
'ngram-count=s' => \$NGRAM_COUNT,
'build-lm=s' => \$BUILD_LM,
'lm=s' => \$LM,
'train-script=s' => \$TRAIN_SCRIPT,
'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
'max-len=i' => \$MAX_LEN);
'dir=s' => \$DIR,
'ngram-count=s' => \$NGRAM_COUNT,
'build-lm=s' => \$BUILD_LM,
'lm=s' => \$LM,
'train-script=s' => \$TRAIN_SCRIPT,
'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
'max-len=i' => \$MAX_LEN,
'help' => \$HELP);
# check and set default to unset parameters
die("please specify working dir --dir") unless defined($DIR);
die("please specify --corpus") if !defined($CORPUS)
$ERROR = "please specify working dir --dir" unless defined($DIR) || defined($HELP);
$ERROR = "please specify --corpus" if !defined($CORPUS) && !defined($HELP)
&& $FIRST_STEP <= 2 && $LAST_STEP >= 1;
if ($HELP || $ERROR) {
if ($ERROR) {
print STDERR "ERROR: " . $ERROR . "\n";
}
print STDERR "Usage: $0 --dir /output/recaser --corpus /Cased/corpus/files [options ...]";
print STDERR "\n\nOptions:
== MANDATORY ==
--dir=dir ... outputted recaser directory.
--corpus=file ... inputted cased corpus.
== OPTIONAL ==
= Recaser Training configuration =
--train-script=file ... path to the train script (default: train-factored-phrase-model.perl in \$PATH).
--config=config ... training script configuration.
--scripts-root-dir=dir ... scripts directory.
--max-len=int ... max phrase length (default: 1).
= Language Model Training configuration =
--lm=[IRSTLM,SRILM] ... language model (default: SRILM).
--build-lm=file ... path to build-lm.sh if not in \$PATH (used only with --lm=IRSTLM).
--ngram-count=file ... path to ngram-count.sh if not in \$PATH (used only with --lm=SRILM).
= Steps this script will perform =
(1) Truecasing (disabled);
(2) Language Model Training;
(3) Data Preparation
(4-10) Recaser Model Training;
(11) Cleanup.
--first-step=[1-11] ... step where script starts (default: 1).
--last-step=[1-11] ... step where script ends (default: 11).
--help ... this usage output.\n";
if ($ERROR) {
exit(1);
}
else {
exit(0);
}
}
# main loop
`mkdir -p $DIR`;
&truecase() if 0 && $FIRST_STEP == 1;
@ -60,7 +102,7 @@ sub train_lm {
}
print STDERR "** Using $LM **" . "\n";
print STDERR $cmd."\n";
print STDERR `$cmd`;
system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . "\n");
}
sub prepare_data {
@ -110,12 +152,18 @@ sub train_recase_model {
$cmd .= " -scripts-root-dir $SCRIPTS_ROOT_DIR" if $SCRIPTS_ROOT_DIR;
$cmd .= " -config $CONFIG" if $CONFIG;
print STDERR $cmd."\n";
print STDERR `$cmd`;
system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n");
}
# Step (11): delete the intermediate files whose names start with "extract",
# "aligned" or "lex" from the working directory $DIR (the file-level variable
# filled in from --dir).
#
# Takes no arguments and returns nothing useful.  Failures are deliberately
# non-fatal: by the time this step runs the models have been trained, so a
# leftover temporary file is only reported on STDERR.
sub cleanup {
    print STDERR "\n(11) Cleaning up @ ".`date`;

    # Remove the files in-process rather than via `rm -f $DIR/...*`: the
    # shell would split or interpret a $DIR containing spaces or
    # metacharacters, and backticks would capture output nobody reads.
    my $failures = 0;
    if (opendir(my $dh, $DIR)) {
        foreach my $entry (readdir($dh)) {
            # Same name prefixes the shell globs extract*, aligned*, lex* matched.
            next unless $entry =~ /\A(?:extract|aligned|lex)/;
            my $path = "$DIR/$entry";
            next if unlink($path);
            print STDERR "could not remove $path: $!\n";
            $failures++;
        }
        closedir($dh);
    }
    else {
        my $reason = $!;    # save before the file test below can touch $!
        # A missing directory has nothing to clean (matches rm -f, which
        # succeeds silently); an unreadable one is a real failure.
        if (-e $DIR) {
            print STDERR "could not read $DIR: $reason\n";
            $failures++;
        }
    }

    if ($failures != 0) {
        print STDERR "Training successful but some files could not be cleaned.\n";
    }
}

View File

@ -0,0 +1,5 @@
The language suffix can be found here:
http://www.loc.gov/standards/iso639-2/php/code_list.php