mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 06:52:34 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
6aede90599
@ -62,4 +62,3 @@ Binaries for all external libraries needed can be downloaded from
|
||||
Only the decoder is developed and tested under Windows. There are difficulties using the training scripts under Windows, even with Cygwin.
|
||||
|
||||
|
||||
|
||||
|
@ -30,10 +30,10 @@ toy-data = $moses-script-dir/ems/example/data
|
||||
### basic tools
|
||||
#
|
||||
# moses decoder
|
||||
decoder = $moses-src-dir/moses-cmd/src/moses
|
||||
decoder = $moses-src-dir/dist/bin/moses
|
||||
|
||||
# conversion of phrase table into binary on-disk format
|
||||
ttable-binarizer = $moses-src-dir/misc/processPhraseTable
|
||||
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
|
||||
|
||||
# conversion of rule table into binary on-disk format
|
||||
#ttable-binarizer = "$moses-src-dir/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2"
|
||||
|
@ -86,15 +86,7 @@ function run_single_test () {
|
||||
|
||||
err=""
|
||||
|
||||
echo "## ./bjam clean" >> $longlog
|
||||
./bjam clean >> $longlog 2>&1 || warn "bjam clean failed, suspicious"
|
||||
|
||||
echo "## ./bjam $MCC_CONFIGURE_ARGS" >> $longlog
|
||||
if [ -z "$err" ]; then
|
||||
./bjam $MCC_CONFIGURE_ARGS >> $longlog 2>&1 || err="bjam"
|
||||
fi
|
||||
|
||||
cd regression-testing
|
||||
cd regression-testing
|
||||
regtest_file=$(echo "$REGTEST_ARCHIVE" | sed 's/^.*\///')
|
||||
|
||||
# download data for regression tests if necessary
|
||||
@ -104,15 +96,22 @@ function run_single_test () {
|
||||
tar xzf $regtest_file
|
||||
touch $regtest_file.ok
|
||||
fi
|
||||
regtest_dir=$PWD/$(basename $regtest_file .tgz)
|
||||
cd ..
|
||||
|
||||
|
||||
echo "## ./bjam clean" >> $longlog
|
||||
./bjam clean $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || warn "bjam clean failed, suspicious"
|
||||
|
||||
echo "## ./bjam $MCC_CONFIGURE_ARGS" >> $longlog
|
||||
if [ -z "$err" ]; then
|
||||
./bjam $MCC_CONFIGURE_ARGS >> $longlog 2>&1 || err="bjam"
|
||||
fi
|
||||
|
||||
echo "## regression tests" >> $longlog
|
||||
if [ -z "$err" ]; then
|
||||
./run-test-suite.perl &>> $longlog
|
||||
regtest_status=$?
|
||||
[ $regtest_status -eq 1 ] && die "Failed to run regression tests"
|
||||
[ $regtest_status -eq 2 ] && err="regression tests"
|
||||
./bjam $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || err="regression tests"
|
||||
fi
|
||||
cd ..
|
||||
|
||||
if [ -z "$err" ] && [ "$MCC_RUN_EMS" = "yes" ]; then
|
||||
echo "## EMS" >> $longlog
|
||||
|
@ -194,10 +194,9 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
catch (const std::exception &e) {
|
||||
std::cerr << e.what() << std::endl;
|
||||
std::cerr << "ERROR" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cerr << "SUCCESS" << std::endl;
|
||||
std::cerr << "KenLM build_binary SUCCESS" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
@ -56,7 +56,7 @@ private:
|
||||
|
||||
typedef map<vector<int>,int,CompareNgrams> counts_t;
|
||||
typedef map<vector<int>,int,CompareNgrams>::iterator counts_iterator;
|
||||
typedef map<vector<int>,int,CompareNgrams>::iterator counts_const_iterator;
|
||||
typedef map<vector<int>,int,CompareNgrams>::const_iterator counts_const_iterator;
|
||||
typedef ScopedVector<counts_t> refcounts_t;
|
||||
|
||||
/**
|
||||
|
@ -39,7 +39,7 @@ if $(location) {
|
||||
|
||||
install phrase-extract : training/phrase-extract//released-programs : <location>$(location)/training/phrase-extract ;
|
||||
install lexical-reordering : training/lexical-reordering//score : <location>$(location)/training/lexical-reordering ;
|
||||
install symal : training/symal//symal : <location>$(location)/symal ;
|
||||
install symal : training/symal//symal : <location>$(location)/training/symal ;
|
||||
|
||||
if $(WITH-GIZA) != no {
|
||||
install train-model : training//train-model.perl : <location>$(location)/training ;
|
||||
|
@ -8,7 +8,7 @@ binmode(STDIN, ":utf8");
|
||||
binmode(STDOUT, ":utf8");
|
||||
|
||||
# apply switches
|
||||
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG);
|
||||
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG,$HELP,$ERROR);
|
||||
my $LM = "SRILM"; # SRILM is default.
|
||||
my $BUILD_LM = "build-lm.sh";
|
||||
my $NGRAM_COUNT = "ngram-count";
|
||||
@ -16,24 +16,66 @@ my $TRAIN_SCRIPT = "train-factored-phrase-model.perl";
|
||||
my $MAX_LEN = 1;
|
||||
my $FIRST_STEP = 1;
|
||||
my $LAST_STEP = 11;
|
||||
die("train-recaser.perl --dir recaser --corpus cased")
|
||||
$ERROR = "training Aborted."
|
||||
unless &GetOptions('first-step=i' => \$FIRST_STEP,
|
||||
'last-step=i' => \$LAST_STEP,
|
||||
'corpus=s' => \$CORPUS,
|
||||
'config=s' => \$CONFIG,
|
||||
'dir=s' => \$DIR,
|
||||
'ngram-count=s' => \$NGRAM_COUNT,
|
||||
'build-lm=s' => \$BUILD_LM,
|
||||
'lm=s' => \$LM,
|
||||
'train-script=s' => \$TRAIN_SCRIPT,
|
||||
'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
|
||||
'max-len=i' => \$MAX_LEN);
|
||||
'dir=s' => \$DIR,
|
||||
'ngram-count=s' => \$NGRAM_COUNT,
|
||||
'build-lm=s' => \$BUILD_LM,
|
||||
'lm=s' => \$LM,
|
||||
'train-script=s' => \$TRAIN_SCRIPT,
|
||||
'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
|
||||
'max-len=i' => \$MAX_LEN,
|
||||
'help' => \$HELP);
|
||||
|
||||
# check and set default to unset parameters
|
||||
die("please specify working dir --dir") unless defined($DIR);
|
||||
die("please specify --corpus") if !defined($CORPUS)
|
||||
$ERROR = "please specify working dir --dir" unless defined($DIR) || defined($HELP);
|
||||
$ERROR = "please specify --corpus" if !defined($CORPUS) && !defined($HELP)
|
||||
&& $FIRST_STEP <= 2 && $LAST_STEP >= 1;
|
||||
|
||||
if ($HELP || $ERROR) {
|
||||
if ($ERROR) {
|
||||
print STDERR "ERROR: " . $ERROR . "\n";
|
||||
}
|
||||
print STDERR "Usage: $0 --dir /output/recaser --corpus /Cased/corpus/files [options ...]";
|
||||
|
||||
print STDERR "\n\nOptions:
|
||||
== MANDATORY ==
|
||||
--dir=dir ... outputted recaser directory.
|
||||
--corpus=file ... inputted cased corpus.
|
||||
|
||||
== OPTIONAL ==
|
||||
= Recaser Training configuration =
|
||||
--train-script=file ... path to the train script (default: train-factored-phrase-model.perl in \$PATH).
|
||||
--config=config ... training script configuration.
|
||||
--scripts-root-dir=dir ... scripts directory.
|
||||
--max-len=int ... max phrase length (default: 1).
|
||||
|
||||
= Language Model Training configuration =
|
||||
--lm=[IRSTLM,SRILM] ... language model (default: SRILM).
|
||||
--build-lm=file ... path to build-lm.sh if not in \$PATH (used only with --lm=IRSTLM).
|
||||
--ngram-count=file ... path to ngram-count.sh if not in \$PATH (used only with --lm=SRILM).
|
||||
|
||||
= Steps this script will perform =
|
||||
(1) Truecasing (disabled);
|
||||
(2) Language Model Training;
|
||||
(3) Data Preparation
|
||||
(4-10) Recaser Model Training;
|
||||
(11) Cleanup.
|
||||
--first-step=[1-11] ... step where script starts (default: 1).
|
||||
--last-step=[1-11] ... step where script ends (default: 11).
|
||||
|
||||
--help ... this usage output.\n";
|
||||
if ($ERROR) {
|
||||
exit(1);
|
||||
}
|
||||
else {
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
# main loop
|
||||
`mkdir -p $DIR`;
|
||||
&truecase() if 0 && $FIRST_STEP == 1;
|
||||
@ -60,7 +102,7 @@ sub train_lm {
|
||||
}
|
||||
print STDERR "** Using $LM **" . "\n";
|
||||
print STDERR $cmd."\n";
|
||||
print STDERR `$cmd`;
|
||||
system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . "\n");
|
||||
}
|
||||
|
||||
sub prepare_data {
|
||||
@ -110,12 +152,18 @@ sub train_recase_model {
|
||||
$cmd .= " -scripts-root-dir $SCRIPTS_ROOT_DIR" if $SCRIPTS_ROOT_DIR;
|
||||
$cmd .= " -config $CONFIG" if $CONFIG;
|
||||
print STDERR $cmd."\n";
|
||||
print STDERR `$cmd`;
|
||||
system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n");
|
||||
}
|
||||
|
||||
sub cleanup {
|
||||
print STDERR "\n(11) Cleaning up @ ".`date`;
|
||||
`rm -f $DIR/extract*`;
|
||||
my $clean_1 = $?;
|
||||
`rm -f $DIR/aligned*`;
|
||||
my $clean_2 = $?;
|
||||
`rm -f $DIR/lex*`;
|
||||
my $clean_3 = $?;
|
||||
if ($clean_1 + $clean_2 + $clean_3 != 0) {
|
||||
print STDERR "Training successful but some files could not be cleaned.\n";
|
||||
}
|
||||
}
|
||||
|
5
scripts/tokenizer/nonbreaking_prefixes/README.txt
Normal file
5
scripts/tokenizer/nonbreaking_prefixes/README.txt
Normal file
@ -0,0 +1,5 @@
|
||||
The language suffix can be found here:
|
||||
|
||||
http://www.loc.gov/standards/iso639-2/php/code_list.php
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user