mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 11:28:48 +03:00
default kenlm training and inference in recaser
This commit is contained in:
parent
fc8e588f25
commit
1a82535cf8
@@ -10,8 +10,9 @@ binmode(STDOUT, ":utf8");
|
||||
|
||||
# apply switches
|
||||
my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG,$HELP,$ERROR);
|
||||
-my $LM = "SRILM"; # SRILM is default.
|
||||
+my $LM = "KENLM"; # KENLM is default.
|
||||
my $BUILD_LM = "build-lm.sh";
|
||||
+my $BUILD_KENLM = "$Bin/../../bin/lmplz";
|
||||
my $NGRAM_COUNT = "ngram-count";
|
||||
my $TRAIN_SCRIPT = "train-factored-phrase-model.perl";
|
||||
my $MAX_LEN = 1;
|
||||
@@ -25,6 +26,7 @@ $ERROR = "training Aborted."
|
||||
'dir=s' => \$DIR,
|
||||
'ngram-count=s' => \$NGRAM_COUNT,
|
||||
'build-lm=s' => \$BUILD_LM,
|
||||
+'build-kenlm=s' => \$BUILD_KENLM,
|
||||
'lm=s' => \$LM,
|
||||
'train-script=s' => \$TRAIN_SCRIPT,
|
||||
'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
|
||||
@@ -55,7 +57,7 @@ if ($HELP || $ERROR) {
|
||||
--max-len=int ... max phrase length (default: 1).
|
||||
|
||||
= Language Model Training configuration =
|
||||
---lm=[IRSTLM,SRILM] ... language model (default: SRILM).
|
||||
+--lm=[IRSTLM,SRILM,KENLM] ... language model (default: KENLM).
|
||||
--build-lm=file ... path to build-lm.sh if not in \$PATH (used only with --lm=IRSTLM).
|
||||
--ngram-count=file ... path to ngram-count.sh if not in \$PATH (used only with --lm=SRILM).
|
||||
|
||||
@@ -110,10 +112,14 @@ sub train_lm {
|
||||
if (uc $LM eq "IRSTLM") {
|
||||
$cmd = "$BUILD_LM -t /tmp -i $CORPUS -n 3 -o $DIR/cased.irstlm.gz";
|
||||
}
|
||||
-else {
|
||||
+elsif (uc $LM eq "SRILM") {
|
||||
$LM = "SRILM";
|
||||
$cmd = "$NGRAM_COUNT -text $CORPUS -lm $DIR/cased.srilm.gz -interpolate -kndiscount";
|
||||
}
|
||||
+else {
|
||||
+$LM = "KENLM";
|
||||
+$cmd = "$BUILD_KENLM --prune 0 0 1 -S 50% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.gz";
|
||||
+}
|
||||
print STDERR "** Using $LM **" . "\n";
|
||||
print STDERR $cmd."\n";
|
||||
system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . "\n");
|
||||
@@ -160,9 +166,12 @@ sub train_recase_model {
|
||||
if (uc $LM eq "IRSTLM") {
|
||||
$cmd .= " --lm 0:3:$DIR/cased.irstlm.gz:1";
|
||||
}
|
||||
-else {
|
||||
+elsif (uc $LM eq "SRILM") {
|
||||
$cmd .= " --lm 0:3:$DIR/cased.srilm.gz:8";
|
||||
}
|
||||
+else {
|
||||
+$cmd .= " --lm 0:3:$DIR/cased.kenlm.gz:8";
|
||||
+}
|
||||
$cmd .= " -config $CONFIG" if $CONFIG;
|
||||
print STDERR $cmd."\n";
|
||||
system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n");
|
||||
|
Loading…
Reference in New Issue
Block a user