mirror of https://github.com/moses-smt/mosesdecoder.git (synced 2024-12-29 06:52:34 +03:00)

allow for more than 10 language models by explicit or automatic grouping

commit 8d9c93e1aa (parent b9622d0da3)

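The hunks below cover several EMS example configuration files, what appear to be the EMS step definitions and driver (experiment.meta and experiment.perl), and the support script interpolate-lm.perl. The [INTERPOLATED-LM] section of the configs gains a group setting, and interpolate-lm.perl gains a matching --group option: models named together in a comma-separated group are first interpolated into an intermediate LM, groups are separated by spaces, and the intermediate LMs (together with any ungrouped models) are then interpolated with each other, so that each single interpolation stays within the script's 10-model limit. A minimal sketch of the explicit setting, reusing the placeholder set names from the example configs:

[INTERPOLATED-LM]
### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
group = "first,second fourth,fifth"

If group is not set and more than 10 language models are defined, interpolate-lm.perl now constructs the groups automatically.
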
@@ -34,7 +34,7 @@ irstlm-dir = $moses-src-dir/irstlm/bin
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
wmt12-data = $working-dir/data

### basic tools
#
@@ -104,7 +104,7 @@ max-sentence-length = 80

### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension

### tokenized corpus files (may contain long sentences)
#
@@ -121,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
#lowercased-stem =

[CORPUS:nc]
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension

[CORPUS:un] IGNORE
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension

#################################################################
# LANGUAGE MODEL TRAINING
@@ -178,7 +178,7 @@ type = 8

### raw corpus (untokenized)
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension

### tokenized corpus files (may contain long sentences)
#
@@ -190,13 +190,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
#lm =

[LM:nc]
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension

[LM:un] IGNORE
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension

[LM:news] IGNORE
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled

#################################################################
@@ -216,13 +216,17 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

@@ -374,13 +378,13 @@ tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@@ -521,11 +525,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos

[EVALUATION:newstest2009]
[EVALUATION:newstest2011]

### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@@ -533,7 +537,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm

### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

@@ -34,7 +34,7 @@ irstlm-dir = $moses-src-dir/irstlm/bin
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
wmt12-data = $working-dir/data

### basic tools
#
@@ -104,7 +104,7 @@ max-sentence-length = 80

### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension

### tokenized corpus files (may contain long sentences)
#
@@ -121,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
#lowercased-stem =

[CORPUS:nc]
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension

[CORPUS:un] IGNORE
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension

#################################################################
# LANGUAGE MODEL TRAINING
@@ -178,7 +178,7 @@ order = 5

### raw corpus (untokenized)
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension

### tokenized corpus files (may contain long sentences)
#
@@ -190,19 +190,19 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
#lm =

[LM:nc]
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension

[LM:un] IGNORE
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension

[LM:news] IGNORE
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled

[LM:nc=pos]
factors = "pos"
order = 7
settings = "-interpolate -unk"
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension

#################################################################
# INTERPOLATING LANGUAGE MODELS
@@ -221,13 +221,17 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

@@ -394,13 +398,13 @@ tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@@ -541,11 +545,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos

[EVALUATION:newstest2009]
[EVALUATION:newstest2011]

### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@@ -553,7 +557,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm

### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

@@ -34,7 +34,7 @@ irstlm-dir = $moses-src-dir/irstlm/bin
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
wmt12-data = $working-dir/data

### basic tools
#
@@ -104,7 +104,7 @@ max-sentence-length = 80

### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension

### tokenized corpus files (may contain long sentences)
#
@@ -121,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
#lowercased-stem =

[CORPUS:nc]
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension

[CORPUS:un] IGNORE
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension

#################################################################
# LANGUAGE MODEL TRAINING
@@ -178,7 +178,7 @@ type = 8

### raw corpus (untokenized)
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension

### tokenized corpus files (may contain long sentences)
#
@@ -190,13 +190,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
#lm =

[LM:nc]
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension

[LM:un] IGNORE
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension

[LM:news] IGNORE
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled

#################################################################
@@ -216,13 +216,17 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

@@ -374,13 +378,13 @@ tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@@ -521,11 +525,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos

[EVALUATION:newstest2009]
[EVALUATION:newstest2011]

### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@@ -533,7 +537,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm

### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

@@ -34,7 +34,7 @@ irstlm-dir = $moses-src-dir/irstlm/bin
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
wmt12-data = $working-dir/data

### basic tools
#
@@ -108,7 +108,7 @@ max-sentence-length = 80

### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension

### tokenized corpus files (may contain long sentences)
#
@@ -125,10 +125,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
#lowercased-stem =

[CORPUS:nc]
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension

[CORPUS:un] IGNORE
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension

#################################################################
# LANGUAGE MODEL TRAINING
@@ -182,7 +182,7 @@ type = 8

### raw corpus (untokenized)
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension

### tokenized corpus files (may contain long sentences)
#
@@ -194,13 +194,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
#lm =

[LM:nc]
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension

[LM:un] IGNORE
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension

[LM:news] IGNORE
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled

#################################################################
@@ -220,13 +220,17 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

@@ -378,13 +382,13 @@ tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@@ -525,11 +529,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos

[EVALUATION:newstest2009]
[EVALUATION:newstest2011]

### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@@ -537,7 +541,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm

### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

@@ -207,6 +207,10 @@ raw-corpus = $toy-data/nc-5k.$output-extension
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

@@ -107,7 +107,7 @@ consolidate
    default-name: truecaser/corpus
    template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
train
    in: tokenized-stem
    in: tokenized-stem
    out: truecase-model
    rerun-on-change: trainer
    default-name: truecaser/truecase-model
@@ -253,7 +253,7 @@ split-tuning
    template: $output-splitter -model IN1.$output-extension < IN > OUT
interpolate
    in: script split-tuning LM:lm
    rerun-on-change: srilm-dir
    rerun-on-change: srilm-dir group
    out: lm
    default-name: lm/interpolated-lm
randomize

@@ -1838,13 +1838,16 @@ sub define_training_interpolated_lm_interpolate {
        $interpolation_script, $tuning, @LM)
        = &get_output_and_input($step_id);
    my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
    my $group = &get("INTERPOLATED-LM:group");

    # get list of language model files
    my $lm_list = "";
    foreach (@LM) {
        $lm_list .= $_.",";
    }
    chop($lm_list);

    # sanity checks on order and factors
    my @LM_SETS = &get_sets("LM");
    my %OUTPUT_FACTORS;
@@ -1868,7 +1871,30 @@ sub define_training_interpolated_lm_interpolate {
        }
    }

    # if grouping, identify position in list
    my $numbered_string = "";
    if (defined($group)) {
        my %POSITION;
        foreach my $set (@LM_SETS) {
            $POSITION{$set} = scalar keys %POSITION;
        }
        my $group_string = $group;
        $group_string =~ s/\s+/ /g;
        $group_string =~ s/ *, */,/g;
        $group_string =~ s/^ //;
        $group_string =~ s/ $//;
        $group_string .= " ";
        while($group_string =~ /^([^ ,]+)([ ,]+)(.*)$/) {
            die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
                if ! defined($POSITION{$1});
            $numbered_string .= $POSITION{$1}.$2;
            $group_string = $3;
        }
        chop($numbered_string);
    }

    my $cmd = "$interpolation_script --tuning $tuning --name $interpolated_lm --srilm $srilm_dir --lm $lm_list";
    $cmd .= " --group \"$numbered_string\"" if defined($group);

    &create_step($step_id,$cmd);
}

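For illustration, a standalone Perl sketch of the name-to-index conversion that the new code in define_training_interpolated_lm_interpolate performs before handing the group specification to interpolate-lm.perl; the LM set names and their order are invented for the example, and the whitespace normalization of the real code is omitted:

use strict;
use warnings;

# assumed order of the [LM:*] sets, as &get_sets("LM") would return them
my @LM_SETS = ("first", "second", "third", "fourth", "fifth");
my %POSITION;
foreach my $set (@LM_SETS) {
    $POSITION{$set} = scalar keys %POSITION;    # first -> 0, second -> 1, ...
}

my $group_string = "first,second fourth,fifth";   # value of INTERPOLATED-LM:group
$group_string .= " ";
my $numbered_string = "";
while ($group_string =~ /^([^ ,]+)([ ,]+)(.*)$/) {
    die("unknown set $1") unless defined $POSITION{$1};
    $numbered_string .= $POSITION{$1} . $2;       # replace each name by its index
    $group_string = $3;
}
chop($numbered_string);
print "$numbered_string\n";                       # prints "0,1 3,4"

The numeric form is what gets passed on to the support script as --group "0,1 3,4".
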
@@ -12,13 +12,14 @@ binmode(STDERR, ":utf8");

my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
my $TEMPDIR = "/tmp";
my ($TUNING,$LM,$NAME);
my ($TUNING,$LM,$NAME,$GROUP);

die("interpolate-lm.perl --tuning set --name out-lm --lm lm1,lm2,lm3 [--srilm srtilm-dir --tempdir tempdir]")
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\"]")
    unless &GetOptions('tuning=s' => => \$TUNING,
                       'name=s' => \$NAME,
                       'srilm=s' => \$SRILM,
                       'tempdir=s' => \$TEMPDIR,
                       'group=s' => \$GROUP,
                       'lm=s' => \$LM);

# check and set default to unset parameters
@@ -52,49 +53,109 @@ foreach my $lm (@LM) {
}
print STDERR "language models have order $order.\n";

my $tmp = tempdir(DIR=>$TEMPDIR);
# too many language models? group them first
if (!defined($GROUP) && scalar(@LM) > 10) {
  print STDERR "more than 10, automatically grouping language models.\n";
  my $num_groups = int(scalar(@LM)/10 + 0.99);
  my $size_groups = int(scalar(@LM)/$num_groups + 0.99);

# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
  print STDERR "compute perplexity for $lm\n";
  safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
  print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
  $i++;
  $GROUP = "";
  for(my $i=0;$i<$num_groups;$i++) {
    $GROUP .= " " unless $i==0;
    for(my $j=0;$j<$size_groups;$j++) {
      my $lm_i = $i*$size_groups+$j;
      next if $lm_i >= scalar(@LM);
      $GROUP .= "," unless $j==0;
      $GROUP .= $lm_i;
    }
  }
  print STDERR "groups: $GROUP\n";
}

# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
  $cmd .= " $tmp/iplm.$$.$i";
# normal interpolation
if (!defined($GROUP)) {
  &interpolate($NAME,@LM);
  exit;
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);

# create new language models
print STDERR "creating new language model...\n";
$i = 0;
$cmd = "$SRILM/ngram -unk -order $order -write-lm $NAME";
foreach my $lm (@LM) {
  $cmd .= " -lm " if $i==0;
  $cmd .= " -mix-lm " if $i==1;
  $cmd .= " -mix-lm$i " if $i>1;
  $cmd .= $lm;
  $cmd .= " -lambda " if $i==0;
  $cmd .= " -mix-lambda$i " if $i>1;
  $cmd .= $LAMBDA[$i] if $i!=1;
  $i++;
# group language models into sub-interpolated models
my %ALREADY;
my $g = 0;
my @SUB_NAME;
foreach my $subgroup (split(/ /,$GROUP)) {
  my @SUB_LM;
  foreach my $lm_i (split(/,/,$subgroup)) {
    die("ERROR: LM id $lm_i in group definition out of range") if $lm_i >= scalar(@LM);
    push @SUB_LM,$LM[$lm_i];
    $ALREADY{$lm_i} = 1;
  }
  #if (scalar @SUB_NAME == 0 && scalar keys %ALREADY == scalar @LM) {
  #  print STDERR "WARNING: grouped all language models into one, perform normal interpolation\n";
  #  &interpolate($NAME,@LM);
  #  exit;
  #}
  my $name = $NAME.".group-".chr(97+($g++));
  push @SUB_NAME,$name;
  print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
  &interpolate($name, @SUB_LM);
}
safesystem($cmd) or die "Failed.";
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
  next if defined($ALREADY{$lm_i});
  push @SUB_NAME, $LM[$lm_i];
}
print STDERR "\n=== BUILDING FINAL LM ===\n\n";
&interpolate($NAME, @SUB_NAME);

rmtree($tmp); # remove the temp dir
print STDERR "done.\n";
# main interpolation function
sub interpolate {
  my ($name,@LM) = @_;

  die("cannot interpolate more than 10 language models at once.")
    if scalar(@LM) > 10;

  my $tmp = tempdir(DIR=>$TEMPDIR);

  # compute perplexity
  my $i = 0;
  foreach my $lm (@LM) {
    print STDERR "compute perplexity for $lm\n";
    safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
    print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
    $i++;
  }

  # compute lambdas
  print STDERR "computing lambdas...\n";
  my $cmd = "$SRILM/compute-best-mix";
  for(my $i=0;$i<scalar(@LM);$i++) {
    $cmd .= " $tmp/iplm.$$.$i";
  }
  my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
  die "Failed to mix models: $mixerr" if $mixexitcode != 0;
  my $mix = $mixout;
  `rm $tmp/iplm.$$.*`;
  $mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
  my @LAMBDA = split(/ /,$1);

  # create new language model
  print STDERR "creating new language model...\n";
  $i = 0;
  $cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
  foreach my $lm (@LM) {
    $cmd .= " -lm " if $i==0;
    $cmd .= " -mix-lm " if $i==1;
    $cmd .= " -mix-lm$i " if $i>1;
    $cmd .= $lm;
    $cmd .= " -lambda " if $i==0;
    $cmd .= " -mix-lambda$i " if $i>1;
    $cmd .= $LAMBDA[$i] if $i!=1;
    $i++;
  }
  safesystem($cmd) or die "Failed.";

  rmtree($tmp); # remove the temp dir
  print STDERR "done.\n";
}

sub safesystem {
  print STDERR "Executing: @_\n";

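For illustration, a standalone Perl sketch of the automatic grouping arithmetic that interpolate-lm.perl now applies when no --group is given and more than 10 language models are passed in; the model count of 23 is made up for the example:

use strict;
use warnings;

my $n = 23;                                     # pretend scalar(@LM) == 23
my $num_groups  = int($n / 10 + 0.99);          # 3 groups
my $size_groups = int($n / $num_groups + 0.99); # up to 8 models per group

my $group = "";
for (my $i = 0; $i < $num_groups; $i++) {
    $group .= " " unless $i == 0;
    for (my $j = 0; $j < $size_groups; $j++) {
        my $lm_i = $i * $size_groups + $j;
        next if $lm_i >= $n;
        $group .= "," unless $j == 0;
        $group .= $lm_i;
    }
}
# groups of indices 0-7, 8-15 and 16-22, each small enough for one interpolation
print "$group\n";

Each group is then interpolated into a sub-LM (named <out-lm>.group-a, .group-b, ...), and those sub-LMs are interpolated into the final model.
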