allow for more than 10 language models by explicit or automatic grouping

Philipp Koehn 2011-12-22 00:37:46 +00:00
parent b9622d0da3
commit 8d9c93e1aa
8 changed files with 203 additions and 96 deletions

View File

@ -34,7 +34,7 @@ irstlm-dir = $moses-src-dir/irstlm/bin
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
wmt12-data = $working-dir/data
### basic tools
#
@ -104,7 +104,7 @@ max-sentence-length = 80
### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
### tokenized corpus files (may contain long sentences)
#
@ -121,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
#lowercased-stem =
[CORPUS:nc]
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
[CORPUS:un] IGNORE
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
#################################################################
# LANGUAGE MODEL TRAINING
@ -178,7 +178,7 @@ type = 8
### raw corpus (untokenized)
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
### tokenized corpus files (may contain long sentences)
#
@ -190,13 +190,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
#lm =
[LM:nc]
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
[LM:un] IGNORE
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
[LM:news] IGNORE
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
#################################################################
@ -216,13 +216,17 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =
### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
@ -374,13 +378,13 @@ tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@ -521,11 +525,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos
[EVALUATION:newstest2009]
[EVALUATION:newstest2011]
### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@ -533,7 +537,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

View File

@ -34,7 +34,7 @@ irstlm-dir = $moses-src-dir/irstlm/bin
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
wmt12-data = $working-dir/data
### basic tools
#
@ -104,7 +104,7 @@ max-sentence-length = 80
### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
### tokenized corpus files (may contain long sentences)
#
@ -121,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
#lowercased-stem =
[CORPUS:nc]
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
[CORPUS:un] IGNORE
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
#################################################################
# LANGUAGE MODEL TRAINING
@ -178,7 +178,7 @@ order = 5
### raw corpus (untokenized)
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
### tokenized corpus files (may contain long sentences)
#
@ -190,19 +190,19 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
#lm =
[LM:nc]
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
[LM:un] IGNORE
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
[LM:news] IGNORE
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
[LM:nc=pos]
factors = "pos"
order = 7
settings = "-interpolate -unk"
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
#################################################################
# INTERPOLATING LANGUAGE MODELS
@ -221,13 +221,17 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =
### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
@ -394,13 +398,13 @@ tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@ -541,11 +545,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos
[EVALUATION:newstest2009]
[EVALUATION:newstest2011]
### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@ -553,7 +557,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

View File

@ -34,7 +34,7 @@ irstlm-dir = $moses-src-dir/irstlm/bin
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
wmt12-data = $working-dir/data
### basic tools
#
@ -104,7 +104,7 @@ max-sentence-length = 80
### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
### tokenized corpus files (may contain long sentences)
#
@ -121,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
#lowercased-stem =
[CORPUS:nc]
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
[CORPUS:un] IGNORE
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
#################################################################
# LANGUAGE MODEL TRAINING
@ -178,7 +178,7 @@ type = 8
### raw corpus (untokenized)
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
### tokenized corpus files (may contain long sentences)
#
@ -190,13 +190,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
#lm =
[LM:nc]
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
[LM:un] IGNORE
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
[LM:news] IGNORE
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
#################################################################
@ -216,13 +216,17 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =
### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
@ -374,13 +378,13 @@ tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@ -521,11 +525,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos
[EVALUATION:newstest2009]
[EVALUATION:newstest2011]
### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@ -533,7 +537,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

View File

@ -34,7 +34,7 @@ irstlm-dir = $moses-src-dir/irstlm/bin
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
wmt12-data = $working-dir/data
### basic tools
#
@ -108,7 +108,7 @@ max-sentence-length = 80
### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
### tokenized corpus files (may contain long sentences)
#
@ -125,10 +125,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
#lowercased-stem =
[CORPUS:nc]
raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
[CORPUS:un] IGNORE
raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
#################################################################
# LANGUAGE MODEL TRAINING
@ -182,7 +182,7 @@ type = 8
### raw corpus (untokenized)
#
raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
### tokenized corpus files (may contain long sentences)
#
@ -194,13 +194,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
#lm =
[LM:nc]
raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
[LM:un] IGNORE
raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
[LM:news] IGNORE
raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
#################################################################
@ -220,13 +220,17 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =
### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
@ -378,13 +382,13 @@ tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@ -525,11 +529,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos
[EVALUATION:newstest2009]
[EVALUATION:newstest2011]
### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@ -537,7 +541,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

View File

@ -207,6 +207,10 @@ raw-corpus = $toy-data/nc-5k.$output-extension
#lowercased-tuning =
#split-tuning =
### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

View File

@ -107,7 +107,7 @@ consolidate
default-name: truecaser/corpus
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
train
in: tokenized-stem
in: tokenized-stem
out: truecase-model
rerun-on-change: trainer
default-name: truecaser/truecase-model
@ -253,7 +253,7 @@ split-tuning
template: $output-splitter -model IN1.$output-extension < IN > OUT
interpolate
in: script split-tuning LM:lm
rerun-on-change: srilm-dir
rerun-on-change: srilm-dir group
out: lm
default-name: lm/interpolated-lm
randomize

View File

@ -1838,13 +1838,16 @@ sub define_training_interpolated_lm_interpolate {
$interpolation_script, $tuning, @LM)
= &get_output_and_input($step_id);
my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
my $group = &get("INTERPOLATED-LM:group");
# get list of language model files
my $lm_list = "";
foreach (@LM) {
$lm_list .= $_.",";
}
chop($lm_list);
# sanity checks on order and factors
my @LM_SETS = &get_sets("LM");
my %OUTPUT_FACTORS;
@ -1868,7 +1871,30 @@ sub define_training_interpolated_lm_interpolate {
}
}
# if grouping, identify position in list
my $numbered_string = "";
if (defined($group)) {
my %POSITION;
foreach my $set (@LM_SETS) {
$POSITION{$set} = scalar keys %POSITION;
}
my $group_string = $group;
$group_string =~ s/\s+/ /g;
$group_string =~ s/ *, */,/g;
$group_string =~ s/^ //;
$group_string =~ s/ $//;
$group_string .= " ";
while($group_string =~ /^([^ ,]+)([ ,]+)(.*)$/) {
die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
if ! defined($POSITION{$1});
$numbered_string .= $POSITION{$1}.$2;
$group_string = $3;
}
chop($numbered_string);
}
my $cmd = "$interpolation_script --tuning $tuning --name $interpolated_lm --srilm $srilm_dir --lm $lm_list";
$cmd .= " --group \"$numbered_string\"" if defined($group);
&create_step($step_id,$cmd);
}
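The name-to-position translation above can be illustrated with a small standalone Perl sketch; the set names and the group value below are hypothetical, chosen to mirror the example configurations:
#!/usr/bin/perl -w
use strict;

# hypothetical LM sets, in the order they appear in the config
my @LM_SETS = ("europarl","nc","un","news");
# hypothetical INTERPOLATED-LM:group setting
my $group = "nc,un news";

# assign each set its 0-based position
my %POSITION;
foreach my $set (@LM_SETS) {
  $POSITION{$set} = scalar keys %POSITION;
}

# rewrite the group definition with positions instead of names
my @NUMBERED;
foreach my $subgroup (split(/ /,$group)) {
  push @NUMBERED, join(",", map {
    die("ERROR: unknown set $_") unless defined($POSITION{$_});
    $POSITION{$_};
  } split(/,/,$subgroup));
}
print join(" ",@NUMBERED)."\n";   # prints "1,2 3"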

View File

@ -12,13 +12,14 @@ binmode(STDERR, ":utf8");
my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
my $TEMPDIR = "/tmp";
my ($TUNING,$LM,$NAME);
my ($TUNING,$LM,$NAME,$GROUP);
die("interpolate-lm.perl --tuning set --name out-lm --lm lm1,lm2,lm3 [--srilm srtilm-dir --tempdir tempdir]")
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\"]")
unless &GetOptions('tuning=s' => => \$TUNING,
'name=s' => \$NAME,
'srilm=s' => \$SRILM,
'tempdir=s' => \$TEMPDIR,
'group=s' => \$GROUP,
'lm=s' => \$LM);
# check and set default to unset parameters
@ -52,49 +53,109 @@ foreach my $lm (@LM) {
}
print STDERR "language models have order $order.\n";
my $tmp = tempdir(DIR=>$TEMPDIR);
# too many language models? group them first
if (!defined($GROUP) && scalar(@LM) > 10) {
print STDERR "more than 10, automatically grouping language models.\n";
my $num_groups = int(scalar(@LM)/10 + 0.99);
my $size_groups = int(scalar(@LM)/$num_groups + 0.99);
# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
print STDERR "compute perplexity for $lm\n";
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
$i++;
$GROUP = "";
for(my $i=0;$i<$num_groups;$i++) {
$GROUP .= " " unless $i==0;
for(my $j=0;$j<$size_groups;$j++) {
my $lm_i = $i*$size_groups+$j;
next if $lm_i >= scalar(@LM);
$GROUP .= "," unless $j==0;
$GROUP .= $lm_i;
}
}
print STDERR "groups: $GROUP\n";
}
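# worked example: with 23 language models and no --group given,
#   $num_groups  = int(23/10 + 0.99) = 3
#   $size_groups = int(23/3  + 0.99) = 8
# so $GROUP becomes "0,1,2,3,4,5,6,7 8,9,10,11,12,13,14,15 16,17,18,19,20,21,22",
# i.e. three sub-interpolations of at most eight models each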
# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
$cmd .= " $tmp/iplm.$$.$i";
# normal interpolation
if (!defined($GROUP)) {
&interpolate($NAME,@LM);
exit;
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);
# create new language models
print STDERR "creating new language model...\n";
$i = 0;
$cmd = "$SRILM/ngram -unk -order $order -write-lm $NAME";
foreach my $lm (@LM) {
$cmd .= " -lm " if $i==0;
$cmd .= " -mix-lm " if $i==1;
$cmd .= " -mix-lm$i " if $i>1;
$cmd .= $lm;
$cmd .= " -lambda " if $i==0;
$cmd .= " -mix-lambda$i " if $i>1;
$cmd .= $LAMBDA[$i] if $i!=1;
$i++;
# group language models into sub-interpolated models
my %ALREADY;
my $g = 0;
my @SUB_NAME;
foreach my $subgroup (split(/ /,$GROUP)) {
my @SUB_LM;
foreach my $lm_i (split(/,/,$subgroup)) {
die("ERROR: LM id $lm_i in group definition out of range") if $lm_i >= scalar(@LM);
push @SUB_LM,$LM[$lm_i];
$ALREADY{$lm_i} = 1;
}
#if (scalar @SUB_NAME == 0 && scalar keys %ALREADY == scalar @LM) {
# print STDERR "WARNING: grouped all language models into one, perform normal interpolation\n";
# &interpolate($NAME,@LM);
# exit;
#}
my $name = $NAME.".group-".chr(97+($g++));
push @SUB_NAME,$name;
print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
&interpolate($name, @SUB_LM);
}
safesystem($cmd) or die "Failed.";
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
next if defined($ALREADY{$lm_i});
push @SUB_NAME, $LM[$lm_i];
}
print STDERR "\n=== BUILDING FINAL LM ===\n\n";
&interpolate($NAME, @SUB_NAME);
rmtree($tmp); # remove the temp dir
print STDERR "done.\n";
# main interpolation function
sub interpolate {
my ($name,@LM) = @_;
die("cannot interpolate more than 10 language models at once.")
if scalar(@LM) > 10;
my $tmp = tempdir(DIR=>$TEMPDIR);
# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
print STDERR "compute perplexity for $lm\n";
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
$i++;
}
# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
$cmd .= " $tmp/iplm.$$.$i";
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);
# create new language model
print STDERR "creating new language model...\n";
$i = 0;
$cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
foreach my $lm (@LM) {
$cmd .= " -lm " if $i==0;
$cmd .= " -mix-lm " if $i==1;
$cmd .= " -mix-lm$i " if $i>1;
$cmd .= $lm;
$cmd .= " -lambda " if $i==0;
$cmd .= " -mix-lambda$i " if $i>1;
$cmd .= $LAMBDA[$i] if $i!=1;
$i++;
}
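# e.g. for three models with lambdas (0.5 0.3 0.2) this assembles (order 5 assumed):
#   ngram -unk -order 5 -write-lm $name -lm lm0 -lambda 0.5 -mix-lm lm1 -mix-lm2 lm2 -mix-lambda2 0.2
# (the weight of the first -mix-lm is left implicit for SRILM to infer)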
safesystem($cmd) or die "Failed.";
rmtree($tmp); # remove the temp dir
print STDERR "done.\n";
}
sub safesystem {
print STDERR "Executing: @_\n";