allow specification of weights for lm interpolation

This commit is contained in:
phikoehn 2014-07-23 15:39:42 +01:00
parent 73081786bc
commit 2239501b21
3 changed files with 74 additions and 40 deletions

View File

@ -22,6 +22,7 @@ clean
rerun-on-change: max-sentence-length $moses-script-dir/training/clean-corpus-n.perl
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 $max-sentence-length OUT.lines-retained
error: there is a blank factor
error: is too long! at
parse
in: clean-stem
out: parsed-stem
@ -104,7 +105,7 @@ tokenize
train
in: tokenized
out: recase-config
template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir OUT.model -corpus IN -scripts-root-dir $moses-script-dir -config OUT -ngram-count $lm-training
template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir OUT.model -corpus IN -scripts-root-dir $moses-script-dir -config OUT $recasing-settings
default-name: recasing/moses.ini
tmp-name: recasing/model
ignore-unless: EVALUATION:recaser
@ -116,6 +117,7 @@ consolidate
out: tokenized-stem
default-name: truecaser/corpus
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
error: number of lines don't match
train
in: tokenized-stem
out: truecase-model
@ -159,7 +161,6 @@ mock-parse
factorize
in: mock-parsed-corpus
out: factorized-corpus
rerun-on-change: TRAINING:output-factors
default-name: lm/factored
pass-unless: factors
parallelizable: yes
@ -277,7 +278,7 @@ split-tuning
template: $output-splitter -model IN1.$output-extension < IN > OUT
interpolate
in: script split-tuning LM:lm
rerun-on-change: srilm-dir group
rerun-on-change: srilm-dir group weights
out: lm
default-name: lm/interpolated-lm
randomize
@ -1077,7 +1078,7 @@ decode
default-name: evaluation/output
qsub-script: yes
ignore-if: use-hiero
rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade
rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade TRAINING:post-decoding-transliteration
error: Translation was not performed correctly
not-error: trans: No such file or directory
final-model: yes

View File

@ -1535,7 +1535,6 @@ sub check_if_crashed {
# check if output file empty
my $output = &get_default_file(&deconstruct_name($DO_STEP[$i]));
print STDERR "".$DO_STEP[$i]." -> $output\n";
# currently only works for single output file
if (-e $output && -z $output) {
push @DIGEST,"output file $output is empty";
@ -2152,13 +2151,14 @@ sub define_training_build_transliteration_model {
my ($model, $corpus, $alignment) = &get_output_and_input($step_id);
my $moses_script_dir = &check_and_get("GENERAL:moses-script-dir");
my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
my $sym_method = &check_and_get("TRAINING:alignment-symmetrization-method");
my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
my $srilm_dir = &check_and_get("GENERAL:srilm-dir");
my $moses_script_dir = &check_and_get("GENERAL:moses-script-dir");
my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
my $sym_method = &check_and_get("TRAINING:alignment-symmetrization-method");
my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
my $srilm_dir = &check_and_get("TRAINING:srilm-dir");
my $decoder = &get("TRAINING:transliteration-decoder");
my $cmd = "$moses_script_dir/Transliteration/train-transliteration-module.pl";
$cmd .= " --corpus-f $corpus.$input_extension";
@ -2166,6 +2166,7 @@ sub define_training_build_transliteration_model {
$cmd .= " --alignment $alignment.$sym_method";
$cmd .= " --out-dir $model";
$cmd .= " --moses-src-dir $moses_src_dir";
$cmd .= " --decoder $decoder" if defined($decoder);
$cmd .= " --external-bin-dir $external_bin_dir";
$cmd .= " --srilm-dir $srilm_dir";
$cmd .= " --input-extension $input_extension";
@ -2174,7 +2175,7 @@ sub define_training_build_transliteration_model {
$cmd .= " --source-syntax " if &get("GENERAL:input-parser");
$cmd .= " --target-syntax " if &get("GENERAL:output-parser");
&create_step($step_id, $cmd);
&create_step($step_id, $cmd);
}
sub define_training_extract_phrases {
@ -2496,10 +2497,19 @@ sub define_interpolated_lm_interpolate {
$interpolation_script, $tuning, @LM) = &get_output_and_input($step_id);
my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
my $group = &get("INTERPOLATED-LM:group");
my $weights = &get("INTERPOLATED-LM:weights");
my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
my $cmd = "";
my %WEIGHT;
if (defined($weights)) {
foreach (split(/ *, */,$weights)) {
/^ *(\S+) *= *(\S+)/ || die("ERROR: wrong interpolation weight specification $_ ($weights)");
$WEIGHT{$1} = $2;
}
}
# go through language models by factor and order
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
foreach my $factor (keys %{$ILM_SETS}) {
@ -2508,11 +2518,18 @@ sub define_interpolated_lm_interpolate {
# get list of language model files
my $lm_list = "";
my $weight_list = "";
foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
my ($id,$set) = split(/ /,$id_set,2);
$lm_list .= $LM[$id].",";
if (defined($weights)) {
die("ERROR: no interpolation weight set for $factor:$order:$set (factor:order:set)")
unless defined($WEIGHT{"$factor:$order:$set"});
$weight_list .= $WEIGHT{"$factor:$order:$set"}.",";
}
}
chop($lm_list);
chop($weight_list);
# if grouping, identify position in list
my $numbered_string = "";
@ -2553,6 +2570,7 @@ sub define_interpolated_lm_interpolate {
}
$cmd .= "$interpolation_script --tuning $factored_tuning --name $name --srilm $srilm_dir --lm $lm_list";
$cmd .= " --group \"$numbered_string\"" if defined($group);
$cmd .= " --weights \"$weight_list\"" if defined($weights);
$cmd .= "\n";
}
}
@ -3418,10 +3436,11 @@ sub get_default_file {
my $name = &construct_name($module,$set,$out);
return &check_backoff_and_get($name);
}
# print "\t\tpassing $step -> ";
# print "\t\tpassing $step\n";
$i = $DEPENDENCY[$i][0];
$step = $DO_STEP[$i];
# print "\t\tbacking off to $step\n";
($default_module,$default_set,$default_step) = &deconstruct_name($step);
}
# get file name

View File

@ -12,7 +12,7 @@ binmode(STDERR, ":utf8");
my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
my $TEMPDIR = "/tmp";
my ($TUNING,$LM,$NAME,$GROUP,$CONTINUE);
my ($TUNING,$LM,$NAME,$GROUP,$WEIGHTS,$CONTINUE);
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\"]")
unless &GetOptions('tuning=s' => => \$TUNING,
@ -21,6 +21,7 @@ die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--sril
'tempdir=s' => \$TEMPDIR,
'continue' => \$CONTINUE,
'group=s' => \$GROUP,
'weights=s' => \$WEIGHTS,
'lm=s' => \$LM);
# check and set default to unset parameters
@ -32,6 +33,10 @@ die("ERROR: did not find srilm dir") unless -e $SRILM;
die("ERROR: cannot run ngram") unless -x $SRILM."/ngram";
my @LM = split(/,/,$LM);
my @WEIGHT;
@WEIGHT = split(/,/,$WEIGHTS) if defined($WEIGHTS);
die("ERROR: different number of weights and language models: ".scalar(@WEIGHT)." vs. ".scalar(@LM))
if defined($WEIGHTS) && scalar(@WEIGHT) != scalar(@LM);
# establish order
my $order = 0;
@ -75,7 +80,7 @@ if (!defined($GROUP) && scalar(@LM) > 10) {
# normal interpolation
if (!defined($GROUP)) {
&interpolate($NAME,@LM);
&interpolate($NAME,\@WEIGHT,@LM);
exit;
}
@ -98,50 +103,59 @@ foreach my $subgroup (split(/ /,$GROUP)) {
my $name = $NAME.".group-".chr(97+($g++));
push @SUB_NAME,$name;
print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
&interpolate($name, @SUB_LM) unless $CONTINUE && -e $name;
&interpolate($name, undef, @SUB_LM) unless $CONTINUE && -e $name;
}
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
next if defined($ALREADY{$lm_i});
push @SUB_NAME, $LM[$lm_i];
}
print STDERR "\n=== BUILDING FINAL LM ===\n\n";
&interpolate($NAME, @SUB_NAME);
&interpolate($NAME, undef, @SUB_NAME);
# main interpolation function
sub interpolate {
my ($name,@LM) = @_;
my ($name,$WEIGHT,@LM) = @_;
die("cannot interpolate more than 10 language models at once: ",join(",",@LM))
if scalar(@LM) > 10;
my $tmp = tempdir(DIR=>$TEMPDIR);
my @LAMBDA;
# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
print STDERR "compute perplexity for $lm\n";
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
$i++;
# if weights are specified, use them
if (defined($WEIGHT) && scalar(@$WEIGHT) == scalar(@LM)) {
@LAMBDA = @$WEIGHT;
}
# no specified weights -> compute them
else {
# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
$cmd .= " $tmp/iplm.$$.$i";
# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
print STDERR "compute perplexity for $lm\n";
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
$i++;
}
# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
$cmd .= " $tmp/iplm.$$.$i";
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mix");
@LAMBDA = split(/ /,$1);
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);
# create new language model
print STDERR "creating new language model...\n";
$i = 0;
$cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
my $i = 0;
my $cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
foreach my $lm (@LM) {
$cmd .= " -lm " if $i==0;
$cmd .= " -mix-lm " if $i==1;