mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
allow specification of weights for lm interpolation
This commit is contained in:
parent
73081786bc
commit
2239501b21
@ -22,6 +22,7 @@ clean
|
||||
rerun-on-change: max-sentence-length $moses-script-dir/training/clean-corpus-n.perl
|
||||
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 $max-sentence-length OUT.lines-retained
|
||||
error: there is a blank factor
|
||||
error: is too long! at
|
||||
parse
|
||||
in: clean-stem
|
||||
out: parsed-stem
|
||||
@ -104,7 +105,7 @@ tokenize
|
||||
train
|
||||
in: tokenized
|
||||
out: recase-config
|
||||
template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir OUT.model -corpus IN -scripts-root-dir $moses-script-dir -config OUT -ngram-count $lm-training
|
||||
template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir OUT.model -corpus IN -scripts-root-dir $moses-script-dir -config OUT $recasing-settings
|
||||
default-name: recasing/moses.ini
|
||||
tmp-name: recasing/model
|
||||
ignore-unless: EVALUATION:recaser
|
||||
@ -116,6 +117,7 @@ consolidate
|
||||
out: tokenized-stem
|
||||
default-name: truecaser/corpus
|
||||
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
|
||||
error: number of lines don't match
|
||||
train
|
||||
in: tokenized-stem
|
||||
out: truecase-model
|
||||
@ -159,7 +161,6 @@ mock-parse
|
||||
factorize
|
||||
in: mock-parsed-corpus
|
||||
out: factorized-corpus
|
||||
rerun-on-change: TRAINING:output-factors
|
||||
default-name: lm/factored
|
||||
pass-unless: factors
|
||||
parallelizable: yes
|
||||
@ -277,7 +278,7 @@ split-tuning
|
||||
template: $output-splitter -model IN1.$output-extension < IN > OUT
|
||||
interpolate
|
||||
in: script split-tuning LM:lm
|
||||
rerun-on-change: srilm-dir group
|
||||
rerun-on-change: srilm-dir group weights
|
||||
out: lm
|
||||
default-name: lm/interpolated-lm
|
||||
randomize
|
||||
@ -1077,7 +1078,7 @@ decode
|
||||
default-name: evaluation/output
|
||||
qsub-script: yes
|
||||
ignore-if: use-hiero
|
||||
rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade
|
||||
rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade TRAINING:post-decoding-transliteration
|
||||
error: Translation was not performed correctly
|
||||
not-error: trans: No such file or directory
|
||||
final-model: yes
|
||||
|
@ -1535,7 +1535,6 @@ sub check_if_crashed {
|
||||
|
||||
# check if output file empty
|
||||
my $output = &get_default_file(&deconstruct_name($DO_STEP[$i]));
|
||||
print STDERR "".$DO_STEP[$i]." -> $output\n";
|
||||
# currently only works for single output file
|
||||
if (-e $output && -z $output) {
|
||||
push @DIGEST,"output file $output is empty";
|
||||
@ -2152,13 +2151,14 @@ sub define_training_build_transliteration_model {
|
||||
|
||||
my ($model, $corpus, $alignment) = &get_output_and_input($step_id);
|
||||
|
||||
my $moses_script_dir = &check_and_get("GENERAL:moses-script-dir");
|
||||
my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
|
||||
my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
|
||||
my $sym_method = &check_and_get("TRAINING:alignment-symmetrization-method");
|
||||
my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
|
||||
my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
|
||||
my $srilm_dir = &check_and_get("GENERAL:srilm-dir");
|
||||
my $moses_script_dir = &check_and_get("GENERAL:moses-script-dir");
|
||||
my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
|
||||
my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
|
||||
my $sym_method = &check_and_get("TRAINING:alignment-symmetrization-method");
|
||||
my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
|
||||
my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
|
||||
my $srilm_dir = &check_and_get("TRAINING:srilm-dir");
|
||||
my $decoder = &get("TRAINING:transliteration-decoder");
|
||||
|
||||
my $cmd = "$moses_script_dir/Transliteration/train-transliteration-module.pl";
|
||||
$cmd .= " --corpus-f $corpus.$input_extension";
|
||||
@ -2166,6 +2166,7 @@ sub define_training_build_transliteration_model {
|
||||
$cmd .= " --alignment $alignment.$sym_method";
|
||||
$cmd .= " --out-dir $model";
|
||||
$cmd .= " --moses-src-dir $moses_src_dir";
|
||||
$cmd .= " --decoder $decoder" if defined($decoder);
|
||||
$cmd .= " --external-bin-dir $external_bin_dir";
|
||||
$cmd .= " --srilm-dir $srilm_dir";
|
||||
$cmd .= " --input-extension $input_extension";
|
||||
@ -2174,7 +2175,7 @@ sub define_training_build_transliteration_model {
|
||||
$cmd .= " --source-syntax " if &get("GENERAL:input-parser");
|
||||
$cmd .= " --target-syntax " if &get("GENERAL:output-parser");
|
||||
|
||||
&create_step($step_id, $cmd);
|
||||
&create_step($step_id, $cmd);
|
||||
}
|
||||
|
||||
sub define_training_extract_phrases {
|
||||
@ -2496,10 +2497,19 @@ sub define_interpolated_lm_interpolate {
|
||||
$interpolation_script, $tuning, @LM) = &get_output_and_input($step_id);
|
||||
my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
|
||||
my $group = &get("INTERPOLATED-LM:group");
|
||||
my $weights = &get("INTERPOLATED-LM:weights");
|
||||
my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
|
||||
|
||||
my $cmd = "";
|
||||
|
||||
my %WEIGHT;
|
||||
if (defined($weights)) {
|
||||
foreach (split(/ *, */,$weights)) {
|
||||
/^ *(\S+) *= *(\S+)/ || die("ERROR: wrong interpolation weight specification $_ ($weights)");
|
||||
$WEIGHT{$1} = $2;
|
||||
}
|
||||
}
|
||||
|
||||
# go through language models by factor and order
|
||||
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
|
||||
foreach my $factor (keys %{$ILM_SETS}) {
|
||||
@ -2508,11 +2518,18 @@ sub define_interpolated_lm_interpolate {
|
||||
|
||||
# get list of language model files
|
||||
my $lm_list = "";
|
||||
my $weight_list = "";
|
||||
foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
|
||||
my ($id,$set) = split(/ /,$id_set,2);
|
||||
$lm_list .= $LM[$id].",";
|
||||
if (defined($weights)) {
|
||||
die("ERROR: no interpolation weight set for $factor:$order:$set (factor:order:set)")
|
||||
unless defined($WEIGHT{"$factor:$order:$set"});
|
||||
$weight_list .= $WEIGHT{"$factor:$order:$set"}.",";
|
||||
}
|
||||
}
|
||||
chop($lm_list);
|
||||
chop($weight_list);
|
||||
|
||||
# if grouping, identify position in list
|
||||
my $numbered_string = "";
|
||||
@ -2553,6 +2570,7 @@ sub define_interpolated_lm_interpolate {
|
||||
}
|
||||
$cmd .= "$interpolation_script --tuning $factored_tuning --name $name --srilm $srilm_dir --lm $lm_list";
|
||||
$cmd .= " --group \"$numbered_string\"" if defined($group);
|
||||
$cmd .= " --weights \"$weight_list\"" if defined($weights);
|
||||
$cmd .= "\n";
|
||||
}
|
||||
}
|
||||
@ -3418,10 +3436,11 @@ sub get_default_file {
|
||||
my $name = &construct_name($module,$set,$out);
|
||||
return &check_backoff_and_get($name);
|
||||
}
|
||||
# print "\t\tpassing $step -> ";
|
||||
# print "\t\tpassing $step\n";
|
||||
$i = $DEPENDENCY[$i][0];
|
||||
$step = $DO_STEP[$i];
|
||||
# print "\t\tbacking off to $step\n";
|
||||
($default_module,$default_set,$default_step) = &deconstruct_name($step);
|
||||
}
|
||||
|
||||
# get file name
|
||||
|
@ -12,7 +12,7 @@ binmode(STDERR, ":utf8");
|
||||
|
||||
my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
|
||||
my $TEMPDIR = "/tmp";
|
||||
my ($TUNING,$LM,$NAME,$GROUP,$CONTINUE);
|
||||
my ($TUNING,$LM,$NAME,$GROUP,$WEIGHTS,$CONTINUE);
|
||||
|
||||
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\"]")
|
||||
unless &GetOptions('tuning=s' => => \$TUNING,
|
||||
@ -21,6 +21,7 @@ die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--sril
|
||||
'tempdir=s' => \$TEMPDIR,
|
||||
'continue' => \$CONTINUE,
|
||||
'group=s' => \$GROUP,
|
||||
'weights=s' => \$WEIGHTS,
|
||||
'lm=s' => \$LM);
|
||||
|
||||
# check and set default to unset parameters
|
||||
@ -32,6 +33,10 @@ die("ERROR: did not find srilm dir") unless -e $SRILM;
|
||||
die("ERROR: cannot run ngram") unless -x $SRILM."/ngram";
|
||||
|
||||
my @LM = split(/,/,$LM);
|
||||
my @WEIGHT;
|
||||
@WEIGHT = split(/,/,$WEIGHTS) if defined($WEIGHTS);
|
||||
die("ERROR: different number of weights and language models: ".scalar(@WEIGHT)." vs. ".scalar(@LM))
|
||||
if defined($WEIGHTS) && scalar(@WEIGHT) != scalar(@LM);
|
||||
|
||||
# establish order
|
||||
my $order = 0;
|
||||
@ -75,7 +80,7 @@ if (!defined($GROUP) && scalar(@LM) > 10) {
|
||||
|
||||
# normal interpolation
|
||||
if (!defined($GROUP)) {
|
||||
&interpolate($NAME,@LM);
|
||||
&interpolate($NAME,\@WEIGHT,@LM);
|
||||
exit;
|
||||
}
|
||||
|
||||
@ -98,50 +103,59 @@ foreach my $subgroup (split(/ /,$GROUP)) {
|
||||
my $name = $NAME.".group-".chr(97+($g++));
|
||||
push @SUB_NAME,$name;
|
||||
print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
|
||||
&interpolate($name, @SUB_LM) unless $CONTINUE && -e $name;
|
||||
&interpolate($name, undef, @SUB_LM) unless $CONTINUE && -e $name;
|
||||
}
|
||||
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
|
||||
next if defined($ALREADY{$lm_i});
|
||||
push @SUB_NAME, $LM[$lm_i];
|
||||
}
|
||||
print STDERR "\n=== BUILDING FINAL LM ===\n\n";
|
||||
&interpolate($NAME, @SUB_NAME);
|
||||
&interpolate($NAME, undef, @SUB_NAME);
|
||||
|
||||
# main interpolation function
|
||||
sub interpolate {
|
||||
my ($name,@LM) = @_;
|
||||
my ($name,$WEIGHT,@LM) = @_;
|
||||
|
||||
die("cannot interpolate more than 10 language models at once: ",join(",",@LM))
|
||||
if scalar(@LM) > 10;
|
||||
|
||||
my $tmp = tempdir(DIR=>$TEMPDIR);
|
||||
my @LAMBDA;
|
||||
|
||||
# compute perplexity
|
||||
my $i = 0;
|
||||
foreach my $lm (@LM) {
|
||||
print STDERR "compute perplexity for $lm\n";
|
||||
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
|
||||
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
|
||||
$i++;
|
||||
# if weights are specified, use them
|
||||
if (defined($WEIGHT) && scalar(@$WEIGHT) == scalar(@LM)) {
|
||||
@LAMBDA = @$WEIGHT;
|
||||
}
|
||||
# no specified weights -> compute them
|
||||
else {
|
||||
|
||||
# compute lambdas
|
||||
print STDERR "computing lambdas...\n";
|
||||
my $cmd = "$SRILM/compute-best-mix";
|
||||
for(my $i=0;$i<scalar(@LM);$i++) {
|
||||
$cmd .= " $tmp/iplm.$$.$i";
|
||||
# compute perplexity
|
||||
my $i = 0;
|
||||
foreach my $lm (@LM) {
|
||||
print STDERR "compute perplexity for $lm\n";
|
||||
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
|
||||
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
|
||||
$i++;
|
||||
}
|
||||
|
||||
# compute lambdas
|
||||
print STDERR "computing lambdas...\n";
|
||||
my $cmd = "$SRILM/compute-best-mix";
|
||||
for(my $i=0;$i<scalar(@LM);$i++) {
|
||||
$cmd .= " $tmp/iplm.$$.$i";
|
||||
}
|
||||
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
|
||||
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
|
||||
my $mix = $mixout;
|
||||
`rm $tmp/iplm.$$.*`;
|
||||
$mix =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mix");
|
||||
@LAMBDA = split(/ /,$1);
|
||||
}
|
||||
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
|
||||
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
|
||||
my $mix = $mixout;
|
||||
`rm $tmp/iplm.$$.*`;
|
||||
$mix =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mix");
|
||||
my @LAMBDA = split(/ /,$1);
|
||||
|
||||
|
||||
# create new language model
|
||||
print STDERR "creating new language model...\n";
|
||||
$i = 0;
|
||||
$cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
|
||||
my $i = 0;
|
||||
my $cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
|
||||
foreach my $lm (@LM) {
|
||||
$cmd .= " -lm " if $i==0;
|
||||
$cmd .= " -mix-lm " if $i==1;
|
||||
|
Loading…
Reference in New Issue
Block a user