allow specification of weights for lm interpolation

2024-12-26 05:14:36 +03:00 · 2014-07-23 15:39:42 +01:00 · 2014-07-23 15:39:42 +01:00 · 2239501b21
commit 2239501b21
parent 73081786bc
3 changed files with 74 additions and 40 deletions
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@ -22,6 +22,7 @@ clean
 	rerun-on-change: max-sentence-length $moses-script-dir/training/clean-corpus-n.perl
 	template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 $max-sentence-length OUT.lines-retained
 	error: there is a blank factor
+	error: is too long! at
 parse
 	in: clean-stem
 	out: parsed-stem
@ -104,7 +105,7 @@ tokenize
 train
 	in: tokenized
 	out: recase-config
-	template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir OUT.model -corpus IN -scripts-root-dir $moses-script-dir -config OUT -ngram-count $lm-training
+	template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir OUT.model -corpus IN -scripts-root-dir $moses-script-dir -config OUT $recasing-settings
 	default-name: recasing/moses.ini
        tmp-name: recasing/model
 	ignore-unless: EVALUATION:recaser
@ -116,6 +117,7 @@ consolidate
 	out: tokenized-stem
 	default-name: truecaser/corpus
 	template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
+	error: number of lines don't match
 train
 	in: tokenized-stem
 	out: truecase-model
@ -159,7 +161,6 @@ mock-parse
 factorize
 	in: mock-parsed-corpus
 	out: factorized-corpus
-	rerun-on-change: TRAINING:output-factors
 	default-name: lm/factored
 	pass-unless: factors	
 	parallelizable: yes
@ -277,7 +278,7 @@ split-tuning
 	template: $output-splitter -model IN1.$output-extension < IN > OUT
 interpolate
 	in: script split-tuning LM:lm
-	rerun-on-change: srilm-dir group
+	rerun-on-change: srilm-dir group weights
 	out: lm
 	default-name: lm/interpolated-lm
 randomize
@ -1077,7 +1078,7 @@ decode
 	default-name: evaluation/output
 	qsub-script: yes
 	ignore-if: use-hiero
-	rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade
+	rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade TRAINING:post-decoding-transliteration
 	error: Translation was not performed correctly
 	not-error: trans: No such file or directory
 	final-model: yes
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@ -1535,7 +1535,6 @@ sub check_if_crashed {

    # check if output file empty
    my $output = &get_default_file(&deconstruct_name($DO_STEP[$i]));
-    print STDERR "".$DO_STEP[$i]." -> $output\n";
    # currently only works for single output file
    if (-e $output && -z $output) {
      push @DIGEST,"output file $output is empty";
@ -2152,13 +2151,14 @@ sub define_training_build_transliteration_model {

    my ($model, $corpus, $alignment) = &get_output_and_input($step_id);

-		my $moses_script_dir = &check_and_get("GENERAL:moses-script-dir");
-		my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
-		my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
-		my $sym_method = &check_and_get("TRAINING:alignment-symmetrization-method");
-		my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
-		my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
-		my $srilm_dir = &check_and_get("GENERAL:srilm-dir");
+    my $moses_script_dir = &check_and_get("GENERAL:moses-script-dir");
+    my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
+    my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
+    my $sym_method = &check_and_get("TRAINING:alignment-symmetrization-method");
+    my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
+    my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
+    my $srilm_dir = &check_and_get("TRAINING:srilm-dir");
+    my $decoder = &get("TRAINING:transliteration-decoder");

    my $cmd = "$moses_script_dir/Transliteration/train-transliteration-module.pl";
    $cmd .= " --corpus-f $corpus.$input_extension";
@ -2166,6 +2166,7 @@ sub define_training_build_transliteration_model {
    $cmd .= " --alignment $alignment.$sym_method";
    $cmd .= " --out-dir $model";
    $cmd .= " --moses-src-dir $moses_src_dir";
+    $cmd .= " --decoder $decoder" if defined($decoder);
    $cmd .= " --external-bin-dir $external_bin_dir";
    $cmd .= " --srilm-dir $srilm_dir";
    $cmd .= " --input-extension $input_extension";
@ -2174,7 +2175,7 @@ sub define_training_build_transliteration_model {
    $cmd .= " --source-syntax " if &get("GENERAL:input-parser");
    $cmd .= " --target-syntax " if &get("GENERAL:output-parser");

-		&create_step($step_id, $cmd);
+    &create_step($step_id, $cmd);
 }

 sub define_training_extract_phrases {
@ -2496,10 +2497,19 @@ sub define_interpolated_lm_interpolate {
 	$interpolation_script, $tuning, @LM) = &get_output_and_input($step_id);
    my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
    my $group = &get("INTERPOLATED-LM:group");
+    my $weights = &get("INTERPOLATED-LM:weights");
    my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");

    my $cmd = "";

+    my %WEIGHT;
+    if (defined($weights)) {
+      foreach (split(/ *, */,$weights)) {
+        /^ *(\S+) *= *(\S+)/ || die("ERROR: wrong interpolation weight specification $_ ($weights)");
+        $WEIGHT{$1} = $2;
+      }
+    }
+
    # go through language models by factor and order 
    my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
    foreach my $factor (keys %{$ILM_SETS}) {
@ -2508,11 +2518,18 @@ sub define_interpolated_lm_interpolate {

        # get list of language model files
        my $lm_list = "";
+        my $weight_list = "";
        foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
          my ($id,$set) = split(/ /,$id_set,2);
          $lm_list .= $LM[$id].",";
+          if (defined($weights)) { 
+            die("ERROR: no interpolation weight set for $factor:$order:$set (factor:order:set)") 
+              unless defined($WEIGHT{"$factor:$order:$set"});
+            $weight_list .= $WEIGHT{"$factor:$order:$set"}.",";
+          }
        }
        chop($lm_list);
+        chop($weight_list);

        # if grouping, identify position in list
        my $numbered_string = "";
@ -2553,6 +2570,7 @@ sub define_interpolated_lm_interpolate {
        }
        $cmd .= "$interpolation_script --tuning $factored_tuning --name $name --srilm $srilm_dir --lm $lm_list";
        $cmd .= " --group \"$numbered_string\"" if defined($group);
+        $cmd .= " --weights \"$weight_list\"" if defined($weights);
        $cmd .= "\n";
      }
    }
@ -3418,10 +3436,11 @@ sub get_default_file {
 	    my $name = &construct_name($module,$set,$out);
 	    return &check_backoff_and_get($name);
 	}
-#	print "\t\tpassing $step -> ";
+#	print "\t\tpassing $step\n";
 	$i = $DEPENDENCY[$i][0];
 	$step = $DO_STEP[$i];
 #	print "\t\tbacking off to $step\n";
+        ($default_module,$default_set,$default_step) = &deconstruct_name($step);
    }

    # get file name
--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@ -12,7 +12,7 @@ binmode(STDERR, ":utf8");

 my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
 my $TEMPDIR = "/tmp";
-my ($TUNING,$LM,$NAME,$GROUP,$CONTINUE);
+my ($TUNING,$LM,$NAME,$GROUP,$WEIGHTS,$CONTINUE);

 die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\"]")
    unless &GetOptions('tuning=s' => => \$TUNING,
@ -21,6 +21,7 @@ die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--sril
 		       'tempdir=s' => \$TEMPDIR,
           'continue' => \$CONTINUE,
           'group=s' => \$GROUP,
+           'weights=s' => \$WEIGHTS,
 		       'lm=s' => \$LM);

 # check and set default to unset parameters
@ -32,6 +33,10 @@ die("ERROR: did not find srilm dir") unless -e $SRILM;
 die("ERROR: cannot run ngram") unless -x $SRILM."/ngram";

 my @LM = split(/,/,$LM);
+my @WEIGHT;
+@WEIGHT = split(/,/,$WEIGHTS) if defined($WEIGHTS);
+die("ERROR: different number of weights and language models: ".scalar(@WEIGHT)." vs. ".scalar(@LM))
+  if defined($WEIGHTS) && scalar(@WEIGHT) != scalar(@LM);

 # establish order
 my $order = 0;
@ -75,7 +80,7 @@ if (!defined($GROUP) && scalar(@LM) > 10) {

 # normal interpolation
 if (!defined($GROUP)) {
-  &interpolate($NAME,@LM);
+  &interpolate($NAME,\@WEIGHT,@LM);
  exit;
 }

@ -98,50 +103,59 @@ foreach my $subgroup (split(/ /,$GROUP)) {
  my $name = $NAME.".group-".chr(97+($g++));
  push @SUB_NAME,$name;
  print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
-  &interpolate($name, @SUB_LM) unless $CONTINUE && -e $name;
+  &interpolate($name, undef, @SUB_LM) unless $CONTINUE && -e $name;
 }
 for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
  next if defined($ALREADY{$lm_i});
  push @SUB_NAME, $LM[$lm_i];
 }
 print STDERR "\n=== BUILDING FINAL LM ===\n\n";
-&interpolate($NAME, @SUB_NAME);
+&interpolate($NAME, undef, @SUB_NAME);

 # main interpolation function
 sub interpolate {
-  my ($name,@LM) = @_;
+  my ($name,$WEIGHT,@LM) = @_;

  die("cannot interpolate more than 10 language models at once: ",join(",",@LM))
    if scalar(@LM) > 10;

  my $tmp = tempdir(DIR=>$TEMPDIR);
+  my @LAMBDA;

-  # compute perplexity
-  my $i = 0;
-  foreach my $lm (@LM) {
-    print STDERR "compute perplexity for $lm\n";
-    safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
-    print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
-    $i++;
+  # if weights are specified, use them
+  if (defined($WEIGHT) && scalar(@$WEIGHT) == scalar(@LM)) {
+    @LAMBDA = @$WEIGHT;
  }
+  # no specified weights -> compute them
+  else {

-  # compute lambdas
-  print STDERR "computing lambdas...\n";
-  my $cmd = "$SRILM/compute-best-mix";
-  for(my $i=0;$i<scalar(@LM);$i++) {
-    $cmd .= " $tmp/iplm.$$.$i";
+    # compute perplexity
+    my $i = 0;
+    foreach my $lm (@LM) {
+      print STDERR "compute perplexity for $lm\n";
+      safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
+      print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
+      $i++;
+    }
+
+    # compute lambdas
+    print STDERR "computing lambdas...\n";
+    my $cmd = "$SRILM/compute-best-mix";
+    for(my $i=0;$i<scalar(@LM);$i++) {
+      $cmd .= " $tmp/iplm.$$.$i";
+    }
+    my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
+    die "Failed to mix models: $mixerr" if $mixexitcode != 0;
+    my $mix = $mixout;
+    `rm $tmp/iplm.$$.*`;
+    $mix =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mix");
+    @LAMBDA = split(/ /,$1);
  }
-  my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
-  die "Failed to mix models: $mixerr" if $mixexitcode != 0;
-  my $mix = $mixout;
-  `rm $tmp/iplm.$$.*`;
-  $mix =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mix");
-  my @LAMBDA = split(/ /,$1);
-
+ 
  # create new language model
  print STDERR "creating new language model...\n";
-  $i = 0;
-  $cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
+  my $i = 0;
+  my $cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
  foreach my $lm (@LM) {
    $cmd .= " -lm " if $i==0;
    $cmd .= " -mix-lm " if $i==1;