merge

2024-12-28 14:32:38 +03:00 · 2013-07-24 21:48:14 +01:00 · 2013-07-24 21:48:14 +01:00 · c2489c7d8b
commit c2489c7d8b
parent 6ac4d4ddad f79746b3c2
3 changed files with 110 additions and 98 deletions
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@ -598,7 +598,7 @@ binarize-config
 	pass-unless: binarize-all
 	rerun-on-change: config
 	default-name: model/moses.bin.ini
-	template: $binarize-all $ttable-binarizer $rtable-binarizer OUT IN
+	template: $binarize-all IN OUT -Binarizer $ttable-binarizer 
 hiero-compile-source-suffix-array
 	in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
 	out: hiero-source-suffix-array
@ -835,33 +835,31 @@ filter
 	out: filtered-dir
 	default-name: tuning/filtered
 	rerun-on-change: filter-settings
-	pass-if: TRAINING:binarize-all
-	ignore-if: use-hiero
+	ignore-if: TRAINING:binarize-all
 	error: already exists. Please delete
 filter-devtest
 	in: input-devtest TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table
 	out: filtered-dir-devtest
 	default-name: tuning/filtered.devtest
 	rerun-on-change: filter-settings
-	pass-if: TRAINING:binarize-all
+	ignore-if: TRAINING:binarize-all
 	ignore-unless: use-mira	
 	error: already exists. Please delete
 apply-filter
-	in: TRAINING:config filtered-dir
+	in: TRAINING:bin-config filtered-dir
 	out: filtered-config
 	default-name: tuning/moses.filtered.ini
-	pass-if: TRAINING:binarize-all
-	ignore-if: use-hiero
+	ignore-if: TRAINING:binarize-all
 	template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
 apply-filter-devtest
-	in: TRAINING:config filtered-dir-devtest
+	in: TRAINING:bin-config filtered-dir-devtest
 	out: filtered-config-devtest
 	default-name: tuning/moses.filtered.devtest.ini
 	pass-if: TRAINING:binarize-all
 	ignore-unless: use-mira
 	template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
 tune
-	in: filtered-config input reference filtered-config-devtest input-devtest reference-devtest
+	in: TRAINING:bin-config input reference filtered-config-devtest input-devtest reference-devtest filtered-config 
 	out: weight-config
 	ignore-if: use-hiero
 	qsub-script: yes
@ -869,7 +867,7 @@ tune
 	rerun-on-change: decoder-settings tuning-settings nbest lambda async
 	not-error: trans: No such file or directory
 apply-weights
-	in: TRAINING:config weight-config
+	in: TRAINING:bin-config weight-config
 	out: config-with-reused-weights
 	ignore-if: use-hiero
 	default-name: tuning/moses.tuned.ini
@ -958,14 +956,13 @@ filter
 	ignore-if: use-hiero
 	error: already exists. Please delete
 apply-filter
-	in: filtered-dir TRAINING:config TUNING:config-with-reused-weights
+	in: TUNING:config-with-reused-weights filtered-dir
 	out: filtered-config
 	default-name: evaluation/filtered.ini
-	pass-if: TRAINING:binarize-all
-	ignore-if: use-hiero
+	ignore-if: TRAINING:binarize-all
 	template: $moses-script-dir/ems/support/substitute-filtered-tables-and-weights.perl IN/moses.ini IN1 IN2 OUT
 decode
-	in: filtered-config input
+	in: TUNING:config-with-reused-weights input filtered-config
 	out: system-output
 	default-name: evaluation/output
 	qsub-script: yes
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@ -39,9 +39,11 @@ my $binarizer = undef;
 my $min_score = undef;
 my $opt_min_non_initial_rule_count = undef;
 my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
+my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice

 GetOptions(
    "gzip!" => \$opt_gzip,
+    "filter!" => \$opt_filter,
    "Hierarchical" => \$opt_hierarchical,
    "Binarizer=s" => \$binarizer,
    "MinScore=s" => \$min_score,
@ -253,32 +255,34 @@ if ($opt_hierarchical) {
 } #if ($opt_hierarchical) {

 my %PHRASE_USED;
-if (!$opt_hierarchical) {
-    # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
-    open(INPUT,mk_open_string($input)) or die "Can't read $input";
-    while(my $line = <INPUT>) {
-        chomp($line);
-        my @WORD = split(/ +/,$line);
-        for(my $i=0;$i<=$#WORD;$i++) {
-            for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
-                foreach (keys %CONSIDER_FACTORS) {
-                    my @FACTOR = split(/,/);
-                    my $phrase = "";
-                    for(my $k=$i;$k<=$i+$j;$k++) {
-                        my @WORD_FACTOR = split(/\|/,$WORD[$k]);
-                        for(my $f=0;$f<=$#FACTOR;$f++) {
-                            $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
-                        }
-                        chop($phrase);
-                        $phrase .= " ";
-                    }
-                    chop($phrase);
-                    $PHRASE_USED{$_}{$phrase}++;
-                }
-            }
-        }
-    }
-    close(INPUT);
+if ($opt_filter) {
+  if (!$opt_hierarchical) {
+      # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
+      open(INPUT,mk_open_string($input)) or die "Can't read $input";
+      while(my $line = <INPUT>) {
+          chomp($line);
+          my @WORD = split(/ +/,$line);
+          for(my $i=0;$i<=$#WORD;$i++) {
+              for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
+                  foreach (keys %CONSIDER_FACTORS) {
+                      my @FACTOR = split(/,/);
+                      my $phrase = "";
+                      for(my $k=$i;$k<=$i+$j;$k++) {
+                          my @WORD_FACTOR = split(/\|/,$WORD[$k]);
+                          for(my $f=0;$f<=$#FACTOR;$f++) {
+                              $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
+                          }
+                          chop($phrase);
+                          $phrase .= " ";
+                      }
+                      chop($phrase);
+                      $PHRASE_USED{$_}{$phrase}++;
+                  }
+              }
+          }
+      }
+      close(INPUT);
+  }
 }

 # filter files
@ -288,79 +292,89 @@ for(my $i=0;$i<=$#TABLE;$i++) {
    my $factors = $TABLE_FACTORS[$i];
    my $new_file = $TABLE_NEW_NAME[$i];
    print STDERR "filtering $file -> $new_file...\n";
-
-    my $openstring = mk_open_string($file);
-
-    my $new_openstring;
-    if ($new_file =~ /\.gz$/) {
-      $new_openstring = "| gzip -c > $new_file";
+    my $mid_file = $new_file; # used when both filtering and binarizing
+    if (!$opt_filter) {
+      # check if original file was gzipped
+      if ($file !~ /\.gz$/ && -e "$file.gz") {
+        $file .= ".gz";
+      }
+      $mid_file .= ".gz" if $file =~ /\.gz$/;
+      safesystem("ln -s $file $mid_file");
    } else {
-      $new_openstring = ">$new_file";
-    }

-    open(FILE_OUT,$new_openstring) or die "Can't write to $new_openstring";
+      $mid_file .= ".gz"
+        if $mid_file !~ /\.gz/
+           && $binarizer && $binarizer =~ /processPhraseTable/;

-    if ($opt_hierarchical) {
-        my $tmp_input = $TMP_INPUT_FILENAME{$factors};
-        my $options = "";
-        $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
-        open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |");
-        while (my $line = <PIPE>) {
-            print FILE_OUT $line
-        }
-        close(FILEHANDLE);
-    } else {
-        open(FILE,$openstring) or die "Can't open '$openstring'";
-        while(my $entry = <FILE>) {
-            my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
-            $foreign =~ s/ $//;
-            if (defined($PHRASE_USED{$factors}{$foreign})) {
-                # handle min_score thresholds
-                if ($min_score) {
-                   my @ITEM = split(/ *\|\|\| */,$rest);
-                   if(scalar (@ITEM)>2) { # do not filter reordering table
-                     my @SCORE = split(/ /,$ITEM[1]);
-                     my $okay = 1;
-                     foreach my $id (keys %MIN_SCORE) {
-                       $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id};
+      my $openstring = mk_open_string($file);
+
+      my $mid_openstring;
+      if ($mid_file =~ /\.gz$/) {
+        $mid_openstring = "| gzip -c > $mid_file";
+      } else {
+        $mid_openstring = ">$mid_file";
+      }
+
+
+      open(FILE_OUT,$mid_openstring) or die "Can't write to $mid_openstring";
+
+      if ($opt_hierarchical) {
+          my $tmp_input = $TMP_INPUT_FILENAME{$factors};
+          my $options = "";
+          $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
+          open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |");
+          while (my $line = <PIPE>) {
+              print FILE_OUT $line
+          }
+          close(FILEHANDLE);
+      } else {
+          open(FILE,$openstring) or die "Can't open '$openstring'";
+          while(my $entry = <FILE>) {
+              my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
+              $foreign =~ s/ $//;
+              if (defined($PHRASE_USED{$factors}{$foreign})) {
+                  # handle min_score thresholds
+                  if ($min_score) {
+                     my @ITEM = split(/ *\|\|\| */,$rest);
+                     if(scalar (@ITEM)>2) { # do not filter reordering table
+                       my @SCORE = split(/ /,$ITEM[1]);
+                       my $okay = 1;
+                       foreach my $id (keys %MIN_SCORE) {
+                         $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id};
+                       }
+                       next unless $okay;
                     }
-                     next unless $okay;
-                   }
-                }
-                print FILE_OUT $entry;
-                $used++;
-            }
-            $total++;
-        }
-        close(FILE);
-        die "No phrases found in $file!" if $total == 0;
-        printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
+                  }
+                  print FILE_OUT $entry;
+                  $used++;
+              }
+              $total++;
+          }
+          close(FILE);
+          die "No phrases found in $file!" if $total == 0;
+          printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
+      }
    }

+    my $catcmd = ($mid_file =~ /\.gz$/ ? "$ZCAT" : "cat");
    if(defined($binarizer)) {
      print STDERR "binarizing...";
      # translation model
      if ($KNOWN_TTABLE{$i}) {
        # ... hierarchical translation model
        if ($opt_hierarchical) {
-          my $cmd = "$binarizer $new_file $new_file.bin";
+          my $cmd = "$binarizer $mid_file $new_file.bin";
          print STDERR $cmd."\n";
          print STDERR `$cmd`;
        }
        # ... phrase translation model
        elsif ($binarizer =~ /processPhraseTableMin/) {
          #compact phrase table
-          my $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
+          my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $dir > $mid_file.sorted; $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $mid_file.sorted";
          print STDERR $cmd."\n";
          print STDERR `$cmd`;
-        }
-        elsif ($binarizer =~ /CreateOnDiskPt/) {
-          my $cmd = "$binarizer $new_file $new_file.bin";
-          print STDERR $cmd."\n";
-          print STDERR `$cmd`;
-        }
-        else { 
-          my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
+        } else { 
+          my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
          print STDERR $cmd."\n";
          print STDERR `$cmd`;
        }
@ -371,10 +385,10 @@ for(my $i=0;$i<=$#TABLE;$i++) {
        $lexbin =~ s/PhraseTable/LexicalTable/;
        my $cmd;
        if ($lexbin =~ /processLexicalTableMin/) {
-          $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted;  $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
+          $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $dir > $mid_file.sorted;  $lexbin -in $mid_file.sorted -out $new_file; rm $mid_file.sorted";
        } else {
          $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
-          $cmd = "$lexbin -in $new_file -out $new_file";
+          $cmd = "$lexbin -in $mid_file -out $new_file";
        }
        print STDERR $cmd."\n";
        print STDERR `$cmd`;
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@ -2055,11 +2055,12 @@ sub create_ini {

  # lattice feature
  if ($_NUM_LATTICE_FEATURES) {
-    print INI "\n\n#lattice or confusion net weights\n[weight-i]\n";
+    $feature_spec .= "InputFeature num-input-features=$_NUM_LATTICE_FEATURES\n";
+    $weight_spec .= "InputFeature0=";
    for (1..$_NUM_LATTICE_FEATURES) {
-      print INI "0.1\n";
+      $weight_spec .= " 0.1";
    }
-    print "\n";
+    $weight_spec .= "\n";
  }

  # get additional content for config file from switch or file