Training Scripts for Factored OSM

2024-12-26 21:42:19 +03:00 · 2013-08-26 13:21:04 +01:00 · 2013-08-26 13:21:04 +01:00 · fb35e1f3c9
commit fb35e1f3c9
parent 1444837ff9
3 changed files with 229 additions and 4 deletions
--- a/scripts/OSM/OSM-Train.perl
+++ b/scripts/OSM/OSM-Train.perl
@ -0,0 +1,186 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Getopt::Long "GetOptions";
+use FindBin qw($RealBin);
+
+print STDERR "Training OSM - Start\n".`date`;
+
+# Defaults; the command-line options parsed below override ORDER and OUT_DIR.
+my $ORDER = 5;                   # n-gram order handed to ngram-count (-order)
+my $OUT_DIR = "/tmp/osm.$$";     # working directory; $$ is this process id
+my $___FACTOR_DELIMITER = "|";   # separates the factors inside one token
+my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR);
+
+# utilities
+my $ZCAT = "gzip -cd";
+my $BZCAT = "bzcat";
+
+# GetOptions returns false when the command line could not be parsed.
+die("ERROR: wrong syntax when invoking OSM-Train.perl")
+    unless GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR,
+                      'corpus-f=s' => \$CORPUS_F,
+                      'corpus-e=s' => \$CORPUS_E,
+                      'alignment=s' => \$ALIGNMENT,
+                      'order=i' => \$ORDER,
+                      'factor=s' => \$FACTOR,
+                      'srilm-dir=s' => \$SRILM_DIR,
+                      'out-dir=s' => \$OUT_DIR);
+
+# check if the files are in place
+# (--order, --factor and --out-dir are optional; everything else is required)
+die("ERROR: you need to define --corpus-e, --corpus-f, --alignment, --srilm-dir, and --moses-src-dir")
+    unless (defined($MOSES_SRC_DIR) &&
+            defined($CORPUS_F) &&
+            defined($CORPUS_E) &&
+            defined($ALIGNMENT) &&
+            defined($SRILM_DIR));
+die("ERROR: could not find input corpus file '$CORPUS_F'")
+    unless -e $CORPUS_F;
+die("ERROR: could not find output corpus file '$CORPUS_E'")
+    unless -e $CORPUS_E;
+die("ERROR: could not find alignment file '$ALIGNMENT'")
+    unless -e $ALIGNMENT;
+# flipAlignment is the first OSM helper that gets run, so use it as the probe.
+die("ERROR: could not find OSM scripts in '$MOSES_SRC_DIR/scripts/OSM'")
+    unless -e "$MOSES_SRC_DIR/scripts/OSM/flipAlignment";
+
+# Prepare the working directory and the flipped alignment that every model
+# trained below reads from $OUT_DIR/align.
+`mkdir $OUT_DIR`;
+`$MOSES_SRC_DIR/scripts/OSM/flipAlignment $ALIGNMENT > $OUT_DIR/align`;
+
+if (defined($FACTOR)) {
+    # --factor is a comma-separated list of SOURCE-TARGET factor pairs; one
+    # model is trained per pair, in the sub-directory $OUT_DIR/<pair>.
+
+    # Split each corpus path at its last dot into stem and extension.  The
+    # paths do not change inside the loop, so this is done once, and a failed
+    # match is reported instead of silently reusing stale capture variables.
+    my ($corpus_stem_f,$ext_f) = $CORPUS_F =~ /^(.+)\.([^\.]+)/
+        or die("ERROR: corpus file '$CORPUS_F' needs an extension to build factored file names");
+    my ($corpus_stem_e,$ext_e) = $CORPUS_E =~ /^(.+)\.([^\.]+)/
+        or die("ERROR: corpus file '$CORPUS_E' needs an extension to build factored file names");
+
+    foreach my $factor_val (split(',', $FACTOR)) {
+        my ($factor_f,$factor_e) = split(/\-/,$factor_val);
+        die("ERROR: --factor entry '$factor_val' must have the form SOURCE-TARGET")
+            unless defined($factor_f) && defined($factor_e);
+
+        `mkdir $OUT_DIR/$factor_val`;
+
+        # Reduced corpora are written next to the originals.
+        my $reduced_f = "$corpus_stem_f.$factor_val.$ext_f";
+        my $reduced_e = "$corpus_stem_e.$factor_val.$ext_e";
+        reduce_factors($CORPUS_F,$reduced_f,$factor_f);
+        reduce_factors($CORPUS_E,$reduced_e,$factor_e);
+
+        # create_model reads its input through the fixed names f and e.
+        `ln -s $reduced_f $OUT_DIR/$factor_val/f`;
+        `ln -s $reduced_e $OUT_DIR/$factor_val/e`;
+        create_model($factor_val);
+    }
+}
+else {
+    # No factors requested: train a single model directly in $OUT_DIR.
+    `ln -s $CORPUS_F $OUT_DIR/f`;
+    `ln -s $CORPUS_E $OUT_DIR/e`;
+    create_model("");
+}
+
+print "Training OSM - End\n".`date`;
+
+# create_model($factor_val)
+#
+# Runs the four training commands for one model.  All inputs and outputs live
+# in $OUT_DIR/$factor_val (an empty $factor_val therefore means $OUT_DIR
+# itself); the alignment is always read from $OUT_DIR/align.  Each command is
+# run through backticks, so whatever it writes to stdout is captured rather
+# than shown, and its exit status is not checked.
+sub create_model{
+    my ($factor_val) = @_;
+
+    my $model_dir = "$OUT_DIR/$factor_val";
+    my $osm_dir   = "$MOSES_SRC_DIR/scripts/OSM";
+
+    # Announce a step, then execute its command line.
+    my $run_step = sub {
+        my ($announcement, $command_line) = @_;
+        print $announcement;
+        return `$command_line`;
+    };
+
+    print "Creating Model ".$factor_val."\n";
+
+    $run_step->("Extracting Singletons\n",
+        "$osm_dir/extract-singletons.perl $model_dir/e $model_dir/f $OUT_DIR/align > $model_dir/Singletons");
+
+    $run_step->("Converting Bilingual Sentence Pair into Operation Corpus\n",
+        "$osm_dir/generateSequences $model_dir/e $model_dir/f $OUT_DIR/align $model_dir/Singletons > $model_dir/opCorpus");
+
+    $run_step->("Learning Operation Sequence Translation Model\n",
+        "$SRILM_DIR/ngram-count -kndiscount -order $ORDER -unk -text $model_dir/opCorpus -lm $model_dir/operationLM");
+
+    $run_step->("Binarizing\n",
+        "$MOSES_SRC_DIR/bin/build_binary $model_dir/operationLM $model_dir/operationLM.bin");
+}
+
+# from train-model.perl
+#
+# reduce_factors($full, $reduced, $factors)
+#
+#   $full     factored corpus to read (plain, or .gz/.bz2 via open_or_zcat)
+#   $reduced  file to produce
+#   $factors  comma-separated, zero-based factor indices to keep
+#
+# Writes $reduced with every token cut down to the requested factors, which
+# are emitted in ascending index order joined by "|".  Nothing is done when
+# $reduced (or $reduced.gz) already exists.  When the request covers exactly
+# the factors found in the first token of $full, a symlink to $full is made
+# instead of a copy.  While the copy is written, $reduced.lock exists so that
+# other callers of this routine wait.
+#
+# Dies if $full is empty, if a file cannot be created, or if a token lacks a
+# requested factor; in the last case the partial output and the lock are
+# removed first so that a later run neither reuses a truncated file nor waits
+# on a stale lock.
+sub reduce_factors {
+    my ($full,$reduced,$factors) = @_;
+
+    my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);
+
+    print "Reducing factors to produce $reduced  @ ".`date`;
+    # A lock file means the same output is currently being written: wait.
+    while(-e $reduced.".lock") {
+        sleep(10);
+    }
+    if (-e $reduced) {
+        print STDERR "  $reduced in place, reusing\n";
+        return;
+    }
+    if (-e $reduced.".gz") {
+        print STDERR "  $reduced.gz in place, reusing\n";
+        return;
+    }
+
+    # peek at input, to check if we are asked to produce exactly the
+    # available factors
+    my $peekh = open_or_zcat($full);
+    my $firstline = <$peekh>;
+    die "Corpus file $full is empty" unless $firstline;
+    close $peekh;
+    # pick first word
+    $firstline =~ s/^\s*//;
+    $firstline =~ s/\s.*//;
+    # count factors
+    my $maxfactorindex = $firstline =~ tr/|/|/;
+    if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
+        # create just symlink; preserving compression
+        my $realfull = $full;
+        if (!-e $realfull && -e $realfull.".gz") {
+            $realfull .= ".gz";
+            $reduced =~ s/(\.gz)?$/.gz/;
+        }
+        # The builtin is used because this script defines no safesystem().
+        symlink($realfull, $reduced)
+            or die "Failed to create symlink $realfull -> $reduced: $!";
+        return;
+    }
+
+    # The default is to select the needed factors
+    my $lock = $reduced.".lock";
+    open(my $lockh, '>', $lock) or die "ERROR: Can't create $lock: $!";
+    close($lockh);
+
+    my $inh = open_or_zcat($full);
+    my $outh;
+    unless (open($outh, '>', $reduced)) {
+        unlink($lock);
+        die "ERROR: Can't write $reduced";
+    }
+
+    my $nr = 0;
+    while(my $line = <$inh>) {
+        $nr++;
+        print STDERR "." if $nr % 10000 == 0;
+        print STDERR "($nr)" if $nr % 100000 == 0;
+        chomp($line);
+        $line =~ s/ +/ /g;
+        $line =~ s/^ //;
+        $line =~ s/ $//;
+
+        my @reduced_tokens;
+        foreach my $token (split(' ', $line)) {
+            # \Q...\E disables regex metacharacters in the delimiter
+            my @FACTOR = split(/\Q$___FACTOR_DELIMITER\E/, $token);
+            my @kept;
+            foreach my $outfactor (@INCLUDE) {
+                my $value = $FACTOR[$outfactor];
+                if (!defined $value) {
+                    # do not leave a truncated file or a stale lock behind
+                    close($outh);
+                    unlink($reduced, $lock);
+                    die "ERROR: Couldn't find factor $outfactor in token \"$token\" in $full LINE $nr";
+                }
+                push @kept, $value;
+            }
+            push @reduced_tokens, join("|", @kept);
+        }
+        print $outh join(" ", @reduced_tokens), "\n";
+    }
+    print STDERR "\n";
+    # buffered write errors only surface when the handle is closed
+    unless (close($outh)) {
+        my $error = $!;
+        unlink($lock);
+        die "ERROR: Failed to write $reduced: $error";
+    }
+    close($inh);
+    unlink($lock);
+}
+
+# open_or_zcat($fn)
+#
+# Returns a read handle for $fn.  If $fn does not exist but $fn.gz or
+# $fn.bz2 does, that file is used instead.  Names ending in .gz are read
+# through $ZCAT and names ending in .bz2 through $BZCAT; anything else is
+# opened directly.  Dies if the file or the decompressor cannot be opened.
+#
+# The file name is never handed to a shell or parsed as an open mode:
+# plain files use 3-argument open, decompressors use list-form pipe open.
+sub open_or_zcat {
+    my $fn = shift;
+    $fn = $fn.".gz" if ! -e $fn && -e $fn.".gz";
+    $fn = $fn.".bz2" if ! -e $fn && -e $fn.".bz2";
+
+    # $ZCAT carries its own arguments ("gzip -cd"), so split into words.
+    my @decompressor;
+    if ($fn =~ /\.bz2$/) {
+        @decompressor = split(' ', $BZCAT);
+    } elsif ($fn =~ /\.gz$/) {
+        @decompressor = split(' ', $ZCAT);
+    }
+
+    my $hdl;
+    if (@decompressor) {
+        open($hdl, '-|', @decompressor, $fn)
+            or die "Can't read $fn (@decompressor $fn|): $!";
+    } else {
+        open($hdl, '<', $fn) or die "Can't read $fn ($fn): $!";
+    }
+    return $hdl;
+}
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@ -2167,7 +2167,23 @@ sub define_training_create_config {

    my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);

-    $cmd .= "-osm-model $osm/operationLM.bin " if $osm;
+    if($osm){
+      
+      my $osm_settings = &get("TRAINING:operation-sequence-model-settings"); 
+     
+
+	if($osm_settings =~ /factor/){
+	
+		$cmd .= "-osm-model $osm/ ";
+		my $find = "--factor";
+		my $replace = "-osm-setting";
+		$osm_settings =~ s/$find/$replace/g;
+      		$cmd .= "$osm_settings ";       
+       }
+	else{
+	 $cmd .= "-osm-model $osm/operationLM.bin ";
+	}
+    }
 	
    # sparse lexical features provide additional content for config file
    $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@ -31,7 +31,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
   $_DECODING_GRAPH_BACKOFF,
   $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
   @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
-   $_DONT_ZIP,  $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM,
+   $_DONT_ZIP,  $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS,
   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
   $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2,
   $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
@ -119,7 +119,8 @@ $_HELP = 1
 		       'xml' => \$_XML,
 		       'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
 		       'config=s' => \$_CONFIG,
-		       'osm-model=s' => \$_OSM,	
+		       'osm-model=s' => \$_OSM,
+			'osm-setting=s' => \$_OSM_FACTORS,		
 		       'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
 		       'do-steps=s' => \$_DO_STEPS,
 		       'memscore:s' => \$_MEMSCORE,
@ -1997,9 +1998,31 @@ sub create_ini {

  if($_OSM)
  {
+    if (defined($_OSM_FACTORS))
+    {
+	my $count = 0;
+	my @factor_values = split(',', $_OSM_FACTORS);
+    	foreach my $factor_val (@factor_values) {

-      $feature_spec .= "OpSequenceModel num-features=5 path=". $_OSM . " \n";
+		my ($factor_f,$factor_e) = split(/\-/,$factor_val);
+
+		if($count == 0){
+		$feature_spec .= "OpSequenceModel$count num-features=5 path=". $_OSM . $factor_val . "/operationLM.bin" . " sFactor=". $factor_f . " tFactor=". $factor_e . " numFeatures=5 \n";
+	       $weight_spec  .= "OpSequenceModel$count= 0.08 -0.02 0.02 -0.001 0.03\n";		
+		}
+		else{
+			$feature_spec .= "OpSequenceModel$count num-features=1 path=". $_OSM . $factor_val . "/operationLM.bin" . " sFactor=". $factor_f . " tFactor=". $factor_e . " numFeatures=1 \n";
+	       	$weight_spec  .= "OpSequenceModel$count= 0.08 \n";	
+
+		}
+		$count++;
+	}
+    }
+    else
+    {
+      $feature_spec .= "OpSequenceModel0 num-features=5 path=". $_OSM . " \n";
      $weight_spec  .= "OpSequenceModel0= 0.08 -0.02 0.02 -0.001 0.03\n";
+    }
  }	

  # distance-based reordering