Merge branch 'master' of github.com:moses-smt/mosesdecoder

2024-12-27 22:14:57 +03:00 · 2013-07-11 15:42:17 +01:00 · 2013-07-11 15:42:17 +01:00 · 175b540509
commit 175b540509
parent ddffe5e01b 7eb5e41ea1
10 changed files with 106 additions and 57 deletions
--- a/moses/FF/OSM-Feature/OpSequenceModel.cpp
+++ b/moses/FF/OSM-Feature/OpSequenceModel.cpp
@ -21,24 +21,6 @@ void OpSequenceModel :: readLanguageModel(const char *lmFile)
 {

  string unkOp = "_TRANS_SLF_";
-
-
-  /*
-
-  // Code for SRILM
-
-  vector <int> numbers;
-        int nonWordFlag = 0;
-
-  ptrOp = new Api;
-  ptrOp -> read_lm(lmFile,lmOrder);
-  numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str())));
-  unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag);
-
-  */
-
-  // Code to load KenLM
-
  OSM = new Model(m_lmPath.c_str());
  State startState = OSM->NullContextState();
  State endState;
@ -49,36 +31,6 @@ void OpSequenceModel :: readLanguageModel(const char *lmFile)
 void OpSequenceModel::Load()
 {

-  /*
-  // load future cost
-
-  //vector <string> input;
-  ifstream sr (m_featurePath.c_str());
-  char* tmp;
-
-  CHECK(sr.is_open());
-
-  vector<FactorType> factorOrder;
-  factorOrder.push_back(0);
-
-  string line;
-  while (std::getline(sr, line))
-  {
-    std::vector<std::string> tokens;
-    tokens = TokenizeMultiCharSeparator(line, "|||");
-    CHECK(tokens.size() == 3);
-
-    Phrase source, target;
-    source.CreateFromString(Input, factorOrder, tokens[0], "|", NULL);
-    target.CreateFromString(Output, factorOrder, tokens[1], "|", NULL);
-
-    ParallelPhrase pp(source, target);
-    Scores scores = Tokenize<float>(tokens[2], " ");
-    m_futureCost[pp] = scores;
-   // m_coll[pp] = scores;
-  }
-
-  */
  readLanguageModel(m_lmPath.c_str());

 }
@ -285,9 +237,8 @@ std::vector<float> OpSequenceModel::GetFutureScores(const Phrase &source, const

 void OpSequenceModel::SetParameter(const std::string& key, const std::string& value)
 {
-  if (key == "feature-path") {
-    m_featurePath = value;
-  } else if (key == "path") {
+
+  if (key == "path") {
    m_lmPath = value;
  } else if (key == "order") {
    lmOrder = Scan<int>(value);
--- a/moses/FF/OSM-Feature/OpSequenceModel.h
+++ b/moses/FF/OSM-Feature/OpSequenceModel.h
@ -60,8 +60,7 @@ protected:

  std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
  std::set <int> targetNullWords;
-  std::string m_featurePath, m_lmPath;
-
+  std::string m_lmPath;


 };
--- a/moses/TranslationModel/PhraseDictionaryTree.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTree.cpp
@ -496,7 +496,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
    if (numElement == NOT_FOUND) {
      // init numElement
      numElement = tokens.size();
-      CHECK(numElement >= 3);
+      CHECK(numElement >= (PrintWordAlignment()?4:3));
    }

    if (tokens.size() != numElement) {
--- a/scripts/OSM/OSM-Train.sh
+++ b/scripts/OSM/OSM-Train.sh
@ -0,0 +1,34 @@
+#!/bin/sh
+# Train an operation sequence model and binarize it with build_binary.
+# Args: $1 target corpus, $2 source corpus, $3 word alignment, $4 n-gram order,
+#       $5 output dir, $6 Moses source dir, $7 dir containing SRILM's ngram-count.
+set -e  # stop at the first failing step instead of building on missing files
+
+echo 'Training OSM - Start'
+date
+
+mkdir -p "$5"
+ln -sf "$1" "$5/e"
+ln -sf "$2" "$5/f"
+
+"$6/scripts/OSM/flipAlignment" "$3" > "$5/align"
+
+echo 'Extracting Singletons'
+"$6/scripts/OSM/extract-singletons.perl" "$5/e" "$5/f" "$5/align" > "$5/Singletons"
+
+echo 'Converting Bilingual Sentence Pair into Operation Corpus'
+"$6/scripts/OSM/generateSequences" "$5/e" "$5/f" "$5/align" "$5/Singletons" > "$5/opCorpus"	# Generates Operation Corpus
+
+echo 'Learning Operation Sequence Translation Model'
+"$7/ngram-count" -kndiscount -order "$4" -unk -text "$5/opCorpus" -lm "$5/operationLM"
+
+echo 'Binarizing'
+"$6/bin/build_binary" "$5/operationLM" "$5/operationLM.bin"
+
+# Drop the temporary corpus links and flipped alignment; the models stay in $5.
+rm -f "$5/e"
+rm -f "$5/f"
+rm -f "$5/align"
+
+echo 'Training OSM - End'
+date
--- a/scripts/OSM/extract-singletons.perl
+++ b/scripts/OSM/extract-singletons.perl
@ -0,0 +1,46 @@
+#!/usr/bin/perl
+# Print each source word whose only aligned use is a 1-1 link to an identical target word.
+use Getopt::Std;
+getopts('q');	# -q suppresses the progress counter on STDERR
+
+$target = shift;
+$source = shift;
+$align = shift or die "
+Usage: extract-singletons.perl target source align
+
+";
+open(TARGET,'<',$target) or die "Error: unable to open target file \"$target\"!\n";
+open(SOURCE,'<',$source) or die "Error: unable to open source file \"$source\"!\n";
+open(ALIGN,'<',$align) or die "Error: unable to open alignment file \"$align\"!\n";
+# The three files are read in lockstep: one sentence pair and its alignment per line.
+while (<TARGET>) {
+    unless (defined $opt_q) {
+	print STDERR "\r$M" if ++$M%1000 == 0;
+    }
+    @T = split;
+    defined($_ = <SOURCE>) or die "Error: source file \"$source\" has fewer lines than target file \"$target\"!\n";
+    @S = split;
+    defined($_ = <ALIGN>) or die "Error: alignment file \"$align\" has fewer lines than target file \"$target\"!\n";
+    @A = split;
+    # @A is a flat list of index pairs: target position first, then source position.
+    my(@source_links,@target_links);
+    for( $i=0; $i<=$#A; $i+=2 ) {
+	$target_links[$A[$i]]++;
+	$source_links[$A[$i+1]]++;
+    }
+    # A word's count ends at exactly 1 only if it was linked once, 1-1, to an identical word.
+    for( $i=0; $i<=$#A; $i+=2 ) {
+	if ($target_links[$A[$i]] == 1 && $source_links[$A[$i+1]] == 1 && 
+	    $T[$A[$i]] eq $S[$A[$i+1]])
+	{
+	    $count{$S[$A[$i+1]]}++; # Print this if it only occurs here
+	}
+	else {
+	    $count{$S[$A[$i+1]]}+=2; # Don't print this
+	}
+    }
+}
+
+foreach $w (sort keys %count) {
+    print "$w\n" if $count{$w}==1;
+}
--- a/scripts/OSM/flipAlignment
+++ b/scripts/OSM/flipAlignment
--- a/scripts/OSM/generateSequences
+++ b/scripts/OSM/generateSequences
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@ -511,6 +511,13 @@ pcfg-score
 	default-name: model/scored-corpus
 	pass-unless: use-pcfg-feature
 	template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
+build-osm
+	in: corpus word-alignment
+	out: osm-model
+	ignore-unless: operation-sequence-model
+	rerun-on-change: operation-sequence-model training-options script giza-settings 
+	template: $moses-script-dir/OSM/OSM-Train.sh IN0.$output-extension IN0.$input-extension IN1.$alignment-symmetrization-method $operation-sequence-model-order OUT $moses-src-dir $srilm-dir
+	default-name: model/OSM
 extract-phrases
 	in: corpus-mml-postfilter=OR=word-alignment scored-corpus
 	out: extracted-phrases
@ -579,7 +586,7 @@ build-sparse
        default-name: model/sparse-features
 	template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features"
 create-config
-	in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains INTERPOLATED-LM:binlm LM:binlm 
+	in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm 
 	out: config
 	ignore-if: use-hiero
 	rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@ -2164,11 +2164,13 @@ sub get_config_tables {
 sub define_training_create_config {
    my ($step_id) = @_;

-    my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,@LM)
+    my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
 			= &get_output_and_input($step_id);

    my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);

+    $cmd .= "-osm-model $osm/operationLM.bin " if $osm;
+	
    # sparse lexical features provide additional content for config file
    $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;

--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@ -31,7 +31,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
   $_DECODING_GRAPH_BACKOFF,
   $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
   @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
-   $_DONT_ZIP,  $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG,
+   $_DONT_ZIP,  $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM,
   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
   $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2,
   $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
@ -119,6 +119,7 @@ $_HELP = 1
 		       'xml' => \$_XML,
 		       'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
 		       'config=s' => \$_CONFIG,
+		       'osm-model=s' => \$_OSM,	
 		       'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
 		       'do-steps=s' => \$_DO_STEPS,
 		       'memscore:s' => \$_MEMSCORE,
@ -1992,6 +1993,15 @@ sub create_ini {
      }
  }

+  # operation sequence model
+
+  if($_OSM)
+  {
+
+      $feature_spec .= "OpSequenceModel num-features=5 path=". $_OSM . " \n";
+      $weight_spec  .= "OpSequenceModel0= 0.08 -0.02 0.02 -0.001 0.03\n";
+  }	
+
  # distance-based reordering
  if (!$_HIERARCHICAL) {
    $feature_spec .= "Distortion\n";