From de45d7076a4d7dc312b63c2e99e4bb5170284095 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Tue, 1 Jan 2013 17:27:26 +0000
Subject: [PATCH] add [feature] arg. Use for GlobalLexicalModelUnlimited. Not
 tested

---
 moses/GlobalLexicalModelUnlimited.cpp |  55 +++++++++++++
 moses/GlobalLexicalModelUnlimited.h   |   1 +
 moses/StaticData.cpp                  | 107 +++++++-------------
 moses/StaticData.h                    |   2 -
 4 files changed, 84 insertions(+), 81 deletions(-)

diff --git a/moses/GlobalLexicalModelUnlimited.cpp b/moses/GlobalLexicalModelUnlimited.cpp
index f1de65bd0..d0aad8986 100644
--- a/moses/GlobalLexicalModelUnlimited.cpp
+++ b/moses/GlobalLexicalModelUnlimited.cpp
@@ -8,6 +8,61 @@ using namespace std;
 namespace Moses
 {
 
+GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line)
+:StatelessFeatureFunction("glm",ScoreProducer::unlimited)
+{
+  const vector<string> modelSpec = Tokenize(line);
+
+  for (size_t i = 0; i < modelSpec.size(); i++ ) {
+    bool ignorePunctuation = true, biasFeature = false, restricted = false;
+    size_t context = 0;
+    string filenameSource, filenameTarget;
+    vector< string > factors;
+    vector< string > spec = Tokenize(modelSpec[i]," ");
+
+    // read optional punctuation and bias specifications
+    if (spec.size() > 0) {
+      if (spec.size() != 2 && spec.size() != 3 && spec.size() != 4 && spec.size() != 6) {
+        UserMessage::Add("Format of glm feature is <factor-src>-<factor-tgt> [ignore-punct] [use-bias] "
+                         "[context-type] [filename-src filename-tgt]");
+        //return false;
+      }
+
+      factors = Tokenize(spec[0],"-");
+      if (spec.size() >= 2)
+        ignorePunctuation = Scan<bool>(spec[1]);
+      if (spec.size() >= 3)
+        biasFeature = Scan<bool>(spec[2]);
+      if (spec.size() >= 4)
+        context = Scan<size_t>(spec[3]);
+      if (spec.size() == 6) {
+        filenameSource = spec[4];
+        filenameTarget = spec[5];
+        restricted = true;
+      }
+    }
+    else
+      factors = Tokenize(modelSpec[i],"-");
+
+    if ( factors.size() != 2 ) {
+      UserMessage::Add("Wrong factor definition for global lexical model unlimited: " + modelSpec[i]);
+      //return false;
+    }
+
+    const vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
+    const vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");
+    throw runtime_error("GlobalLexicalModelUnlimited should be reimplemented as a stateful feature");
+    GlobalLexicalModelUnlimited* glmu = NULL; // new GlobalLexicalModelUnlimited(inputFactors, outputFactors, biasFeature, ignorePunctuation, context);
+
+    if (restricted) {
+      cerr << "loading word translation word lists from " << filenameSource << " and " << filenameTarget << endl;
+      if (!glmu->Load(filenameSource, filenameTarget)) {
+        UserMessage::Add("Unable to load word lists for word translation feature from files " + filenameSource + " and " + filenameTarget);
+        //return false;
+      }
+    }
+  }
+}
 
 bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource,
     const std::string &filePathTarget)
diff --git a/moses/GlobalLexicalModelUnlimited.h b/moses/GlobalLexicalModelUnlimited.h
index 747031102..af93559c6 100644
--- a/moses/GlobalLexicalModelUnlimited.h
+++ b/moses/GlobalLexicalModelUnlimited.h
@@ -68,6 +68,7 @@ private:
   std::set<std::string> m_vocabTarget;
 
 public:
+  GlobalLexicalModelUnlimited(const std::string &line);
   GlobalLexicalModelUnlimited(const std::vector< FactorType >& inFactors, const std::vector< FactorType >& outFactors,
       bool biasFeature, bool ignorePunctuation, size_t context):
     StatelessFeatureFunction("glm",ScoreProducer::unlimited),
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index 750cdf23c..34294fa98 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -83,6 +83,21 @@ static size_t CalcMax(size_t x, const vector<size_t>& y, const vector<size_t>& z
   return max;
 }
 
+int GetFeatureIndex(std::map<string, int> &map, const string &featureName)
+{
+  std::map<string, int>::iterator iter;
+  iter = map.find(featureName);
+  if (iter == map.end()) {
+    map[featureName] = 0;
+    return 0;
+  }
+  else {
+    int &index = iter->second;
+    ++index;
+    return index;
+  }
+}
+
 StaticData StaticData::s_instance;
 
 StaticData::StaticData()
@@ -530,14 +545,24 @@ SetWeight(m_unknownWordPenaltyProducer, weightUnknownWord);
   }
 
   // all features
+  map<string, int> featureIndexMap;
+
   const vector<string> &features = m_parameter->GetParam("feature");
   for (size_t i = 0; i < features.size(); ++i) {
     const string &line = features[i];
     vector<string> toks = Tokenize(line);
 
-    if (toks[0] == "GlobalLexicalModel") {
+    const string &feature = toks[0];
+    int featureIndex = GetFeatureIndex(featureIndexMap, feature);
+
+    if (feature == "GlobalLexicalModel") {
       GlobalLexicalModel *model = new GlobalLexicalModel(line);
-      const vector<float> &weights = m_parameter->GetWeights(toks[0], 0);
+      const vector<float> &weights = m_parameter->GetWeights(feature, featureIndex);
+      SetWeights(model, weights);
+    }
+    else if (feature == "glm") {
+      GlobalLexicalModelUnlimited *model = NULL; //new GlobalLexicalModelUnlimited(line);
+      const vector<float> &weights = m_parameter->GetWeights(feature, featureIndex);
       SetWeights(model, weights);
     }
 
@@ -553,7 +578,6 @@ SetWeight(m_unknownWordPenaltyProducer, weightUnknownWord);
   if (!LoadLanguageModels()) return false;
   if (!LoadGenerationTables()) return false;
   if (!LoadPhraseTables()) return false;
-  if (!LoadGlobalLexicalModelUnlimited()) return false;
   if (!LoadDecodeGraphs()) return false;
   if (!LoadReferences()) return false;
   if (!LoadDiscrimLMFeature()) return false;
@@ -601,16 +625,7 @@ SetWeight(m_unknownWordPenaltyProducer, weightUnknownWord);
       UserMessage::Add("Unable to load weights from " + extraWeightConfig[0]);
       return false;
     }
-
-    // GLM: apply additional weight to sparse features if applicable
-    for (size_t i = 0; i < m_globalLexicalModelsUnlimited.size(); ++i) {
-      float weight = m_globalLexicalModelsUnlimited[i]->GetSparseProducerWeight();
-      if (weight != 1) {
-        AddSparseProducer(m_globalLexicalModelsUnlimited[i]);
-        cerr << "glm sparse producer weight: " << weight << endl;
-      }
-    }
-
+
     m_allWeights.PlusEquals(extraWeights);
   }
 
@@ -778,72 +793,6 @@ bool StaticData::LoadLexicalReorderingModel()
   return true;
 }
 
-bool StaticData::LoadGlobalLexicalModelUnlimited()
-{
-  const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-glm"));
-  const vector<string> &modelSpec = m_parameter->GetParam("glm-feature");
-
-  if (weight.size() != 0 && weight.size() != modelSpec.size()) {
-    std::cerr << "number of sparse producer weights and model specs for the global lexical model unlimited "
-              "does not match (" << weight.size() << " != " << modelSpec.size() << ")" << std::endl;
-    return false;
-  }
-
-  for (size_t i = 0; i < modelSpec.size(); i++ ) {
-    bool ignorePunctuation = true, biasFeature = false, restricted = false;
-    size_t context = 0;
-    string filenameSource, filenameTarget;
-    vector< string > factors;
-    vector< string > spec = Tokenize(modelSpec[i]," ");
-
-    // read optional punctuation and bias specifications
-    if (spec.size() > 0) {
-      if (spec.size() != 2 && spec.size() != 3 && spec.size() != 4 && spec.size() != 6) {
-        UserMessage::Add("Format of glm feature is <factor-src>-<factor-tgt> [ignore-punct] [use-bias] "
-                         "[context-type] [filename-src filename-tgt]");
-        return false;
-      }
-
-      factors = Tokenize(spec[0],"-");
-      if (spec.size() >= 2)
-        ignorePunctuation = Scan<bool>(spec[1]);
-      if (spec.size() >= 3)
-        biasFeature = Scan<bool>(spec[2]);
-      if (spec.size() >= 4)
-        context = Scan<size_t>(spec[3]);
-      if (spec.size() == 6) {
-        filenameSource = spec[4];
-        filenameTarget = spec[5];
-        restricted = true;
-      }
-    }
-    else
-      factors = Tokenize(modelSpec[i],"-");
-
-    if ( factors.size() != 2 ) {
-      UserMessage::Add("Wrong factor definition for global lexical model unlimited: " + modelSpec[i]);
-      return false;
-    }
-
-    const vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
-    const vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");
-    throw runtime_error("GlobalLexicalModelUnlimited should be reimplemented as a stateful feature");
-    GlobalLexicalModelUnlimited* glmu = NULL; // new GlobalLexicalModelUnlimited(inputFactors, outputFactors, biasFeature, ignorePunctuation, context);
-
-    m_globalLexicalModelsUnlimited.push_back(glmu);
-    if (restricted) {
-      cerr << "loading word translation word lists from " << filenameSource << " and " << filenameTarget << endl;
-      if (!glmu->Load(filenameSource, filenameTarget)) {
-        UserMessage::Add("Unable to load word lists for word translation feature from files " + filenameSource + " and " + filenameTarget);
-        return false;
-      }
-    }
-    if (weight.size() > i)
-      m_globalLexicalModelsUnlimited[i]->SetSparseProducerWeight(weight[i]);
-  }
-  return true;
-}
-
 bool StaticData::LoadLanguageModels()
 {
   if (m_parameter->GetParam("lmodel-file").size() > 0) {
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 86ae58543..f23845658 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -89,7 +89,6 @@ protected:
   LMList m_languageModel;
   ScoreComponentCollection m_allWeights;
   std::vector<LexicalReordering*> m_reorderModels;
-  std::vector<GlobalLexicalModelUnlimited*> m_globalLexicalModelsUnlimited;
 #ifdef HAVE_SYNLM
   SyntacticLanguageModel* m_syntacticLanguageModel;
 #endif
@@ -249,7 +248,6 @@ protected:
   //! load decoding steps
   bool LoadDecodeGraphs();
   bool LoadLexicalReorderingModel();
-  bool LoadGlobalLexicalModelUnlimited();
   //References used for scoring feature (eg BleuScoreFeature) for online training
   bool LoadReferences();
   bool LoadDiscrimLMFeature();