From 18a6d12cb0efc0168ccf95ca4584425f0f620134 Mon Sep 17 00:00:00 2001
From: XapaJIaMnu <nheart@gmail.com>
Date: Thu, 16 Oct 2014 11:53:49 +0100
Subject: [PATCH] Rework lookup and greatly speedup decoding (2x+)

---
 moses/LM/BilingualLM.h              |  2 -
 moses/LM/bilingual-lm/BiLM_NPLM.cpp | 88 +++++++++++------------------
 moses/LM/bilingual-lm/BiLM_NPLM.h   |  7 +--
 3 files changed, 37 insertions(+), 60 deletions(-)
diff --git a/moses/LM/BilingualLM.h b/moses/LM/BilingualLM.h
index 9f696e3b0..effd7342a 100644
--- a/moses/LM/BilingualLM.h
+++ b/moses/LM/BilingualLM.h
@@ -4,8 +4,6 @@
 #include "moses/FF/StatefulFeatureFunction.h"
 #include "moses/FF/FFState.h"
 #include <boost/thread/tss.hpp>
-#include <boost/thread/locks.hpp>
-#include <boost/thread/shared_mutex.hpp>
 #include "moses/Hypothesis.h"
 #include "moses/ChartHypothesis.h"
 #include "moses/InputPath.h"
diff --git a/moses/LM/bilingual-lm/BiLM_NPLM.cpp b/moses/LM/bilingual-lm/BiLM_NPLM.cpp
index 9eb3af2c1..213c40e37 100644
--- a/moses/LM/bilingual-lm/BiLM_NPLM.cpp
+++ b/moses/LM/bilingual-lm/BiLM_NPLM.cpp
@@ -1,5 +1,6 @@
 #include "BiLM_NPLM.h"
 #include "neuralLM.h"
+#include "vocabulary.h"
 
 namespace Moses {
 
@@ -23,70 +24,32 @@ float BilingualLM_NPLM::Score(std::vector<int>& source_words, std::vector<int>&
   return m_neuralLM->lookup_ngram(source_words);
 }
 
-int BilingualLM_NPLM::LookUpNeuralLMWord(const std::string& str) const {
-  return m_neuralLM->lookup_word(str);
-}
-
 const Word& BilingualLM_NPLM::getNullWord() const {
   return NULL_word;
 }
 
-//Cache for NeuralLMids
-int BilingualLM_NPLM::getNeuralLMId(
-    const Word& word, bool is_source_word) const{
+int BilingualLM_NPLM::getNeuralLMId(const Word& word, bool is_source_word) const {
   initSharedPointer();
 
+  boost::unordered_map<const Factor*, int>::iterator it;
   const Factor* factor = word.GetFactor(word_factortype);
 
-  std::map<const Factor *, int>::iterator it;
-
-  boost::upgrade_lock< boost::shared_mutex > read_lock(neuralLMids_lock);
   it = neuralLMids.find(factor);
-
-  if (it != neuralLMids.end()) {
-    if (!factored){
-      return it->second; //Lock is released here automatically
-    } else {
-      //See if word is unknown
-      if (it->second == unknown_word_id){
-        const Factor* pos_factor = word.GetFactor(pos_factortype); //Get POS tag
-        //Look up the POS tag in the cache
-        it = neuralLMids.find(pos_factor);
-        if (it != neuralLMids.end()){
-          return it->second; //We have our pos tag in the cache.
-        } else {
-          //We have to lookup the pos_tag
-          const std::string posstring = pos_factor->GetString().as_string();
-          int neuralLM_wordID = LookUpNeuralLMWord(posstring);
-
-          boost::upgrade_to_unique_lock< boost::shared_mutex > uniqueLock(read_lock);
-          neuralLMids.insert(std::pair<const Factor *, int>(pos_factor, neuralLM_wordID));
-
-          return neuralLM_wordID; //We return the ID of the pos TAG
-        }
-      } else {
-        return it->second; //We return the neuralLMid of the word
-      }
-    }
+  //If we know the word return immediately
+  if (it != neuralLMids.end()){
+    return it->second;
+  }
+  //If we don't know the word and we aren't factored, return the unknown word id.
+  if (!factored) {
+      return unknown_word_id;
+  } 
+  //Else try to get a pos_factor
+  const Factor* pos_factor = word.GetFactor(pos_factortype);
+  it = neuralLMids.find(pos_factor);
+  if (it != neuralLMids.end()){
+    return it->second;
   } else {
-    //We have to lookup the word
-    const std::string string = factor->GetString().as_string();
-    int neuralLM_wordID = LookUpNeuralLMWord(string);
-
-    boost::upgrade_to_unique_lock< boost::shared_mutex > uniqueLock(read_lock);
-    neuralLMids.insert(std::pair<const Factor *, int>(factor, neuralLM_wordID));
-
-    if (!factored) {
-      return neuralLM_wordID; //Lock is released here
-    } else {
-      if (neuralLM_wordID == unknown_word_id){
-        const Factor* pos_factor = word.GetFactor(pos_factortype);
-        const std::string factorstring = pos_factor->GetString().as_string();
-        neuralLM_wordID = LookUpNeuralLMWord(string);
-        neuralLMids.insert(std::pair<const Factor *, int>(pos_factor, neuralLM_wordID));
-      }
-      return neuralLM_wordID; //If a POS tag is needed, neuralLM_wordID is going to be updated.
-    }
+    return unknown_word_id;
   }
 }
 
@@ -128,6 +91,23 @@ void BilingualLM_NPLM::loadModel() {
 
   m_neuralLM_shared->set_cache(neuralLM_cache); //Default 1000000
   unknown_word_id = m_neuralLM_shared->lookup_word("<unk>");
+
+  //Set up the factor -> NeuralLMId cache from the model vocabulary
+  FactorCollection& factorFactory = FactorCollection::Instance(); //To do the conversion from vocabulary string to Factor
+
+  const nplm::vocabulary& vocab = m_neuralLM_shared->get_vocabulary();
+  const boost::unordered_map<std::string, int>& neuraLMvocabmap = vocab.get_idmap();
+
+  boost::unordered_map<std::string, int>::const_iterator it;
+
+  for (it = neuraLMvocabmap.cbegin(); it != neuraLMvocabmap.cend(); it++) {
+    std::string raw_word = it->first;
+    int neuralLMid = it->second;
+    const Factor * factor = factorFactory.AddFactor(raw_word);
+
+    neuralLMids.insert(std::make_pair(factor, neuralLMid));
+  }
+
 }
 
 } // namespace Moses
diff --git a/moses/LM/bilingual-lm/BiLM_NPLM.h b/moses/LM/bilingual-lm/BiLM_NPLM.h
index 78555efe0..19bdf10e6 100644
--- a/moses/LM/bilingual-lm/BiLM_NPLM.h
+++ b/moses/LM/bilingual-lm/BiLM_NPLM.h
@@ -1,4 +1,6 @@
 #include "moses/LM/BilingualLM.h"
+#include <boost/unordered_map.hpp>
+#include <utility> //make_pair
 
 namespace nplm {
   class neuralLM;
@@ -15,8 +17,6 @@ class BilingualLM_NPLM : public BilingualLM {
 
   int getNeuralLMId(const Word& word, bool is_source_word) const;
 
-  int LookUpNeuralLMWord(const std::string& str) const;
-
   void initSharedPointer() const;
 
   void loadModel();
@@ -28,8 +28,7 @@ class BilingualLM_NPLM : public BilingualLM {
   nplm::neuralLM *m_neuralLM_shared;
   mutable boost::thread_specific_ptr<nplm::neuralLM> m_neuralLM;
 
-  mutable std::map<const Factor*, int> neuralLMids;
-  mutable boost::shared_mutex neuralLMids_lock;
+  mutable boost::unordered_map<const Factor*, int> neuralLMids;
 
   //const Factor* NULL_factor_overwrite;
   std::string NULL_string;