Rework lookup and greatly speed up decoding (2x+)

2024-12-26 05:14:36 +03:00 · 2014-10-16 11:53:49 +01:00 · 2014-10-16 11:53:49 +01:00 · 18a6d12cb0
commit 18a6d12cb0
parent cf3fe60cf6
3 changed files with 37 additions and 60 deletions
--- a/moses/LM/BilingualLM.h
+++ b/moses/LM/BilingualLM.h
@ -4,8 +4,6 @@
 #include "moses/FF/StatefulFeatureFunction.h"
 #include "moses/FF/FFState.h"
 #include <boost/thread/tss.hpp>
-#include <boost/thread/locks.hpp>
-#include <boost/thread/shared_mutex.hpp>
 #include "moses/Hypothesis.h"
 #include "moses/ChartHypothesis.h"
 #include "moses/InputPath.h"
--- a/moses/LM/bilingual-lm/BiLM_NPLM.cpp
+++ b/moses/LM/bilingual-lm/BiLM_NPLM.cpp
@ -1,5 +1,6 @@
 #include "BiLM_NPLM.h"
 #include "neuralLM.h"
+#include "vocabulary.h"

 namespace Moses {

@ -23,70 +24,32 @@ float BilingualLM_NPLM::Score(std::vector<int>& source_words, std::vector<int>&
  return m_neuralLM->lookup_ngram(source_words);
 }

-int BilingualLM_NPLM::LookUpNeuralLMWord(const std::string& str) const {
-  return m_neuralLM->lookup_word(str);
-}
-
 const Word& BilingualLM_NPLM::getNullWord() const {
  return NULL_word;
 }

-//Cache for NeuralLMids
-int BilingualLM_NPLM::getNeuralLMId(
-    const Word& word, bool is_source_word) const{
+int BilingualLM_NPLM::getNeuralLMId(const Word& word, bool is_source_word) const {
  initSharedPointer();

+  boost::unordered_map<const Factor*, int>::iterator it;
  const Factor* factor = word.GetFactor(word_factortype);

-  std::map<const Factor *, int>::iterator it;
-
-  boost::upgrade_lock< boost::shared_mutex > read_lock(neuralLMids_lock);
  it = neuralLMids.find(factor);
-
-  if (it != neuralLMids.end()) {
-    if (!factored){
-      return it->second; //Lock is released here automatically
-    } else {
-      //See if word is unknown
-      if (it->second == unknown_word_id){
-        const Factor* pos_factor = word.GetFactor(pos_factortype); //Get POS tag
-        //Look up the POS tag in the cache
-        it = neuralLMids.find(pos_factor);
-        if (it != neuralLMids.end()){
-          return it->second; //We have our pos tag in the cache.
-        } else {
-          //We have to lookup the pos_tag
-          const std::string posstring = pos_factor->GetString().as_string();
-          int neuralLM_wordID = LookUpNeuralLMWord(posstring);
-
-          boost::upgrade_to_unique_lock< boost::shared_mutex > uniqueLock(read_lock);
-          neuralLMids.insert(std::pair<const Factor *, int>(pos_factor, neuralLM_wordID));
-
-          return neuralLM_wordID; //We return the ID of the pos TAG
-        }
-      } else {
-        return it->second; //We return the neuralLMid of the word
-      }
-    }
+  //If we know the word return immediately
+  if (it != neuralLMids.end()){
+    return it->second;
+  }
+  //If we don't know the word and we aren't factored, return the unknown word id.
+  if (!factored) {
+      return unknown_word_id;
+  } 
+  //Else try to get a pos_factor
+  const Factor* pos_factor = word.GetFactor(pos_factortype);
+  it = neuralLMids.find(pos_factor);
+  if (it != neuralLMids.end()){
+    return it->second;
  } else {
-    //We have to lookup the word
-    const std::string string = factor->GetString().as_string();
-    int neuralLM_wordID = LookUpNeuralLMWord(string);
-
-    boost::upgrade_to_unique_lock< boost::shared_mutex > uniqueLock(read_lock);
-    neuralLMids.insert(std::pair<const Factor *, int>(factor, neuralLM_wordID));
-
-    if (!factored) {
-      return neuralLM_wordID; //Lock is released here
-    } else {
-      if (neuralLM_wordID == unknown_word_id){
-        const Factor* pos_factor = word.GetFactor(pos_factortype);
-        const std::string factorstring = pos_factor->GetString().as_string();
-        neuralLM_wordID = LookUpNeuralLMWord(string);
-        neuralLMids.insert(std::pair<const Factor *, int>(pos_factor, neuralLM_wordID));
-      }
-      return neuralLM_wordID; //If a POS tag is needed, neuralLM_wordID is going to be updated.
-    }
+    return unknown_word_id;
  }
 }

@ -128,6 +91,23 @@ void BilingualLM_NPLM::loadModel() {

  m_neuralLM_shared->set_cache(neuralLM_cache); //Default 1000000
  unknown_word_id = m_neuralLM_shared->lookup_word("<unk>");
+
+  //Set up the factor -> NeuralLMId cache
+  FactorCollection& factorFactory = FactorCollection::Instance(); //To do the conversion from string to vocabID
+
+  const nplm::vocabulary& vocab = m_neuralLM_shared->get_vocabulary();
+  const boost::unordered_map<std::string, int>& neuraLMvocabmap = vocab.get_idmap();
+
+  boost::unordered_map<std::string, int>::const_iterator it;
+
+  for (it = neuraLMvocabmap.cbegin(); it != neuraLMvocabmap.cend(); it++) {
+    std::string raw_word = it->first;
+    int neuralLMid = it->second;
+    const Factor * factor = factorFactory.AddFactor(raw_word);
+
+    neuralLMids.insert(std::make_pair(factor, neuralLMid));
+  }
+
 }

 } // namespace Moses
--- a/moses/LM/bilingual-lm/BiLM_NPLM.h
+++ b/moses/LM/bilingual-lm/BiLM_NPLM.h
@ -1,4 +1,6 @@
 #include "moses/LM/BilingualLM.h"
+#include <boost/unordered_map.hpp>
+#include <utility> //make_pair

 namespace nplm {
  class neuralLM;
@ -15,8 +17,6 @@ class BilingualLM_NPLM : public BilingualLM {

  int getNeuralLMId(const Word& word, bool is_source_word) const;

-  int LookUpNeuralLMWord(const std::string& str) const;
-
  void initSharedPointer() const;

  void loadModel();
@ -28,8 +28,7 @@ class BilingualLM_NPLM : public BilingualLM {
  nplm::neuralLM *m_neuralLM_shared;
  mutable boost::thread_specific_ptr<nplm::neuralLM> m_neuralLM;

-  mutable std::map<const Factor*, int> neuralLMids;
-  mutable boost::shared_mutex neuralLMids_lock;
+  mutable boost::unordered_map<const Factor*, int> neuralLMids;

  //const Factor* NULL_factor_overwrite;
  std::string NULL_string;