From 18a6d12cb0efc0168ccf95ca4584425f0f620134 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Thu, 16 Oct 2014 11:53:49 +0100 Subject: [PATCH] Rework lookup and greatly speedup decoding (2x+) --- moses/LM/BilingualLM.h | 2 - moses/LM/bilingual-lm/BiLM_NPLM.cpp | 88 +++++++++++------------------ moses/LM/bilingual-lm/BiLM_NPLM.h | 7 +-- 3 files changed, 37 insertions(+), 60 deletions(-) diff --git a/moses/LM/BilingualLM.h b/moses/LM/BilingualLM.h index 9f696e3b0..effd7342a 100644 --- a/moses/LM/BilingualLM.h +++ b/moses/LM/BilingualLM.h @@ -4,8 +4,6 @@ #include "moses/FF/StatefulFeatureFunction.h" #include "moses/FF/FFState.h" #include -#include -#include #include "moses/Hypothesis.h" #include "moses/ChartHypothesis.h" #include "moses/InputPath.h" diff --git a/moses/LM/bilingual-lm/BiLM_NPLM.cpp b/moses/LM/bilingual-lm/BiLM_NPLM.cpp index 9eb3af2c1..213c40e37 100644 --- a/moses/LM/bilingual-lm/BiLM_NPLM.cpp +++ b/moses/LM/bilingual-lm/BiLM_NPLM.cpp @@ -1,5 +1,6 @@ #include "BiLM_NPLM.h" #include "neuralLM.h" +#include "vocabulary.h" namespace Moses { @@ -23,70 +24,32 @@ float BilingualLM_NPLM::Score(std::vector& source_words, std::vector& return m_neuralLM->lookup_ngram(source_words); } -int BilingualLM_NPLM::LookUpNeuralLMWord(const std::string& str) const { - return m_neuralLM->lookup_word(str); -} - const Word& BilingualLM_NPLM::getNullWord() const { return NULL_word; } -//Cache for NeuralLMids -int BilingualLM_NPLM::getNeuralLMId( - const Word& word, bool is_source_word) const{ +int BilingualLM_NPLM::getNeuralLMId(const Word& word, bool is_source_word) const { initSharedPointer(); + boost::unordered_map::iterator it; const Factor* factor = word.GetFactor(word_factortype); - std::map::iterator it; - - boost::upgrade_lock< boost::shared_mutex > read_lock(neuralLMids_lock); it = neuralLMids.find(factor); - - if (it != neuralLMids.end()) { - if (!factored){ - return it->second; //Lock is released here automatically - } else { - //See if word is unknown - if (it->second == unknown_word_id){ - const Factor* pos_factor = word.GetFactor(pos_factortype); //Get POS tag - //Look up the POS tag in the cache - it = neuralLMids.find(pos_factor); - if (it != neuralLMids.end()){ - return it->second; //We have our pos tag in the cache. - } else { - //We have to lookup the pos_tag - const std::string posstring = pos_factor->GetString().as_string(); - int neuralLM_wordID = LookUpNeuralLMWord(posstring); - - boost::upgrade_to_unique_lock< boost::shared_mutex > uniqueLock(read_lock); - neuralLMids.insert(std::pair(pos_factor, neuralLM_wordID)); - - return neuralLM_wordID; //We return the ID of the pos TAG - } - } else { - return it->second; //We return the neuralLMid of the word - } - } + //If we know the word return immediately + if (it != neuralLMids.end()){ + return it->second; + } + //If we don't know the word and we aren't factored, return the word. + if (!factored) { + return unknown_word_id; + } + //Else try to get a pos_factor + const Factor* pos_factor = word.GetFactor(pos_factortype); + it = neuralLMids.find(pos_factor); + if (it != neuralLMids.end()){ + return it->second; } else { - //We have to lookup the word - const std::string string = factor->GetString().as_string(); - int neuralLM_wordID = LookUpNeuralLMWord(string); - - boost::upgrade_to_unique_lock< boost::shared_mutex > uniqueLock(read_lock); - neuralLMids.insert(std::pair(factor, neuralLM_wordID)); - - if (!factored) { - return neuralLM_wordID; //Lock is released here - } else { - if (neuralLM_wordID == unknown_word_id){ - const Factor* pos_factor = word.GetFactor(pos_factortype); - const std::string factorstring = pos_factor->GetString().as_string(); - neuralLM_wordID = LookUpNeuralLMWord(string); - neuralLMids.insert(std::pair(pos_factor, neuralLM_wordID)); - } - return neuralLM_wordID; //If a POS tag is needed, neuralLM_wordID is going to be updated. - } + return unknown_word_id; } } @@ -128,6 +91,23 @@ void BilingualLM_NPLM::loadModel() { m_neuralLM_shared->set_cache(neuralLM_cache); //Default 1000000 unknown_word_id = m_neuralLM_shared->lookup_word(""); + + //Setup factor -> NeuralLMId cache + FactorCollection& factorFactory = FactorCollection::Instance(); //To do the conversion from string to vocabID + + const nplm::vocabulary& vocab = m_neuralLM_shared->get_vocabulary(); + const boost::unordered_map& neuraLMvocabmap = vocab.get_idmap(); + + boost::unordered_map::const_iterator it; + + for (it = neuraLMvocabmap.cbegin(); it != neuraLMvocabmap.cend(); it++) { + std::string raw_word = it->first; + int neuralLMid = it->second; + const Factor * factor = factorFactory.AddFactor(raw_word); + + neuralLMids.insert(std::make_pair(factor, neuralLMid)); + } + } } // namespace Moses diff --git a/moses/LM/bilingual-lm/BiLM_NPLM.h b/moses/LM/bilingual-lm/BiLM_NPLM.h index 78555efe0..19bdf10e6 100644 --- a/moses/LM/bilingual-lm/BiLM_NPLM.h +++ b/moses/LM/bilingual-lm/BiLM_NPLM.h @@ -1,4 +1,6 @@ #include "moses/LM/BilingualLM.h" +#include +#include //make_pair namespace nplm { class neuralLM; @@ -15,8 +17,6 @@ class BilingualLM_NPLM : public BilingualLM { int getNeuralLMId(const Word& word, bool is_source_word) const; - int LookUpNeuralLMWord(const std::string& str) const; - void initSharedPointer() const; void loadModel(); @@ -28,8 +28,7 @@ class BilingualLM_NPLM : public BilingualLM { nplm::neuralLM *m_neuralLM_shared; mutable boost::thread_specific_ptr m_neuralLM; - mutable std::map neuralLMids; - mutable boost::shared_mutex neuralLMids_lock; + mutable boost::unordered_map neuralLMids; //const Factor* NULL_factor_overwrite; std::string NULL_string;