From 33b3fc67e4f33265eab8b10d641bb8673b8e1d71 Mon Sep 17 00:00:00 2001
From: hieuhoang1972
Date: Mon, 7 Aug 2006 18:21:05 +0000
Subject: [PATCH] factored lm

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@533 1f5c12ca-751b-0410-a591-d2e778427230
---
 moses/src/Hypothesis.cpp                | 16 ++++-----
 moses/src/LanguageModel.cpp             | 46 +++++++++++++++++++++++++
 moses/src/LanguageModel.h               |  9 +++--
 moses/src/LanguageModelMultiFactor.h    |  5 +--
 moses/src/LanguageModelSingleFactor.cpp | 46 ++-----------------------
 moses/src/LanguageModelSingleFactor.h   | 20 +++++------
 moses/src/LanguageModel_Chunking.h      | 45 ++----------------------
 moses/src/LanguageModel_IRST.cpp        |  8 +++--
 moses/src/LanguageModel_IRST.h          |  2 +-
 moses/src/LanguageModel_SRI.cpp         | 10 ++++--
 moses/src/LanguageModel_SRI.h           |  2 +-
 11 files changed, 91 insertions(+), 118 deletions(-)

diff --git a/moses/src/Hypothesis.cpp b/moses/src/Hypothesis.cpp
index a5437b791..06d66eb2c 100755
--- a/moses/src/Hypothesis.cpp
+++ b/moses/src/Hypothesis.cpp
@@ -265,14 +265,14 @@ void Hypothesis::CalcLMScore(const LMList &languageModels)
 			lmScore = 0; //the score associated with dropping source words is not part of the language model
 		}
 		else
 		{ //non-empty target phrase
 			// 1st n-gram
-			vector<const Factor*> contextFactor(nGramOrder);
+			vector<const FactorArray*> contextFactor(nGramOrder);
 			size_t index = 0;
 			for (int currPos = (int) startPos - (int) nGramOrder + 1 ; currPos <= (int) startPos ; currPos++)
 			{
 				if (currPos >= 0)
-					contextFactor[index++] = GetFactor(currPos, factorType);
+					contextFactor[index++] = &GetFactorArray(currPos);
 				else
-					contextFactor[index++] = languageModel.GetSentenceStart();
+					contextFactor[index++] = &languageModel.GetSentenceStartArray();
 			}
 			lmScore = languageModel.GetValue(contextFactor);
 			//cout<<"context factor: "<<
diff --git a/moses/src/LanguageModel.cpp b/moses/src/LanguageModel.cpp
--- a/moses/src/LanguageModel.cpp
+++ b/moses/src/LanguageModel.cpp
@@ ... @@
+void LanguageModel::CalcScore(const Phrase &phrase
+										, float &fullScore
+										, float &ngramScore) const
+{
+	fullScore	= 0;
+	ngramScore	= 0;
+
+	size_t phraseSize = phrase.GetSize();
+	vector<const FactorArray*> contextFactor;
+	contextFactor.reserve(m_nGramOrder);
+
+	// start of sentence
+	for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
+	{
+		contextFactor.push_back(&phrase.GetFactorArray(currPos));
+		fullScore += GetValue(contextFactor);
+	}
+
+	if (phraseSize >= m_nGramOrder)
+	{
+		contextFactor.push_back(&phrase.GetFactorArray(m_nGramOrder - 1));
+		ngramScore = GetValue(contextFactor);
+	}
+
+	// main loop
+	for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
+	{ // used by hypo to speed up lm score calc
+		for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
+		{
+			contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
+		}
+		contextFactor[m_nGramOrder - 1] = &phrase.GetFactorArray(currPos);
+		float partScore = GetValue(contextFactor);
+		ngramScore += partScore;
+	}
+	fullScore += ngramScore;
+}
+
+LanguageModel::State LanguageModel::GetState(const std::vector<const FactorArray*> &contextFactor) const
+{
+	State state;
+	GetValue(contextFactor,&state);
+	return state;
+}
+
diff --git a/moses/src/LanguageModel.h b/moses/src/LanguageModel.h
index ae93230f9..c407c32ec 100755
--- a/moses/src/LanguageModel.h
+++ b/moses/src/LanguageModel.h
@@ -39,6 +39,8 @@ protected:
 	std::string	m_filename;
 	size_t		m_nGramOrder;
 public:
+	typedef const void* State;
+
 	LanguageModel();
 	virtual ~LanguageModel();
 	virtual void Load(const std::string &fileName
@@ -50,9 +52,12 @@ public:
 	// see ScoreProducer.h
 	unsigned int GetNumScoreComponents() const;
 
-	virtual void CalcScore(const Phrase &phrase
+	void CalcScore(const Phrase &phrase
 							, float &fullScore
-							, float &ngramScore) const = 0;
+							, float &ngramScore) const;
+	virtual float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const = 0;
+	State GetState(const std::vector<const FactorArray*> &contextFactor) const;
+
 	size_t GetNGramOrder() const
 	{
 		return m_nGramOrder;
diff --git a/moses/src/LanguageModelMultiFactor.h b/moses/src/LanguageModelMultiFactor.h
index c12ac6468..ffb26cd9d 100644
--- a/moses/src/LanguageModelMultiFactor.h
+++ b/moses/src/LanguageModelMultiFactor.h
@@ -30,8 +30,5 @@ protected:
 	FactorType	m_factorType; // tempoary
 
 public:
-	const std::string GetScoreProducerDescription() const;
-
-	virtual float GetValue(const std::vector<const FactorArray*> &contextFactor0) const = 0;
-
+	const std::string GetScoreProducerDescription() const;
 };
diff --git a/moses/src/LanguageModelSingleFactor.cpp b/moses/src/LanguageModelSingleFactor.cpp
index 7b1e16818..874cc9dbc 100644
--- a/moses/src/LanguageModelSingleFactor.cpp
+++ b/moses/src/LanguageModelSingleFactor.cpp
@@ -40,6 +40,8 @@ LanguageModelSingleFactor::State LanguageModelSingleFactor::UnknownState=0;
 
 LanguageModelSingleFactor::LanguageModelSingleFactor()
 {
+	Word::Initialize(m_sentenceStartArray);
+	Word::Initialize(m_sentenceEndArray);
 }
 LanguageModelSingleFactor::~LanguageModelSingleFactor() {}
 
@@ -52,49 +54,5 @@ const std::string LanguageModelSingleFactor::GetScoreProducerDescription() const
 	return oss.str();
 }
 
-void LanguageModelSingleFactor::CalcScore(const Phrase &phrase
-										, float &fullScore
-										, float &ngramScore) const
-{
-	fullScore	= 0;
-	ngramScore	= 0;
-	FactorType factorType = GetFactorType();
-	size_t phraseSize = phrase.GetSize();
-	vector<const Factor*> contextFactor;
-	contextFactor.reserve(m_nGramOrder);
-
-	// start of sentence
-	for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
-	{
-		contextFactor.push_back(phrase.GetFactor(currPos, factorType));
-		fullScore += GetValue(contextFactor);
-	}
-
-	if (phraseSize >= m_nGramOrder)
-	{
-		contextFactor.push_back(phrase.GetFactor(m_nGramOrder - 1, factorType));
-		ngramScore = GetValue(contextFactor);
-	}
-
-	// main loop
-	for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
-	{ // used by hypo to speed up lm score calc
-		for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
-		{
-			contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
-		}
-		contextFactor[m_nGramOrder - 1] = phrase.GetFactor(currPos, factorType);
-		float partScore = GetValue(contextFactor);
-		ngramScore += partScore;
-	}
-	fullScore += ngramScore;
-}
-
-LanguageModelSingleFactor::State LanguageModelSingleFactor::GetState(const std::vector<const Factor*> &contextFactor) const
-{
-	State state;
-	GetValue(contextFactor,&state);
-	return state;
-}
diff --git a/moses/src/LanguageModelSingleFactor.h b/moses/src/LanguageModelSingleFactor.h
index 43de82922..696dbd1ee 100644
--- a/moses/src/LanguageModelSingleFactor.h
+++ b/moses/src/LanguageModelSingleFactor.h
@@ -31,9 +31,9 @@ class LanguageModelSingleFactor : public LanguageModel
 {
 protected:
 	const Factor *m_sentenceStart, *m_sentenceEnd;
+	FactorArray m_sentenceStartArray, m_sentenceEndArray;
 	FactorType	m_factorType;
 public:
-	typedef const void* State;
 	static State UnknownState;
 
 	LanguageModelSingleFactor();
@@ -52,9 +52,14 @@ public:
 	{
 		return m_sentenceEnd;
 	}
-	virtual void CalcScore(const Phrase &phrase
-							, float &fullScore
-							, float &ngramScore) const;
+	const FactorArray &GetSentenceStartArray() const
+	{
+		return m_sentenceStartArray;
+	}
+	const FactorArray &GetSentenceEndArray() const
+	{
+		return m_sentenceEndArray;
+	}
 	FactorType GetFactorType() const
 	{
 		return m_factorType;
@@ -68,12 +73,5 @@ public:
 		m_weight = weight;
 	}
 	const std::string GetScoreProducerDescription() const;
-	virtual float GetValue(const std::vector<const Factor*> &contextFactor, State* finalState = 0) const = 0;
-
-	LanguageModelSingleFactor::State GetState(const std::vector<const Factor*> &contextFactor) const;
-
-	// one of the following should probably be made available
-	// virtual LmId GetLmID( const Factor *factor ) const = 0;
-	// virtual LmId GetLmID( const std::string &factor ) const = 0;
 };
diff --git a/moses/src/LanguageModel_Chunking.h b/moses/src/LanguageModel_Chunking.h
index c8fbacda5..919321db3 100644
--- a/moses/src/LanguageModel_Chunking.h
+++ b/moses/src/LanguageModel_Chunking.h
@@ -46,45 +46,7 @@ public:
 		m_realNGramOrder = 3; // fixed for now
 	}
 
-	void CalcScore(const Phrase &phrase
-					, float &fullScore
-					, float &ngramScore) const
-	{
-		fullScore	= 0;
-		ngramScore	= 0;
-
-		size_t phraseSize = phrase.GetSize();
-		std::vector<const FactorArray*> contextFactor;
-		contextFactor.reserve(m_nGramOrder);
-
-		// start of sentence
-		for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
-		{
-			contextFactor.push_back(&phrase.GetFactorArray(currPos));
-			fullScore += GetValue(contextFactor);
-		}
-
-		if (phraseSize >= m_nGramOrder)
-		{
-			contextFactor.push_back(&phrase.GetFactorArray(m_nGramOrder - 1));
-			ngramScore = GetValue(contextFactor);
-		}
-
-		// main loop
-		for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
-		{ // used by hypo to speed up lm score calc
-			for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
-			{
-				contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
-			}
-			contextFactor[m_nGramOrder - 1] = &phrase.GetFactorArray(currPos);
-			float partScore = GetValue(contextFactor);
-			ngramScore += partScore;
-		}
-		fullScore += ngramScore;
-	}
-
-	float GetValue(const std::vector<const FactorArray*> &contextFactor) const
+	float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const
 	{
 		if (contextFactor.size() == 0)
 		{
@@ -98,13 +60,13 @@ public:
 		// create vector of just B-factors, in reverse order
 		size_t currOrder = 0;
-		std::vector<const Factor*> chunkContext;
+		std::vector<const FactorArray*> chunkContext;
 		for (int currPos = (int)contextFactor.size() - 1 ; currPos >= 0 ; --currPos )
 		{
 			const Factor *factor = *contextFactor[currPos][m_factorType];
 			if (factor->GetString().substr(0, 2) != "I-")
 			{
-				chunkContext.push_back(factor);
+				chunkContext.push_back(contextFactor[currPos]);
 				if (++currOrder >= m_realNGramOrder)
 					break;
 			}
 		}
 
 		// create context factor the right way round
 		std::reverse(chunkContext.begin(), chunkContext.end());
 		// calc score on that phrase
-		LanguageModelSingleFactor::State *finalState; // what shall we do with this ???
 		return m_lmImpl.GetValue(chunkContext, finalState);
 	}
diff --git a/moses/src/LanguageModel_IRST.cpp b/moses/src/LanguageModel_IRST.cpp
index 80a643ac3..69082d2ca 100755
--- a/moses/src/LanguageModel_IRST.cpp
+++ b/moses/src/LanguageModel_IRST.cpp
@@ -90,11 +90,13 @@ void LanguageModel_IRST::CreateFactors(FactorCollection &factorCollection)
 	factorId = m_sentenceStart->GetId();
 	lmIdMap[factorId] = GetLmID(BOS_);
 	maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
+	m_sentenceStartArray[m_factorType] = m_sentenceStart;
 
 	m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
 	factorId = m_sentenceEnd->GetId();
 	lmIdMap[factorId] = GetLmID(EOS_);;
 	maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
+	m_sentenceEndArray[m_factorType] = m_sentenceEnd;
 
 	// add to lookup vector in object
 	m_lmIdLookup.resize(maxFactorId+1);
@@ -119,8 +121,10 @@ int LanguageModel_IRST::GetLmID( const Factor *factor ) const
 	return ( factorId >= m_lmIdLookup.size()) ? m_unknownId : m_lmIdLookup[factorId];
 }
 
-float LanguageModel_IRST::GetValue(const vector<const Factor*> &contextFactor, State* finalState) const
+float LanguageModel_IRST::GetValue(const vector<const FactorArray*> &contextFactor, State* finalState) const
 {
+	FactorType factorType = GetFactorType();
+
 	// set up context
 	size_t count = contextFactor.size();
 	ngram ng(m_lmtb->dict);
@@ -130,7 +134,7 @@ float LanguageModel_IRST::GetValue(const vector<const Factor*> &contextFactor, S
 #ifdef CDYER_DEBUG_LMSCORE
 		std::cout << i <<"="<<contextFactor[i]->GetLmId().irst <<"," << contextFactor[i]->GetString()<<" ";
 #endif
-		int lmId = GetLmID(contextFactor[i]);
+		int lmId = GetLmID((*contextFactor[i])[factorType]);
 		ng.pushc(lmId);
 	}
 #ifdef CDYER_DEBUG_LMSCORE
diff --git a/moses/src/LanguageModel_IRST.h b/moses/src/LanguageModel_IRST.h
index f9debd336..a51ff798e 100755
--- a/moses/src/LanguageModel_IRST.h
+++ b/moses/src/LanguageModel_IRST.h
@@ -56,7 +56,7 @@ public:
 				, float weight
 				, size_t nGramOrder);
 
-	virtual float GetValue(const std::vector<const Factor*> &contextFactor, State* finalState = 0) const;
+	virtual float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const;
 
 };
diff --git a/moses/src/LanguageModel_SRI.cpp b/moses/src/LanguageModel_SRI.cpp
index b3701f6ed..7164ed435 100755
--- a/moses/src/LanguageModel_SRI.cpp
+++ b/moses/src/LanguageModel_SRI.cpp
@@ -97,11 +97,13 @@ void LanguageModel_SRI::CreateFactors(FactorCollection &factorCollection)
 	factorId = m_sentenceStart->GetId();
 	lmIdMap[factorId] = GetLmID(BOS_);
 	maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
+	m_sentenceStartArray[m_factorType] = m_sentenceStart;
 
 	m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
 	factorId = m_sentenceEnd->GetId();
 	lmIdMap[factorId] = GetLmID(EOS_);
 	maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
+	m_sentenceEndArray[m_factorType] = m_sentenceEnd;
 
 	// add to lookup vector in object
 	m_lmIdLookup.resize(maxFactorId+1);
@@ -131,8 +133,9 @@ float LanguageModel_SRI::GetValue(VocabIndex wordId, VocabIndex *context) const
 	return FloorSRIScore(TransformSRIScore(p));  // log10->log
 }
 
-float LanguageModel_SRI::GetValue(const vector<const Factor*> &contextFactor, State* finalState) const
+float LanguageModel_SRI::GetValue(const vector<const FactorArray*> &contextFactor, State* finalState) const
 {
+	FactorType factorType = GetFactorType();
 	size_t count = contextFactor.size();
 	if (count <= 0)
 	{
@@ -144,12 +147,13 @@ float LanguageModel_SRI::GetValue(const vector<const Factor*> &contextFactor, St
 	VocabIndex context[MAX_NGRAM_SIZE];
 	for (size_t i = 0 ; i < count - 1 ; i++)
 	{
-		context[i] = GetLmID(contextFactor[count-2-i]);
+		context[i] = GetLmID((*contextFactor[count-2-i])[factorType]);
 	}
 	context[count-1] = Vocab_None;
+	assert((*contextFactor[count-1])[factorType] != NULL);
 
 	// call sri lm fn
-	VocabIndex lmId= GetLmID(contextFactor[count-1]);
+	VocabIndex lmId= GetLmID((*contextFactor[count-1])[factorType]);
 	float ret = GetValue(lmId, context);
 	if (finalState)
 	{
diff --git a/moses/src/LanguageModel_SRI.h b/moses/src/LanguageModel_SRI.h
index 526f24c56..452add896 100755
--- a/moses/src/LanguageModel_SRI.h
+++ b/moses/src/LanguageModel_SRI.h
@@ -56,6 +56,6 @@ public:
 				, float weight
 				, size_t nGramOrder);
 
-	virtual float GetValue(const std::vector<const Factor*> &contextFactor, State* finalState = 0) const;
+	virtual float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const;
 };
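
Note on the interface this patch introduces: after the change, every language model scores n-grams over a std::vector<const FactorArray*> (one factor array per target word) instead of a std::vector<const Factor*>, and the sliding-window scoring formerly duplicated in LanguageModelSingleFactor and LanguageModel_Chunking now lives in LanguageModel::CalcScore. The following is a minimal standalone sketch of that sliding-window loop for readers outside the Moses codebase; FakeFactorArray and the dummy GetValue are illustrative stand-ins, not Moses types.

// Editor's illustrative sketch only -- FakeFactorArray and GetValue are stand-ins, not Moses code.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

typedef std::vector<std::string> FakeFactorArray; // stand-in for Moses' FactorArray (all factors of one word)

// stand-in scorer: pretend log-probability of the last word given the preceding context
float GetValue(const std::vector<const FakeFactorArray*> &contextFactor)
{
	return -1.0f * static_cast<float>(contextFactor.size());
}

int main()
{
	const size_t nGramOrder = 3;

	// one FakeFactorArray per target word of the phrase being scored
	std::vector<FakeFactorArray> phrase;
	const char *words[] = { "das", "ist", "ein", "haus" };
	for (size_t i = 0; i < 4; ++i)
		phrase.push_back(FakeFactorArray(1, words[i]));

	float fullScore = 0, ngramScore = 0;
	std::vector<const FakeFactorArray*> contextFactor;
	contextFactor.reserve(nGramOrder);

	// incomplete n-grams at the start of the phrase (counted only in fullScore)
	for (size_t currPos = 0; currPos < nGramOrder - 1 && currPos < phrase.size(); ++currPos)
	{
		contextFactor.push_back(&phrase[currPos]);
		fullScore += GetValue(contextFactor);
	}
	// first complete n-gram
	if (phrase.size() >= nGramOrder)
	{
		contextFactor.push_back(&phrase[nGramOrder - 1]);
		ngramScore = GetValue(contextFactor);
	}
	// slide the n-gram window over the rest of the phrase
	for (size_t currPos = nGramOrder; currPos < phrase.size(); ++currPos)
	{
		for (size_t i = 0; i < nGramOrder - 1; ++i)
			contextFactor[i] = contextFactor[i + 1];
		contextFactor[nGramOrder - 1] = &phrase[currPos];
		ngramScore += GetValue(contextFactor);
	}
	fullScore += ngramScore;

	std::cout << "fullScore=" << fullScore << " ngramScore=" << ngramScore << std::endl;
	return 0;
}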