factored lm

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@533 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
hieuhoang1972 2006-08-07 18:21:05 +00:00
parent 4a88275261
commit 33b3fc67e4
11 changed files with 91 additions and 118 deletions

View File

@ -265,14 +265,14 @@ void Hypothesis::CalcLMScore(const LMList &languageModels)
lmScore = 0; //the score associated with dropping source words is not part of the language model
} else { //non-empty target phrase
// 1st n-gram
vector<const Factor*> contextFactor(nGramOrder);
vector<const FactorArray*> contextFactor(nGramOrder);
size_t index = 0;
for (int currPos = (int) startPos - (int) nGramOrder + 1 ; currPos <= (int) startPos ; currPos++)
{
if (currPos >= 0)
contextFactor[index++] = GetFactor(currPos, factorType);
contextFactor[index++] = &GetFactorArray(currPos);
else
contextFactor[index++] = languageModel.GetSentenceStart();
contextFactor[index++] = &languageModel.GetSentenceStartArray();
}
lmScore = languageModel.GetValue(contextFactor);
//cout<<"context factor: "<<languageModel.GetValue(contextFactor)<<endl;
@ -287,7 +287,7 @@ void Hypothesis::CalcLMScore(const LMList &languageModels)
contextFactor[i] = contextFactor[i + 1];
// add last factor
contextFactor.back() = GetFactor(currPos, factorType);
contextFactor.back() = &GetFactorArray(currPos);
lmScore += languageModel.GetValue(contextFactor);
//cout<<"context factor: "<<languageModel.GetValue(contextFactor)<<endl;
@ -297,22 +297,22 @@ void Hypothesis::CalcLMScore(const LMList &languageModels)
if (m_sourceCompleted.IsComplete())
{
const size_t size = GetSize();
contextFactor.back() = languageModel.GetSentenceEnd();
contextFactor.back() = &languageModel.GetSentenceEndArray();
for (size_t i = 0 ; i < nGramOrder - 1 ; i ++)
{
int currPos = size - nGramOrder + i + 1;
if (currPos < 0)
contextFactor[i] = languageModel.GetSentenceStart();
contextFactor[i] = &languageModel.GetSentenceStartArray();
else
contextFactor[i] = GetFactor((size_t)currPos, factorType);
contextFactor[i] = &GetFactorArray((size_t)currPos);
}
lmScore += languageModel.GetValue(contextFactor, &m_languageModelStates[lmIdx]);
} else {
for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
for (size_t i = 0 ; i < nGramOrder - 1 ; i++)
contextFactor[i] = contextFactor[i + 1];
contextFactor.back() = GetFactor(currPos, factorType);
contextFactor.back() = &GetFactorArray(currPos);
}
m_languageModelStates[lmIdx]=languageModel.GetState(contextFactor);
}

View File

@ -46,3 +46,49 @@ unsigned int LanguageModel::GetNumScoreComponents() const
{
return 1;
}
// Score a phrase with this LM in isolation (no surrounding context).
//   fullScore  - sum of LM log-probs over ALL n-grams in the phrase,
//                including the partial (shorter-than-order) prefix n-grams.
//   ngramScore - sum over only the full-order n-grams; this part stays
//                valid when the phrase is later embedded in a hypothesis,
//                so it can be cached (used to speed up hypothesis scoring).
// Context words are passed as FactorArray pointers; the concrete LM
// implementation picks out the factor(s) it needs in GetValue().
void LanguageModel::CalcScore(const Phrase &phrase
, float &fullScore
, float &ngramScore) const
{
fullScore = 0;
ngramScore = 0;
size_t phraseSize = phrase.GetSize();
vector<const FactorArray*> contextFactor;
contextFactor.reserve(m_nGramOrder);
// start of sentence: score the partial n-grams for positions
// 0 .. order-2 (context grows by one word each step)
for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
{
contextFactor.push_back(&phrase.GetFactorArray(currPos));
fullScore += GetValue(contextFactor);
}
// first full-order n-gram (ends at position order-1), if the phrase is long enough
if (phraseSize >= m_nGramOrder)
{
contextFactor.push_back(&phrase.GetFactorArray(m_nGramOrder - 1));
ngramScore = GetValue(contextFactor);
}
// main loop
for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
{ // used by hypo to speed up lm score calc
// slide the context window left by one ...
for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
{
contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
}
// ... and append the current word, then score the resulting n-gram
contextFactor[m_nGramOrder - 1] = &phrase.GetFactorArray(currPos);
float partScore = GetValue(contextFactor);
ngramScore += partScore;
}
// fullScore = prefix (partial n-gram) scores + all full-order n-gram scores
fullScore += ngramScore;
}
// Return the LM state reached after consuming the given context.
// Convenience wrapper: the score from GetValue() is discarded; only the
// final-state out-parameter is kept (used for hypothesis recombination).
LanguageModel::State LanguageModel::GetState(const std::vector<const FactorArray*> &contextFactor) const
{
State state;
GetValue(contextFactor,&state);
return state;
}

View File

@ -39,6 +39,8 @@ protected:
std::string m_filename;
size_t m_nGramOrder;
public:
typedef const void* State;
LanguageModel();
virtual ~LanguageModel();
virtual void Load(const std::string &fileName
@ -50,9 +52,12 @@ public:
// see ScoreProducer.h
unsigned int GetNumScoreComponents() const;
virtual void CalcScore(const Phrase &phrase
void CalcScore(const Phrase &phrase
, float &fullScore
, float &ngramScore) const = 0;
, float &ngramScore) const;
virtual float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const = 0;
State GetState(const std::vector<const FactorArray*> &contextFactor) const;
size_t GetNGramOrder() const
{
return m_nGramOrder;

View File

@ -30,8 +30,5 @@ protected:
FactorType m_factorType; // tempoary
public:
const std::string GetScoreProducerDescription() const;
virtual float GetValue(const std::vector<const FactorArray*> &contextFactor0) const = 0;
const std::string GetScoreProducerDescription() const;
};

View File

@ -40,6 +40,8 @@ LanguageModelSingleFactor::State LanguageModelSingleFactor::UnknownState=0;
// Zero-initialize the sentence-boundary factor arrays. The actual BOS/EOS
// Factor pointers are filled in later by the concrete LM's CreateFactors()
// (e.g. m_sentenceStartArray[m_factorType] = m_sentenceStart).
LanguageModelSingleFactor::LanguageModelSingleFactor()
{
Word::Initialize(m_sentenceStartArray);
Word::Initialize(m_sentenceEndArray);
}
LanguageModelSingleFactor::~LanguageModelSingleFactor() {}
@ -52,49 +54,5 @@ const std::string LanguageModelSingleFactor::GetScoreProducerDescription() const
return oss.str();
}
// Score a phrase with this single-factor LM in isolation.
//   fullScore  - sum of LM log-probs over ALL n-grams, including the
//                partial prefix n-grams at the start of the phrase.
//   ngramScore - sum over only the full-order n-grams (the portion that
//                remains valid once the phrase has real left context).
// Only the single factor selected by GetFactorType() is consulted.
void LanguageModelSingleFactor::CalcScore(const Phrase &phrase
, float &fullScore
, float &ngramScore) const
{
fullScore = 0;
ngramScore = 0;
FactorType factorType = GetFactorType();
size_t phraseSize = phrase.GetSize();
vector<const Factor*> contextFactor;
contextFactor.reserve(m_nGramOrder);
// start of sentence: partial n-grams for positions 0 .. order-2
for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
{
contextFactor.push_back(phrase.GetFactor(currPos, factorType));
fullScore += GetValue(contextFactor);
}
// first full-order n-gram, if the phrase is long enough
if (phraseSize >= m_nGramOrder)
{
contextFactor.push_back(phrase.GetFactor(m_nGramOrder - 1, factorType));
ngramScore = GetValue(contextFactor);
}
// main loop
for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
{ // used by hypo to speed up lm score calc
// shift context window left by one word ...
for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
{
contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
}
// ... append current word and score the full-order n-gram
contextFactor[m_nGramOrder - 1] = phrase.GetFactor(currPos, factorType);
float partScore = GetValue(contextFactor);
ngramScore += partScore;
}
// total = prefix scores + full-order n-gram scores
fullScore += ngramScore;
}
// Return the LM state after consuming the given single-factor context;
// the GetValue() score itself is discarded (state is what callers need
// for hypothesis recombination).
LanguageModelSingleFactor::State LanguageModelSingleFactor::GetState(const std::vector<const Factor*> &contextFactor) const
{
State state;
GetValue(contextFactor,&state);
return state;
}

View File

@ -31,9 +31,9 @@ class LanguageModelSingleFactor : public LanguageModel
{
protected:
const Factor *m_sentenceStart, *m_sentenceEnd;
FactorArray m_sentenceStartArray, m_sentenceEndArray;
FactorType m_factorType;
public:
typedef const void* State;
static State UnknownState;
LanguageModelSingleFactor();
@ -52,9 +52,14 @@ public:
{
return m_sentenceEnd;
}
virtual void CalcScore(const Phrase &phrase
, float &fullScore
, float &ngramScore) const;
const FactorArray &GetSentenceStartArray() const
{
return m_sentenceStartArray;
}
const FactorArray &GetSentenceEndArray() const
{
return m_sentenceEndArray;
}
FactorType GetFactorType() const
{
return m_factorType;
@ -68,12 +73,5 @@ public:
m_weight = weight;
}
const std::string GetScoreProducerDescription() const;
virtual float GetValue(const std::vector<const Factor*> &contextFactor, State* finalState = 0) const = 0;
LanguageModelSingleFactor::State GetState(const std::vector<const Factor*> &contextFactor) const;
// one of the following should probably be made available
// virtual LmId GetLmID( const Factor *factor ) const = 0;
// virtual LmId GetLmID( const std::string &factor ) const = 0;
};

View File

@ -46,45 +46,7 @@ public:
m_realNGramOrder = 3; // fixed for now
}
// Score a phrase with the chunk-based LM (enclosing class declaration is
// outside this view). Same contract as LanguageModel::CalcScore:
//   fullScore  - log-prob over ALL n-grams incl. partial prefix n-grams
//   ngramScore - log-prob over full-order n-grams only (cacheable part)
// NOTE(review): scoring here slides over every surface position; the
// chunk filtering ("I-" prefix skipping) presumably happens inside
// GetValue() — confirm against the rest of this class.
void CalcScore(const Phrase &phrase
, float &fullScore
, float &ngramScore) const
{
fullScore = 0;
ngramScore = 0;
size_t phraseSize = phrase.GetSize();
std::vector<const FactorArray*> contextFactor;
contextFactor.reserve(m_nGramOrder);
// start of sentence: partial n-grams for positions 0 .. order-2
for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
{
contextFactor.push_back(&phrase.GetFactorArray(currPos));
fullScore += GetValue(contextFactor);
}
// first full-order n-gram, if the phrase is long enough
if (phraseSize >= m_nGramOrder)
{
contextFactor.push_back(&phrase.GetFactorArray(m_nGramOrder - 1));
ngramScore = GetValue(contextFactor);
}
// main loop
for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
{ // used by hypo to speed up lm score calc
// shift window left by one ...
for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
{
contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
}
// ... append current word, score full-order n-gram
contextFactor[m_nGramOrder - 1] = &phrase.GetFactorArray(currPos);
float partScore = GetValue(contextFactor);
ngramScore += partScore;
}
// total = prefix scores + full-order n-gram scores
fullScore += ngramScore;
}
float GetValue(const std::vector<const FactorArray*> &contextFactor) const
float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const
{
if (contextFactor.size() == 0)
{
@ -98,13 +60,13 @@ public:
// create vector of just B-factors, in reverse order
size_t currOrder = 0;
std::vector<const Factor*> chunkContext;
std::vector<const FactorArray*> chunkContext;
for (int currPos = (int)contextFactor.size() - 1 ; currPos >= 0 ; --currPos )
{
const Factor *factor = *contextFactor[currPos][m_factorType];
if (factor->GetString().substr(0, 2) != "I-")
{
chunkContext.push_back(factor);
chunkContext.push_back(contextFactor[currPos]);
if (++currOrder >= m_realNGramOrder)
break;
}
@ -113,7 +75,6 @@ public:
// create context factor the right way round
std::reverse(chunkContext.begin(), chunkContext.end());
// calc score on that phrase
LanguageModelSingleFactor::State *finalState; // what shall we do with this ???
return m_lmImpl.GetValue(chunkContext, finalState);
}

View File

@ -90,11 +90,13 @@ void LanguageModel_IRST::CreateFactors(FactorCollection &factorCollection)
factorId = m_sentenceStart->GetId();
lmIdMap[factorId] = GetLmID(BOS_);
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceStartArray[m_factorType] = m_sentenceStart;
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
factorId = m_sentenceEnd->GetId();
lmIdMap[factorId] = GetLmID(EOS_);;
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
// add to lookup vector in object
m_lmIdLookup.resize(maxFactorId+1);
@ -119,8 +121,10 @@ int LanguageModel_IRST::GetLmID( const Factor *factor ) const
return ( factorId >= m_lmIdLookup.size()) ? m_unknownId : m_lmIdLookup[factorId];
}
float LanguageModel_IRST::GetValue(const vector<const Factor*> &contextFactor, State* finalState) const
float LanguageModel_IRST::GetValue(const vector<const FactorArray*> &contextFactor, State* finalState) const
{
FactorType factorType = GetFactorType();
// set up context
size_t count = contextFactor.size();
ngram ng(m_lmtb->dict);
@ -130,7 +134,7 @@ float LanguageModel_IRST::GetValue(const vector<const Factor*> &contextFactor, S
#ifdef CDYER_DEBUG_LMSCORE
std::cout << i <<"="<<contextFactor[i]->GetLmId().irst <<"," << contextFactor[i]->GetString()<<" ";
#endif
int lmId = GetLmID(contextFactor[i]);
int lmId = GetLmID((*contextFactor[i])[factorType]);
ng.pushc(lmId);
}
#ifdef CDYER_DEBUG_LMSCORE

View File

@ -56,7 +56,7 @@ public:
, float weight
, size_t nGramOrder);
virtual float GetValue(const std::vector<const Factor*> &contextFactor, State* finalState = 0) const;
virtual float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const;
};

View File

@ -97,11 +97,13 @@ void LanguageModel_SRI::CreateFactors(FactorCollection &factorCollection)
factorId = m_sentenceStart->GetId();
lmIdMap[factorId] = GetLmID(BOS_);
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceStartArray[m_factorType] = m_sentenceStart;
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
factorId = m_sentenceEnd->GetId();
lmIdMap[factorId] = GetLmID(EOS_);
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
// add to lookup vector in object
m_lmIdLookup.resize(maxFactorId+1);
@ -131,8 +133,9 @@ float LanguageModel_SRI::GetValue(VocabIndex wordId, VocabIndex *context) const
return FloorSRIScore(TransformSRIScore(p)); // log10->log
}
float LanguageModel_SRI::GetValue(const vector<const Factor*> &contextFactor, State* finalState) const
float LanguageModel_SRI::GetValue(const vector<const FactorArray*> &contextFactor, State* finalState) const
{
FactorType factorType = GetFactorType();
size_t count = contextFactor.size();
if (count <= 0)
{
@ -144,12 +147,13 @@ float LanguageModel_SRI::GetValue(const vector<const Factor*> &contextFactor, St
VocabIndex context[MAX_NGRAM_SIZE];
for (size_t i = 0 ; i < count - 1 ; i++)
{
context[i] = GetLmID(contextFactor[count-2-i]);
context[i] = GetLmID((*contextFactor[count-2-i])[factorType]);
}
context[count-1] = Vocab_None;
assert((*contextFactor[count-1])[factorType] != NULL);
// call sri lm fn
VocabIndex lmId= GetLmID(contextFactor[count-1]);
VocabIndex lmId= GetLmID((*contextFactor[count-1])[factorType]);
float ret = GetValue(lmId, context);
if (finalState) {

View File

@ -56,6 +56,6 @@ public:
, float weight
, size_t nGramOrder);
virtual float GetValue(const std::vector<const Factor*> &contextFactor, State* finalState = 0) const;
virtual float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const;
};