mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-07 00:39:23 +03:00
factored lm
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@533 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
4a88275261
commit
33b3fc67e4
@ -265,14 +265,14 @@ void Hypothesis::CalcLMScore(const LMList &languageModels)
|
||||
lmScore = 0; //the score associated with dropping source words is not part of the language model
|
||||
} else { //non-empty target phrase
|
||||
// 1st n-gram
|
||||
vector<const Factor*> contextFactor(nGramOrder);
|
||||
vector<const FactorArray*> contextFactor(nGramOrder);
|
||||
size_t index = 0;
|
||||
for (int currPos = (int) startPos - (int) nGramOrder + 1 ; currPos <= (int) startPos ; currPos++)
|
||||
{
|
||||
if (currPos >= 0)
|
||||
contextFactor[index++] = GetFactor(currPos, factorType);
|
||||
contextFactor[index++] = &GetFactorArray(currPos);
|
||||
else
|
||||
contextFactor[index++] = languageModel.GetSentenceStart();
|
||||
contextFactor[index++] = &languageModel.GetSentenceStartArray();
|
||||
}
|
||||
lmScore = languageModel.GetValue(contextFactor);
|
||||
//cout<<"context factor: "<<languageModel.GetValue(contextFactor)<<endl;
|
||||
@ -287,7 +287,7 @@ void Hypothesis::CalcLMScore(const LMList &languageModels)
|
||||
contextFactor[i] = contextFactor[i + 1];
|
||||
|
||||
// add last factor
|
||||
contextFactor.back() = GetFactor(currPos, factorType);
|
||||
contextFactor.back() = &GetFactorArray(currPos);
|
||||
|
||||
lmScore += languageModel.GetValue(contextFactor);
|
||||
//cout<<"context factor: "<<languageModel.GetValue(contextFactor)<<endl;
|
||||
@ -297,22 +297,22 @@ void Hypothesis::CalcLMScore(const LMList &languageModels)
|
||||
if (m_sourceCompleted.IsComplete())
|
||||
{
|
||||
const size_t size = GetSize();
|
||||
contextFactor.back() = languageModel.GetSentenceEnd();
|
||||
contextFactor.back() = &languageModel.GetSentenceEndArray();
|
||||
|
||||
for (size_t i = 0 ; i < nGramOrder - 1 ; i ++)
|
||||
{
|
||||
int currPos = size - nGramOrder + i + 1;
|
||||
if (currPos < 0)
|
||||
contextFactor[i] = languageModel.GetSentenceStart();
|
||||
contextFactor[i] = &languageModel.GetSentenceStartArray();
|
||||
else
|
||||
contextFactor[i] = GetFactor((size_t)currPos, factorType);
|
||||
contextFactor[i] = &GetFactorArray((size_t)currPos);
|
||||
}
|
||||
lmScore += languageModel.GetValue(contextFactor, &m_languageModelStates[lmIdx]);
|
||||
} else {
|
||||
for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
|
||||
for (size_t i = 0 ; i < nGramOrder - 1 ; i++)
|
||||
contextFactor[i] = contextFactor[i + 1];
|
||||
contextFactor.back() = GetFactor(currPos, factorType);
|
||||
contextFactor.back() = &GetFactorArray(currPos);
|
||||
}
|
||||
m_languageModelStates[lmIdx]=languageModel.GetState(contextFactor);
|
||||
}
|
||||
|
@ -46,3 +46,49 @@ unsigned int LanguageModel::GetNumScoreComponents() const
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
void LanguageModel::CalcScore(const Phrase &phrase
|
||||
, float &fullScore
|
||||
, float &ngramScore) const
|
||||
{
|
||||
fullScore = 0;
|
||||
ngramScore = 0;
|
||||
|
||||
size_t phraseSize = phrase.GetSize();
|
||||
vector<const FactorArray*> contextFactor;
|
||||
contextFactor.reserve(m_nGramOrder);
|
||||
|
||||
// start of sentence
|
||||
for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
|
||||
{
|
||||
contextFactor.push_back(&phrase.GetFactorArray(currPos));
|
||||
fullScore += GetValue(contextFactor);
|
||||
}
|
||||
|
||||
if (phraseSize >= m_nGramOrder)
|
||||
{
|
||||
contextFactor.push_back(&phrase.GetFactorArray(m_nGramOrder - 1));
|
||||
ngramScore = GetValue(contextFactor);
|
||||
}
|
||||
|
||||
// main loop
|
||||
for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
|
||||
{ // used by hypo to speed up lm score calc
|
||||
for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
|
||||
{
|
||||
contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
|
||||
}
|
||||
contextFactor[m_nGramOrder - 1] = &phrase.GetFactorArray(currPos);
|
||||
float partScore = GetValue(contextFactor);
|
||||
ngramScore += partScore;
|
||||
}
|
||||
fullScore += ngramScore;
|
||||
}
|
||||
|
||||
// Returns the language-model state reported by GetValue() for the given
// context; the score itself is discarded.
//   contextFactor - the n-gram window, one FactorArray pointer per word
LanguageModel::State LanguageModel::GetState(const std::vector<const FactorArray*> &contextFactor) const
{
	// Start from a null state: a GetValue() override is not obliged to write
	// finalState on every path, and returning an uninitialised pointer would
	// hand the caller an indeterminate value.
	State state = NULL;
	GetValue(contextFactor, &state);
	return state;
}
|
||||
|
||||
|
@ -39,6 +39,8 @@ protected:
|
||||
std::string m_filename;
|
||||
size_t m_nGramOrder;
|
||||
public:
|
||||
typedef const void* State;
|
||||
|
||||
LanguageModel();
|
||||
virtual ~LanguageModel();
|
||||
virtual void Load(const std::string &fileName
|
||||
@ -50,9 +52,12 @@ public:
|
||||
// see ScoreProducer.h
|
||||
unsigned int GetNumScoreComponents() const;
|
||||
|
||||
virtual void CalcScore(const Phrase &phrase
|
||||
void CalcScore(const Phrase &phrase
|
||||
, float &fullScore
|
||||
, float &ngramScore) const = 0;
|
||||
, float &ngramScore) const;
|
||||
virtual float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const = 0;
|
||||
State GetState(const std::vector<const FactorArray*> &contextFactor) const;
|
||||
|
||||
size_t GetNGramOrder() const
|
||||
{
|
||||
return m_nGramOrder;
|
||||
|
@ -30,8 +30,5 @@ protected:
|
||||
FactorType m_factorType; // temporary
|
||||
|
||||
public:
|
||||
const std::string GetScoreProducerDescription() const;
|
||||
|
||||
virtual float GetValue(const std::vector<const FactorArray*> &contextFactor0) const = 0;
|
||||
|
||||
const std::string GetScoreProducerDescription() const;
|
||||
};
|
||||
|
@ -40,6 +40,8 @@ LanguageModelSingleFactor::State LanguageModelSingleFactor::UnknownState=0;
|
||||
|
||||
// Default constructor: passes both sentence-boundary FactorArray members
// through Word::Initialize before any factor is stored in them.
// NOTE(review): presumably this clears every slot of the array - confirm in Word.
LanguageModelSingleFactor::LanguageModelSingleFactor()
|
||||
{
|
||||
Word::Initialize(m_sentenceStartArray);
|
||||
Word::Initialize(m_sentenceEndArray);
|
||||
}
|
||||
LanguageModelSingleFactor::~LanguageModelSingleFactor() {}
|
||||
|
||||
@ -52,49 +54,5 @@ const std::string LanguageModelSingleFactor::GetScoreProducerDescription() const
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
void LanguageModelSingleFactor::CalcScore(const Phrase &phrase
|
||||
, float &fullScore
|
||||
, float &ngramScore) const
|
||||
{
|
||||
fullScore = 0;
|
||||
ngramScore = 0;
|
||||
FactorType factorType = GetFactorType();
|
||||
|
||||
size_t phraseSize = phrase.GetSize();
|
||||
vector<const Factor*> contextFactor;
|
||||
contextFactor.reserve(m_nGramOrder);
|
||||
|
||||
// start of sentence
|
||||
for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
|
||||
{
|
||||
contextFactor.push_back(phrase.GetFactor(currPos, factorType));
|
||||
fullScore += GetValue(contextFactor);
|
||||
}
|
||||
|
||||
if (phraseSize >= m_nGramOrder)
|
||||
{
|
||||
contextFactor.push_back(phrase.GetFactor(m_nGramOrder - 1, factorType));
|
||||
ngramScore = GetValue(contextFactor);
|
||||
}
|
||||
|
||||
// main loop
|
||||
for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
|
||||
{ // used by hypo to speed up lm score calc
|
||||
for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
|
||||
{
|
||||
contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
|
||||
}
|
||||
contextFactor[m_nGramOrder - 1] = phrase.GetFactor(currPos, factorType);
|
||||
float partScore = GetValue(contextFactor);
|
||||
ngramScore += partScore;
|
||||
}
|
||||
fullScore += ngramScore;
|
||||
}
|
||||
|
||||
LanguageModelSingleFactor::State LanguageModelSingleFactor::GetState(const std::vector<const Factor*> &contextFactor) const
|
||||
{
|
||||
State state;
|
||||
GetValue(contextFactor,&state);
|
||||
return state;
|
||||
}
|
||||
|
||||
|
@ -31,9 +31,9 @@ class LanguageModelSingleFactor : public LanguageModel
|
||||
{
|
||||
protected:
|
||||
const Factor *m_sentenceStart, *m_sentenceEnd;
|
||||
FactorArray m_sentenceStartArray, m_sentenceEndArray;
|
||||
FactorType m_factorType;
|
||||
public:
|
||||
typedef const void* State;
|
||||
static State UnknownState;
|
||||
|
||||
LanguageModelSingleFactor();
|
||||
@ -52,9 +52,14 @@ public:
|
||||
{
|
||||
return m_sentenceEnd;
|
||||
}
|
||||
virtual void CalcScore(const Phrase &phrase
|
||||
, float &fullScore
|
||||
, float &ngramScore) const;
|
||||
// Read-only access to m_sentenceStartArray, the sentence-start marker
// held as a whole FactorArray rather than as a single Factor.
const FactorArray &GetSentenceStartArray() const
|
||||
{
|
||||
return m_sentenceStartArray;
|
||||
}
|
||||
// Read-only access to m_sentenceEndArray, the sentence-end marker
// held as a whole FactorArray rather than as a single Factor.
const FactorArray &GetSentenceEndArray() const
|
||||
{
|
||||
return m_sentenceEndArray;
|
||||
}
|
||||
FactorType GetFactorType() const
|
||||
{
|
||||
return m_factorType;
|
||||
@ -68,12 +73,5 @@ public:
|
||||
m_weight = weight;
|
||||
}
|
||||
const std::string GetScoreProducerDescription() const;
|
||||
virtual float GetValue(const std::vector<const Factor*> &contextFactor, State* finalState = 0) const = 0;
|
||||
|
||||
LanguageModelSingleFactor::State GetState(const std::vector<const Factor*> &contextFactor) const;
|
||||
|
||||
// one of the following should probably be made available
|
||||
// virtual LmId GetLmID( const Factor *factor ) const = 0;
|
||||
// virtual LmId GetLmID( const std::string &factor ) const = 0;
|
||||
};
|
||||
|
||||
|
@ -46,45 +46,7 @@ public:
|
||||
m_realNGramOrder = 3; // fixed for now
|
||||
}
|
||||
|
||||
void CalcScore(const Phrase &phrase
|
||||
, float &fullScore
|
||||
, float &ngramScore) const
|
||||
{
|
||||
fullScore = 0;
|
||||
ngramScore = 0;
|
||||
|
||||
size_t phraseSize = phrase.GetSize();
|
||||
std::vector<const FactorArray*> contextFactor;
|
||||
contextFactor.reserve(m_nGramOrder);
|
||||
|
||||
// start of sentence
|
||||
for (size_t currPos = 0 ; currPos < m_nGramOrder - 1 && currPos < phraseSize ; currPos++)
|
||||
{
|
||||
contextFactor.push_back(&phrase.GetFactorArray(currPos));
|
||||
fullScore += GetValue(contextFactor);
|
||||
}
|
||||
|
||||
if (phraseSize >= m_nGramOrder)
|
||||
{
|
||||
contextFactor.push_back(&phrase.GetFactorArray(m_nGramOrder - 1));
|
||||
ngramScore = GetValue(contextFactor);
|
||||
}
|
||||
|
||||
// main loop
|
||||
for (size_t currPos = m_nGramOrder; currPos < phraseSize ; currPos++)
|
||||
{ // used by hypo to speed up lm score calc
|
||||
for (size_t currNGramOrder = 0 ; currNGramOrder < m_nGramOrder - 1 ; currNGramOrder++)
|
||||
{
|
||||
contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
|
||||
}
|
||||
contextFactor[m_nGramOrder - 1] = &phrase.GetFactorArray(currPos);
|
||||
float partScore = GetValue(contextFactor);
|
||||
ngramScore += partScore;
|
||||
}
|
||||
fullScore += ngramScore;
|
||||
}
|
||||
|
||||
float GetValue(const std::vector<const FactorArray*> &contextFactor) const
|
||||
float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const
|
||||
{
|
||||
if (contextFactor.size() == 0)
|
||||
{
|
||||
@ -98,13 +60,13 @@ public:
|
||||
|
||||
// create vector of just B-factors, in reverse order
|
||||
size_t currOrder = 0;
|
||||
std::vector<const Factor*> chunkContext;
|
||||
std::vector<const FactorArray*> chunkContext;
|
||||
for (int currPos = (int)contextFactor.size() - 1 ; currPos >= 0 ; --currPos )
|
||||
{
|
||||
const Factor *factor = *contextFactor[currPos][m_factorType];
|
||||
if (factor->GetString().substr(0, 2) != "I-")
|
||||
{
|
||||
chunkContext.push_back(factor);
|
||||
chunkContext.push_back(contextFactor[currPos]);
|
||||
if (++currOrder >= m_realNGramOrder)
|
||||
break;
|
||||
}
|
||||
@ -113,7 +75,6 @@ public:
|
||||
// create context factor the right way round
|
||||
std::reverse(chunkContext.begin(), chunkContext.end());
|
||||
// calc score on that phrase
|
||||
LanguageModelSingleFactor::State *finalState; // what shall we do with this ???
|
||||
return m_lmImpl.GetValue(chunkContext, finalState);
|
||||
}
|
||||
|
||||
|
@ -90,11 +90,13 @@ void LanguageModel_IRST::CreateFactors(FactorCollection &factorCollection)
|
||||
factorId = m_sentenceStart->GetId();
|
||||
lmIdMap[factorId] = GetLmID(BOS_);
|
||||
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||||
m_sentenceStartArray[m_factorType] = m_sentenceStart;
|
||||
|
||||
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
|
||||
factorId = m_sentenceEnd->GetId();
|
||||
lmIdMap[factorId] = GetLmID(EOS_);;
|
||||
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||||
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
|
||||
|
||||
// add to lookup vector in object
|
||||
m_lmIdLookup.resize(maxFactorId+1);
|
||||
@ -119,8 +121,10 @@ int LanguageModel_IRST::GetLmID( const Factor *factor ) const
|
||||
return ( factorId >= m_lmIdLookup.size()) ? m_unknownId : m_lmIdLookup[factorId];
|
||||
}
|
||||
|
||||
float LanguageModel_IRST::GetValue(const vector<const Factor*> &contextFactor, State* finalState) const
|
||||
float LanguageModel_IRST::GetValue(const vector<const FactorArray*> &contextFactor, State* finalState) const
|
||||
{
|
||||
FactorType factorType = GetFactorType();
|
||||
|
||||
// set up context
|
||||
size_t count = contextFactor.size();
|
||||
ngram ng(m_lmtb->dict);
|
||||
@ -130,7 +134,7 @@ float LanguageModel_IRST::GetValue(const vector<const Factor*> &contextFactor, S
|
||||
#ifdef CDYER_DEBUG_LMSCORE
|
||||
std::cout << i <<"="<<contextFactor[i]->GetLmId().irst <<"," << contextFactor[i]->GetString()<<" ";
|
||||
#endif
|
||||
int lmId = GetLmID(contextFactor[i]);
|
||||
int lmId = GetLmID((*contextFactor[i])[factorType]);
|
||||
ng.pushc(lmId);
|
||||
}
|
||||
#ifdef CDYER_DEBUG_LMSCORE
|
||||
|
@ -56,7 +56,7 @@ public:
|
||||
, float weight
|
||||
, size_t nGramOrder);
|
||||
|
||||
virtual float GetValue(const std::vector<const Factor*> &contextFactor, State* finalState = 0) const;
|
||||
virtual float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const;
|
||||
|
||||
};
|
||||
|
||||
|
@ -97,11 +97,13 @@ void LanguageModel_SRI::CreateFactors(FactorCollection &factorCollection)
|
||||
factorId = m_sentenceStart->GetId();
|
||||
lmIdMap[factorId] = GetLmID(BOS_);
|
||||
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||||
m_sentenceStartArray[m_factorType] = m_sentenceStart;
|
||||
|
||||
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
|
||||
factorId = m_sentenceEnd->GetId();
|
||||
lmIdMap[factorId] = GetLmID(EOS_);
|
||||
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||||
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
|
||||
|
||||
// add to lookup vector in object
|
||||
m_lmIdLookup.resize(maxFactorId+1);
|
||||
@ -131,8 +133,9 @@ float LanguageModel_SRI::GetValue(VocabIndex wordId, VocabIndex *context) const
|
||||
return FloorSRIScore(TransformSRIScore(p)); // log10->log
|
||||
}
|
||||
|
||||
float LanguageModel_SRI::GetValue(const vector<const Factor*> &contextFactor, State* finalState) const
|
||||
float LanguageModel_SRI::GetValue(const vector<const FactorArray*> &contextFactor, State* finalState) const
|
||||
{
|
||||
FactorType factorType = GetFactorType();
|
||||
size_t count = contextFactor.size();
|
||||
if (count <= 0)
|
||||
{
|
||||
@ -144,12 +147,13 @@ float LanguageModel_SRI::GetValue(const vector<const Factor*> &contextFactor, St
|
||||
VocabIndex context[MAX_NGRAM_SIZE];
|
||||
for (size_t i = 0 ; i < count - 1 ; i++)
|
||||
{
|
||||
context[i] = GetLmID(contextFactor[count-2-i]);
|
||||
context[i] = GetLmID((*contextFactor[count-2-i])[factorType]);
|
||||
}
|
||||
context[count-1] = Vocab_None;
|
||||
|
||||
assert((*contextFactor[count-1])[factorType] != NULL);
|
||||
// call sri lm fn
|
||||
VocabIndex lmId= GetLmID(contextFactor[count-1]);
|
||||
VocabIndex lmId= GetLmID((*contextFactor[count-1])[factorType]);
|
||||
float ret = GetValue(lmId, context);
|
||||
|
||||
if (finalState) {
|
||||
|
@ -56,6 +56,6 @@ public:
|
||||
, float weight
|
||||
, size_t nGramOrder);
|
||||
|
||||
virtual float GetValue(const std::vector<const Factor*> &contextFactor, State* finalState = 0) const;
|
||||
virtual float GetValue(const std::vector<const FactorArray*> &contextFactor, State* finalState = NULL) const;
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user