mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-02 17:09:36 +03:00
91 lines
3.4 KiB
C++
91 lines
3.4 KiB
C++
|
#include <cassert>
|
||
|
#include <limits>
|
||
|
#include <iostream>
|
||
|
#include <fstream>
|
||
|
|
||
|
#include "FactorCollection.h"
|
||
|
#include "Phrase.h"
|
||
|
#include "InputFileStream.h"
|
||
|
#include "StaticData.h"
|
||
|
#include "LanguageModelORLM.h"
|
||
|
|
||
|
using std::map;
|
||
|
namespace Moses
|
||
|
{
|
||
|
bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
|
||
|
size_t nGramOrder) {
|
||
|
cerr << "Loading LanguageModelORLM..." << endl;
|
||
|
m_filePath = filePath;
|
||
|
m_factorType = factorType;
|
||
|
m_nGramOrder = nGramOrder;
|
||
|
FileHandler fLmIn(m_filePath, std::ios::in|std::ios::binary, true);
|
||
|
m_lm = new OnlineRLM<T>(&fLmIn, m_nGramOrder);
|
||
|
fLmIn.close();
|
||
|
//m_lm = new MultiOnlineRLM<T>(m_filePath, m_nGramOrder);
|
||
|
assert(m_lm != NULL);
|
||
|
// get special word ids
|
||
|
m_oov_id = m_lm->vocab_->GetWordID("<unk>");
|
||
|
CreateFactors();
|
||
|
return true;
|
||
|
}
|
||
|
void LanguageModelORLM::CreateFactors() {
|
||
|
FactorCollection &factorCollection = FactorCollection::Instance();
|
||
|
size_t maxFactorId = 0; // to create lookup vector later on
|
||
|
std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id
|
||
|
|
||
|
for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
|
||
|
vIter != m_lm->vocab_->VocabEnd(); vIter++){
|
||
|
// get word from ORLM vocab and associate with (new) factor id
|
||
|
size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
|
||
|
m_lmids_map[factorId] = vIter->second;
|
||
|
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||
|
}
|
||
|
// add factors for BOS and EOS and store bf word ids
|
||
|
size_t factorId;
|
||
|
m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, "<s>");
|
||
|
factorId = m_sentenceStart->GetId();
|
||
|
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||
|
m_sentenceStartArray[m_factorType] = m_sentenceStart;
|
||
|
|
||
|
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, "</s>");
|
||
|
factorId = m_sentenceEnd->GetId();
|
||
|
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||
|
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
|
||
|
// add to lookup vector in object
|
||
|
lm_ids_vec_.resize(maxFactorId+1);
|
||
|
// fill with OOV code
|
||
|
fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
|
||
|
|
||
|
for (map<size_t, wordID_t>::const_iterator iter = m_lmids_map.begin();
|
||
|
iter != m_lmids_map.end() ; ++iter)
|
||
|
lm_ids_vec_[iter->first] = iter->second;
|
||
|
}
|
||
|
// Return the id that the ORLM vocabulary reports for the surface string str.
wordID_t LanguageModelORLM::GetLmID(const std::string& str) const {
  const wordID_t lmWordId = m_lm->vocab_->GetWordID(str);
  return lmWordId;
}
|
||
|
// Return the ORLM word id stored in lm_ids_vec_ for the given factor.
// Factor ids that fall outside the table yield m_oov_id.
wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const {
  const size_t tableIndex = factor->GetId();
  if (tableIndex < lm_ids_vec_.size()) {
    return lm_ids_vec_[tableIndex];
  }
  return m_oov_id;
}
|
||
|
// Score the word sequence in contextFactor with the ORLM.
//
// Each word is mapped to an ORLM word id via GetLmID() using this model's
// factor type, and the id sequence is passed to m_lm->getProb().
//
// contextFactor: the words to score. It must hold at most MAX_NGRAM_SIZE
//                entries, because the ids are staged in a fixed-size stack
//                buffer; this precondition is asserted.
// finalState:    forwarded unchanged to m_lm->getProb().
//
// Returns an LMResult whose score is FloorScore(TransformLMScore(prob)) and
// whose unknown flag is set when the sequence is non-empty and its last word
// maps to m_oov_id.
LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
                                     State* finalState) const {
  const FactorType factorType = GetFactorType();

  // The loop below writes one id per context word into ngram[]; a context
  // longer than the buffer would overflow the stack. The assert turns that
  // into a defined failure in builds with assertions enabled; release builds
  // still rely on callers respecting the limit.
  assert(contextFactor.size() <= static_cast<size_t>(MAX_NGRAM_SIZE));

  // set up context
  wordID_t ngram[MAX_NGRAM_SIZE];
  const int count = static_cast<int>(contextFactor.size());
  for (int i = 0; i < count; ++i) {
    ngram[i] = GetLmID((*contextFactor[i])[factorType]);
  }

  LMResult ret;
  ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, finalState)));
  // only a non-empty sequence can end in an unknown word
  ret.unknown = (count > 0) && (ngram[count - 1] == m_oov_id);
  return ret;
}
|
||
|
}
|