mosesdecoder/contrib/other-builds/moses2/LM/KENLM.cpp

196 lines
5.1 KiB
C++
Raw Normal View History

2015-11-04 16:03:26 +03:00
/*
* KENLM.cpp
*
* Created on: 4 Nov 2015
* Author: hieu
*/
2015-11-04 17:54:20 +03:00
#include <vector>
2015-11-04 16:03:26 +03:00
#include "KENLM.h"
2015-11-04 17:54:20 +03:00
#include "../TargetPhrase.h"
#include "../Scores.h"
2015-11-04 16:09:53 +03:00
#include "../System.h"
2015-11-04 16:03:26 +03:00
#include "lm/state.hh"
2015-11-04 17:54:20 +03:00
#include "lm/left.hh"
2015-11-04 16:03:26 +03:00
#include "moses/FactorCollection.h"
2015-11-04 17:54:20 +03:00
using namespace std;
2015-11-04 16:03:26 +03:00
/**
 * Feature-function state for the KenLM language model.
 *
 * Wraps a single lm::ngram::State so that it can be hashed and compared
 * through the Moses::FFState virtual interface.
 */
struct KenLMState : public Moses::FFState {
  lm::ngram::State state;

  //! Hash of the wrapped KenLM state (delegates to hash_value()).
  virtual size_t hash() const {
    return hash_value(state);
  }

  //! Equality of the wrapped KenLM states.
  //! The argument is downcast with static_cast, i.e. without a runtime type
  //! check, so it must actually be a KenLMState.
  virtual bool operator==(const Moses::FFState& o) const {
    return state == static_cast<const KenLMState &>(o).state;
  }
};
/////////////////////////////////////////////////////////////////
/**
 * Vocabulary enumeration callback that fills a table mapping factor ids
 * (as assigned by the FactorCollection) to KenLM word indices.
 *
 * Both constructor arguments are held by reference, so they must outlive
 * this object.
 */
class MappingBuilder : public lm::EnumerateVocab
{
public:
  MappingBuilder(Moses::FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
    : m_factorCollection(factorCollection)
    , m_mapping(mapping)
  {}

  //! Register the word `str` in the factor collection and record `index`
  //! as its language-model id, growing the table when necessary.
  void Add(lm::WordIndex index, const StringPiece &str) {
    const std::size_t slot = m_factorCollection.AddFactor(str)->GetId();
    if (slot >= m_mapping.size()) {
      // Entries created by the resize are 0, and 0 is <unk>.
      m_mapping.resize(slot + 1);
    }
    m_mapping[slot] = index;
  }

private:
  Moses::FactorCollection &m_factorCollection;  // source of factor ids
  std::vector<lm::WordIndex> &m_mapping;        // factor id -> LM word index
};
/////////////////////////////////////////////////////////////////
/**
 * Construct the feature function.
 *
 * Both arguments are forwarded unchanged to StatefulFeatureFunction;
 * ReadParameters() is then invoked from the constructor body.
 */
KENLM::KENLM(size_t startInd, const std::string &line)
  : StatefulFeatureFunction(startInd, line)
{
  ReadParameters();
}
//! Destructor. The body is intentionally empty: nothing is released explicitly here.
KENLM::~KENLM()
{
}
/**
 * Load the n-gram model found at m_path.
 *
 * Registers the "<s>" and "</s>" factors in the system vocabulary, then
 * constructs the model with a MappingBuilder installed as the vocabulary
 * enumerator so that m_lmIdLookup gets filled in.
 */
void KENLM::Load(System &system)
{
  // Sentence boundary markers.
  Moses::FactorCollection &vocab = system.GetVocab();
  m_sos = vocab.AddFactor("<s>", false);
  m_eos = vocab.AddFactor("</s>", false);

  lm::ngram::Config config;
  config.messages = NULL;

  // The builder is a local and is only referenced by `config`, which is
  // itself a local; neither is used after the model has been constructed.
  MappingBuilder builder(system.GetVocab(), m_lmIdLookup);
  config.enumerate_vocab = &builder;
  //config.load_method = lazy ? util::LAZY : util::POPULATE_OR_READ;

  m_ngram.reset(new Model(m_path.c_str(), config));
}
/**
 * Handle one key/value configuration parameter.
 *
 *   "path"   - stored in m_path
 *   "factor" - parsed as a Moses::FactorType and stored in m_factorType
 *   "order"  - accepted and ignored
 *
 * Any other key is passed on to StatefulFeatureFunction::SetParameter().
 */
void KENLM::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "path") {
    m_path = value;
    return;
  }

  if (key == "factor") {
    m_factorType = Moses::Scan<Moses::FactorType>(value);
    return;
  }

  if (key == "order") {
    // don't need to store it
    return;
  }

  StatefulFeatureFunction::SetParameter(key, value);
}
//! Return the state associated with the empty hypothesis for a given sentence.
//! The result is a newly heap-allocated KenLMState holding the model's
//! begin-of-sentence state. Neither `mgr` nor `input` is consulted.
const Moses::FFState* KENLM::EmptyHypothesisState(const Manager &mgr, const PhraseImpl &input) const
{
  KenLMState *initial = new KenLMState();
  initial->state = m_ngram->BeginSentenceState();
  return initial;
}
void
KENLM::EvaluateInIsolation(const System &system,
const Phrase &source, const TargetPhrase &targetPhrase,
Scores &scores,
2015-11-04 17:54:20 +03:00
Scores *estimatedScores) const
2015-11-04 16:03:26 +03:00
{
2015-11-04 17:54:20 +03:00
// contains factors used by this LM
float fullScore, nGramScore;
size_t oovCount;
CalcScore(targetPhrase, fullScore, nGramScore, oovCount);
float estimateScore = fullScore - nGramScore;
bool GetLMEnableOOVFeature = false;
if (GetLMEnableOOVFeature) {
vector<float> scoresVec(2), estimateScoresVec(2);
scoresVec[0] = nGramScore;
scoresVec[1] = oovCount;
scores.Assign(system, *this, scoresVec);
estimateScoresVec[0] = estimateScore;
estimateScoresVec[1] = 0;
estimatedScores->Assign(system, *this, estimateScoresVec);
}
else {
scores.Assign(system, *this, nGramScore);
estimatedScores->Assign(system, *this, estimateScore);
}
2015-11-04 16:03:26 +03:00
}
/**
 * Produce the feature state for an extended hypothesis.
 *
 * NOTE(review): as written this looks like a placeholder - `mgr`, `hypo`,
 * `prevState` and `scores` are all left untouched, and the returned state is
 * always the model's begin-of-sentence state.
 *
 * The result is a newly heap-allocated KenLMState.
 */
Moses::FFState* KENLM::EvaluateWhenApplied(const Manager &mgr,
    const Hypothesis &hypo,
    const Moses::FFState &prevState,
    Scores &scores) const
{
  KenLMState *next = new KenLMState();
  next->state = m_ngram->BeginSentenceState();
  return next;
}
2015-11-04 17:54:20 +03:00
void KENLM::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, std::size_t &oovCount) const
{
fullScore = 0;
ngramScore = 0;
oovCount = 0;
if (!phrase.GetSize()) return;
lm::ngram::ChartState discarded_sadly;
lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
size_t position;
if (m_sos == phrase[0][m_factorType]) {
scorer.BeginSentence();
position = 1;
} else {
position = 0;
}
size_t ngramBoundary = m_ngram->Order() - 1;
size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
for (; position < end_loop; ++position) {
const Word &word = phrase[position];
lm::WordIndex index = TranslateID(word);
scorer.Terminal(index);
if (!index) ++oovCount;
}
float before_boundary = fullScore + scorer.Finish();
for (; position < phrase.GetSize(); ++position) {
const Word &word = phrase[position];
lm::WordIndex index = TranslateID(word);
scorer.Terminal(index);
if (!index) ++oovCount;
}
fullScore += scorer.Finish();
ngramScore = Moses::TransformLMScore(fullScore - before_boundary);
fullScore = Moses::TransformLMScore(fullScore);
}
//! Map a word to its language-model index through m_lmIdLookup, using the
//! factor selected by m_factorType. Factor ids that lie beyond the end of
//! the lookup table map to 0.
lm::WordIndex KENLM::TranslateID(const Word &word) const {
  const std::size_t factorId = word[m_factorType]->GetId();
  if (factorId < m_lmIdLookup.size()) {
    return m_lmIdLookup[factorId];
  }
  return 0;
}