/*
 * KENLM.cpp
 *
 *  Created on: 4 Nov 2015
 *      Author: hieu
 */
#include <vector>

#include "KENLM.h"
#include "../TargetPhrase.h"
#include "../Scores.h"
#include "../System.h"
#include "../Search/Hypothesis.h"
#include "../Search/Manager.h"
#include "lm/state.hh"
#include "lm/left.hh"
#include "moses/FactorCollection.h"

using namespace std;

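// Decoder-side LM state. Wraps lm::ngram::State in a Moses::FFState; hash() and
// operator== let the search detect hypotheses with identical LM context (e.g. for
// recombination).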
struct KenLMState : public Moses::FFState {
  lm::ngram::State state;

  virtual size_t hash() const {
    size_t ret = hash_value(state);
    return ret;
  }

  virtual bool operator==(const Moses::FFState& o) const {
    const KenLMState &other = static_cast<const KenLMState &>(o);
    bool ret = state == other.state;
    return ret;
  }
};

/////////////////////////////////////////////////////////////////
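// Bridge between the Moses vocabulary and KenLM's. While the model is loaded, KenLM
// calls Add() for every word in the LM vocabulary; we store the KenLM word id under
// the id of the corresponding Moses factor so TranslateID() can map factors to LM ids.
// Unseen factors stay at 0, which is KenLM's <unk>.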
class MappingBuilder : public lm::EnumerateVocab
{
public:
  MappingBuilder(Moses::FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
    : m_factorCollection(factorCollection), m_mapping(mapping) {}

  void Add(lm::WordIndex index, const StringPiece &str) {
    std::size_t factorId = m_factorCollection.AddFactor(str)->GetId();
    if (m_mapping.size() <= factorId) {
      // 0 is <unk> :-)
      m_mapping.resize(factorId + 1);
    }
    m_mapping[factorId] = index;
  }

private:
  Moses::FactorCollection &m_factorCollection;
  std::vector<lm::WordIndex> &m_mapping;
};

/////////////////////////////////////////////////////////////////
KENLM::KENLM(size_t startInd, const std::string &line)
:StatefulFeatureFunction(startInd, line)
,m_lazy(false)
{
  ReadParameters();
}

KENLM::~KENLM()
{
  // TODO Auto-generated destructor stub
}

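// Load the KenLM model from m_path. Registering the MappingBuilder on the config fills
// m_lmIdLookup (factor id -> LM word id) as a side effect of loading. "lazyken" chooses
// between lazily mmap-ing the file (util::LAZY) and reading it up front
// (util::POPULATE_OR_READ).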
void KENLM::Load(System &system)
{
  Moses::FactorCollection &fc = system.vocab;

  m_bos = fc.AddFactor("<s>", false);
  m_eos = fc.AddFactor("</s>", false);

  lm::ngram::Config config;
  config.messages = NULL;

  Moses::FactorCollection &collection = system.vocab;
  MappingBuilder builder(collection, m_lmIdLookup);
  config.enumerate_vocab = &builder;
  config.load_method = m_lazy ? util::LAZY : util::POPULATE_OR_READ;

  m_ngram.reset(new Model(m_path.c_str(), config));
}

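// Options accepted on the feature line:
//   path    - LM file (ARPA or KenLM binary)
//   factor  - Moses factor this LM scores
//   lazyken - lazily mmap the model instead of loading it up front
//   order   - accepted but not stored; the model itself knows its order
// Everything else is handled by StatefulFeatureFunction.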
void KENLM::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "path") {
    m_path = value;
  }
  else if (key == "factor") {
    m_factorType = Moses::Scan<Moses::FactorType>(value);
  }
  else if (key == "lazyken") {
    m_lazy = Moses::Scan<bool>(value);
  }
  else if (key == "order") {
    // don't need to store it
  }
  else {
    StatefulFeatureFunction::SetParameter(key, value);
  }
}

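// LM states are placement-new'd into the manager's memory pool, so they are reclaimed
// with the pool rather than deleted individually.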
Moses::FFState* KENLM::BlankState(const Manager &mgr, const PhraseImpl &input) const
{
  MemPool &pool = mgr.GetPool();
  KenLMState *ret = new (pool.Allocate<KenLMState>()) KenLMState();
  return ret;
}

//! return the state associated with the empty hypothesis for a given sentence
void KENLM::EmptyHypothesisState(Moses::FFState &state, const Manager &mgr, const PhraseImpl &input) const
{
  KenLMState &stateCast = static_cast<KenLMState&>(state);
  stateCast.state = m_ngram->BeginSentenceState();
}

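// Phrase-table-time scoring, with no context from neighbouring hypotheses. CalcScore()
// splits the phrase score into the part from n-grams that are complete inside the
// phrase (the real score) and the remainder, which is only booked as an estimate.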
void
KENLM::EvaluateInIsolation(const System &system,
    const Phrase &source, const TargetPhrase &targetPhrase,
    Scores &scores,
    Scores *estimatedScores) const
{
  // contains factors used by this LM
  float fullScore, nGramScore;
  size_t oovCount;

  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

  float estimateScore = fullScore - nGramScore;

  bool GetLMEnableOOVFeature = false;
  if (GetLMEnableOOVFeature) {
    vector<float> scoresVec(2), estimateScoresVec(2);
    scoresVec[0] = nGramScore;
    scoresVec[1] = oovCount;
    scores.Assign(system, *this, scoresVec);

    estimateScoresVec[0] = estimateScore;
    estimateScoresVec[1] = 0;
    estimatedScores->Assign(system, *this, estimateScoresVec);
  }
  else {
    scores.Assign(system, *this, nGramScore);
    estimatedScores->Assign(system, *this, estimateScore);
  }
}

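// Search-time scoring of a hypothesis extension. The newly added target words are
// scored left to right, threading the KenLM context through state0/state1 so the final
// context lands in stateCast.state. Sentence-final hypotheses also score </s>; phrases
// longer than the LM order recompute the out-state from the last n-1 words (LastIDs).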
void KENLM::EvaluateWhenApplied(const Manager &mgr,
    const Hypothesis &hypo,
    const Moses::FFState &prevState,
    Scores &scores,
    Moses::FFState &state) const
{
  KenLMState &stateCast = static_cast<KenLMState&>(state);

  const System &system = mgr.system;

  const lm::ngram::State &in_state = static_cast<const KenLMState&>(prevState).state;

  if (!hypo.GetTargetPhrase().GetSize()) {
    stateCast.state = in_state;
    return;
  }

  const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
  //[begin, end) in STL-like fashion.
  const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);

  std::size_t position = begin;
  typename Model::State aux_state;
  typename Model::State *state0 = &stateCast.state, *state1 = &aux_state;

  float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)), *state0);
  ++position;
  for (; position < adjust_end; ++position) {
    score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)), *state1);
    std::swap(state0, state1);
  }

  if (hypo.GetBitmap().IsComplete()) {
    // Score end of sentence.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob;
  } else if (adjust_end < end) {
    // Get state after adding a long phrase.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    m_ngram->GetState(&indices.front(), last, stateCast.state);
  } else if (state0 != &stateCast.state) {
    // Short enough phrase that we can just reuse the state.
    stateCast.state = *state0;
  }

  score = Moses::TransformLMScore(score);

  bool OOVFeatureEnabled = false;
  if (OOVFeatureEnabled) {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(system, *this, scoresVec);
  } else {
    scores.PlusEquals(system, *this, score);
  }
}

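// Score a phrase on its own. fullScore covers every word; ngramScore only the words
// whose full n-gram history lies inside the phrase (roughly, everything after the
// first order-1 words). Both are run through Moses::TransformLMScore() to match the
// decoder's internal log scale.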
void KENLM::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, std::size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  size_t position;
  if (m_bos == phrase[0][m_factorType]) {
    scorer.BeginSentence();
    position = 1;
  } else {
    position = 0;
  }

  size_t ngramBoundary = m_ngram->Order() - 1;

  size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
  for (; position < end_loop; ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;
  }
  float before_boundary = fullScore + scorer.Finish();
  for (; position < phrase.GetSize(); ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;
  }
  fullScore += scorer.Finish();

  ngramScore = Moses::TransformLMScore(fullScore - before_boundary);
  fullScore = Moses::TransformLMScore(fullScore);
}

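// Map a Moses factor to a KenLM word id; anything the LM has never seen maps to 0 (<unk>).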
lm::WordIndex KENLM::TranslateID(const Word &word) const {
  std::size_t factor = word[m_factorType]->GetId();
  return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
}

// Convert last words of hypothesis into vocab ids, returning an end pointer.
lm::WordIndex *KENLM::LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
  lm::WordIndex *index = indices;
  lm::WordIndex *end = indices + m_ngram->Order() - 1;
  int position = hypo.GetCurrTargetWordsRange().GetEndPos();
  for (; ; ++index, --position) {
    if (index == end) return index;
    if (position == -1) {
      *index = m_ngram->GetVocabulary().BeginSentence();
      return index + 1;
    }
    *index = TranslateID(hypo.GetWord(position));
  }

  // Never reached; the loop above always returns. Kept to silence missing-return warnings.
  return NULL;
}