// $Id$

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/

#include "lm/binary_format.hh"
#include "lm/enumerate_vocab.hh"
#include "lm/left.hh"
#include "lm/model.hh"

#include "moses/FF/FFState.h"
#include "moses/Hypothesis.h"
#include "moses/Phrase.h"

#include "moses/LM/Ken.h"
#include "moses/LM/Backward.h"

//#include "moses/Util.h"
//#include "moses/StaticData.h"
//#include <iostream>

namespace Moses
{

/** Constructs a new backward language model. */
template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line,file,factorType,lazy)
{
  //
  // This space intentionally left blank
  //
}

/**
 * Constructs an empty backward language model state.
 *
 * This state corresponds to a translation hypothesis
 * in which no source words have been translated.
 *
 * In a forward language model, the language model state of an empty hypothesis
 * would store the beginning of sentence marker <s>.
 *
 * Because this is a backward language model, the language model state returned by this method
 * instead stores the end of sentence marker </s>.
 */
template <class Model> const FFState *BackwardLanguageModel<Model>::EmptyHypothesisState(const InputType &/*input*/) const
{
  BackwardLMState *ret = new BackwardLMState();
  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, ret->state);
  ruleScore.Terminal(m_ngram->GetVocabulary().EndSentence());
  // float score =
  ruleScore.Finish();
  // VERBOSE(1, "BackwardLM EmptyHypothesisState has score " << score);
  return ret;
}
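
// A minimal sketch of the contrast described above: a forward KenLM feature
// would instead seed its empty-hypothesis state with the begin-of-sentence
// marker, e.g. (assuming the same KenLM vocabulary API):
//
//   ruleScore.Terminal(m_ngram->GetVocabulary().BeginSentence());
//
// The backward model seeds the state with </s> because the target sentence
// is scored from its end toward its beginning.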

/*
template <class Model> double BackwardLanguageModel<Model>::Score(FFState *ffState) {
  BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
  lm::ngram::ChartState &state = lmState->state;
  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
  return ruleScore.Finish();
}
*/
/**
 * Pre-calculate the n-gram probabilities for the words in the specified phrase.
 *
 * Note that when this method is called, we do not have access to the context
 * in which this phrase will eventually be applied.
 *
 * In other words, we know what words are in this phrase,
 * but we do not know what words will come before or after this phrase.
 *
 * The parameters fullScore, ngramScore, and oovCount are all output parameters.
 *
 * The value stored in oovCount is the number of words in the phrase
 * that are not in the language model's vocabulary.
 *
 * The sum of the ngram scores for all words in this phrase is stored in fullScore.
 *
 * The value stored in ngramScore is similar, but only full-order ngram scores are included.
 *
 * This is best shown by example:
 *
 * Assume a trigram backward language model and a phrase "a b c d e f g"
 *
 * fullScore would represent the sum of the logprob scores for the following values:
 *
 * p(g)
 * p(f | g)
 * p(e | g f)
 * p(d | f e)
 * p(c | e d)
 * p(b | d c)
 * p(a | c b)
 *
 * ngramScore would represent the sum of the logprob scores for the following values:
 *
 * p(e | g f)
 * p(d | f e)
 * p(c | e d)
 * p(b | d c)
 * p(a | c b)
 */
template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  UTIL_THROW_IF(
    (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
    util::Exception,
    "BackwardLanguageModel does not currently support rules that include <s>"
  );
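
  // Record the scorer's running total at the boundary position below;
  // the difference between the final total and this partial total becomes
  // ngramScore, while fullScore keeps the total for the whole phrase.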
  float before_boundary = 0.0f;

  int lastWord = phrase.GetSize() - 1;
  int ngramBoundary = m_ngram->Order() - 1;
  int boundary = ( lastWord < ngramBoundary ) ? 0 : ngramBoundary;

  int position;
  for (position = lastWord; position >= 0; position-=1) {
    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
      (word.IsNonTerminal()),
      util::Exception,
      "BackwardLanguageModel does not currently support rules that include non-terminals "
    );

    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;

    if (position==boundary) {
      before_boundary = scorer.Finish();
    }
  }

  fullScore = scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);
}
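
/*
 * Hypothetical usage sketch (callers in the decoder normally reach CalcScore
 * through the LanguageModel interface while target phrases are loaded, rather
 * than calling it directly). Assuming `backwardLM` is a constructed
 * BackwardLanguageModel and `phrase` holds "a b c d e f g":
 *
 *   float fullScore, ngramScore;
 *   size_t oovCount;
 *   backwardLM.CalcScore(phrase, fullScore, ngramScore, oovCount);
 *   // fullScore  : transformed sum of all seven logprobs listed above
 *   // ngramScore : transformed sum of the full-order subset
 *   // oovCount   : how many of a..g are out of vocabulary
 */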

/**
 * Calculate the ngram probabilities for the words at the beginning
 * (and under some circumstances, also at the end)
 * of the phrase represented by the provided hypothesis.
 *
 * Additionally, calculate a new language model state.
 *
 * This is best shown by example:
 *
 * Assume a trigram language model.
 *
 * Assume the previous phrase was "a b c d e f g",
 * which means the previous language model state is "g f".
 *
 * When the phrase corresponding to "a b c d e f g" was previously processed by CalcScore
 * the following full-order ngrams would have been calculated:
 *
 * p(a | c b)
 * p(b | d c)
 * p(c | e d)
 * p(d | f e)
 * p(e | g f)
 *
 * The following less-than-full-order ngrams would also have been calculated by CalcScore:
 *
 * p(f | g)
 * p(g)
 *
 * In this method, we now have access to additional context which may allow
 * us to compute the full-order ngrams for f and g.
 *
 * Assume the new provided hypothesis contains the new phrase "h i j k"
 *
 * Given these assumptions, this method is responsible
 * for calculating the scores for the following:
 *
 * p(f | h g)
 * p(g | i h)
 *
 * This method must also calculate and return a new language model state.
 *
 * In this example, the returned language model state would be "k j"
 *
 * If the provided hypothesis represents the end of a completed translation
 * (all source words have been translated)
 * then this method is additionally responsible for calculating the following:
 *
 * p(j | <s> k)
 * p(k | <s>)
 */
template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
  // If the current hypothesis contains zero target words
  if (!hypo.GetCurrTargetLength()) {

    // copy and return the previous state
    std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
    ret->state = static_cast<const BackwardLMState&>(*ps).state;
    return ret.release();

  } else {

    float returnedScore;

    FFState *returnedState = this->Evaluate(hypo.GetCurrTargetPhrase(), ps, returnedScore);

    out->PlusEquals(this, returnedScore);

    return returnedState;

  }
}

template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const
{
  returnedScore = 0.0f;

  const lm::ngram::ChartState &previous = static_cast<const BackwardLMState&>(*ps).state;

  std::auto_ptr<BackwardLMState> ret(new BackwardLMState());

  lm::ngram::RuleScore<Model> scorer(*m_ngram, ret->state);

  int ngramBoundary = m_ngram->Order() - 1;
  int lastWord = phrase.GetSize() - 1;

  // Get scores for words at the end of the previous phrase
  // that are now adjacent to words at the beginning of this phrase
  for (int position = std::min(lastWord, ngramBoundary - 1); position >= 0; position -= 1) {
    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
      (word.IsNonTerminal()),
      util::Exception,
      "BackwardLanguageModel does not currently support rules that include non-terminals "
    );

    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
  }
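
  // Attaching the previous hypothesis' state as a KenLM non-terminal lets the
  // scorer revise that state's provisional boundary scores, now that the
  // words fed above supply the context that was missing when the previous
  // phrase was scored.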
  scorer.NonTerminal(previous);
  returnedScore = scorer.Finish();
  /*
  out->PlusEquals(this, score);

  UTIL_THROW_IF(
    (1==1),
    util::Exception,
    "This method (BackwardLanguageModel<Model>::Evaluate) is not yet fully implemented"
  );
  */
  return ret.release();
}
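
/*
 * Informal walk-through of Evaluate above (trigram model, following the
 * example in the comment before Evaluate(const Hypothesis &, ...)): with a
 * previous phrase "a b c d e f g" and a current phrase "h i j k", the loop
 * feeds the first order-1 words of the current phrase ("i", then "h") as
 * terminals; scorer.NonTerminal(previous) then lets KenLM complete the
 * boundary scores p(f | h g) and p(g | i h) that CalcScore could only
 * estimate without cross-phrase context.
 */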

LanguageModel *ConstructBackwardLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
{
  try {
    lm::ngram::ModelType model_type;
    if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
      switch(model_type) {
      case lm::ngram::PROBING:
        return new BackwardLanguageModel<lm::ngram::ProbingModel>(line, file, factorType, lazy);
      case lm::ngram::REST_PROBING:
        return new BackwardLanguageModel<lm::ngram::RestProbingModel>(line, file, factorType, lazy);
      case lm::ngram::TRIE:
        return new BackwardLanguageModel<lm::ngram::TrieModel>(line, file, factorType, lazy);
      case lm::ngram::QUANT_TRIE:
        return new BackwardLanguageModel<lm::ngram::QuantTrieModel>(line, file, factorType, lazy);
      case lm::ngram::ARRAY_TRIE:
        return new BackwardLanguageModel<lm::ngram::ArrayTrieModel>(line, file, factorType, lazy);
      case lm::ngram::QUANT_ARRAY_TRIE:
        return new BackwardLanguageModel<lm::ngram::QuantArrayTrieModel>(line, file, factorType, lazy);
      default:
        std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
        abort();
      }
    } else {
      return new BackwardLanguageModel<lm::ngram::ProbingModel>(line, file, factorType, lazy);
    }
  } catch (std::exception &e) {
    std::cerr << e.what() << std::endl;
    abort();
  }
}
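
/*
 * Note on the dispatch above: RecognizeBinary inspects the file header to
 * determine which KenLM binary layout was used when the model was built; a
 * plain-text (ARPA) model matches no binary type and falls through to the
 * default ProbingModel. A hypothetical direct call (in practice construction
 * is driven by Moses' feature-function configuration) might look like:
 *
 *   LanguageModel *lm = ConstructBackwardLM(line, "backward.blm", 0, false);
 */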

} // namespace Moses