mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-08 20:46:59 +03:00
312 lines
7.5 KiB
C++
312 lines
7.5 KiB
C++
/*
|
|
* LanguageModel.cpp
|
|
*
|
|
* Created on: 29 Oct 2015
|
|
* Author: hieu
|
|
*/
|
|
#include <vector>
|
|
#include "LanguageModel.h"
|
|
#include "../TargetPhrase.h"
|
|
#include "../System.h"
|
|
#include "../Search/Manager.h"
|
|
#include "../Search/Hypothesis.h"
|
|
#include "moses/Util.h"
|
|
#include "moses/InputFileStream.h"
|
|
#include "moses/LM/PointerState.h"
|
|
#include "moses/Bitmap.h"
|
|
|
|
using namespace std;
|
|
|
|
struct LMState : public Moses::PointerState
|
|
{
|
|
LMState()
|
|
:PointerState()
|
|
{
|
|
// uninitialised
|
|
}
|
|
|
|
void Set(MemPool &pool, void *lms, const std::vector<const Moses::Factor*> &context)
|
|
{
|
|
lmstate = lms;
|
|
|
|
numWords = context.size();
|
|
lastWords = (const Moses::Factor**) pool.Allocate(sizeof(const Moses::Factor*) * numWords);
|
|
for (size_t i = 0; i < numWords; ++i) {
|
|
lastWords[i] = context[i];
|
|
}
|
|
}
|
|
|
|
void Init(MemPool &pool, const Moses::Factor *factor)
|
|
{
|
|
lmstate = NULL;
|
|
numWords = 1;
|
|
lastWords = (const Moses::Factor**) pool.Allocate(sizeof(const Moses::Factor*));
|
|
lastWords[0] = factor;
|
|
}
|
|
|
|
size_t numWords;
|
|
const Moses::Factor** lastWords;
|
|
};
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////
|
|
LanguageModel::LanguageModel(size_t startInd, const std::string &line)
|
|
:StatefulFeatureFunction(startInd, line)
|
|
,m_oov(-100)
|
|
{
|
|
ReadParameters();
|
|
}
|
|
|
|
LanguageModel::~LanguageModel() {
|
|
// TODO Auto-generated destructor stub
|
|
}
|
|
|
|
void LanguageModel::Load(System &system)
|
|
{
|
|
Moses::FactorCollection &fc = system.vocab;
|
|
|
|
m_bos = fc.AddFactor("<s>", false);
|
|
m_eos = fc.AddFactor("</s>", false);
|
|
|
|
Moses::InputFileStream infile(m_path);
|
|
size_t lineNum = 0;
|
|
string line;
|
|
while (getline(infile, line)) {
|
|
if (++lineNum % 100000 == 0) {
|
|
cerr << lineNum << " ";
|
|
}
|
|
|
|
vector<string> substrings;
|
|
Moses::Tokenize(substrings, line, "\t");
|
|
|
|
if (substrings.size() < 2)
|
|
continue;
|
|
|
|
assert(substrings.size() == 2 || substrings.size() == 3);
|
|
|
|
SCORE prob = Moses::TransformLMScore(Moses::Scan<SCORE>(substrings[0]));
|
|
if (substrings[1] == "<unk>") {
|
|
m_oov = prob;
|
|
continue;
|
|
}
|
|
|
|
SCORE backoff = 0.f;
|
|
if (substrings.size() == 3) {
|
|
backoff = Moses::TransformLMScore(Moses::Scan<SCORE>(substrings[2]));
|
|
}
|
|
|
|
// ngram
|
|
vector<string> key;
|
|
Moses::Tokenize(key, substrings[1], " ");
|
|
|
|
vector<const Moses::Factor*> factorKey(key.size());
|
|
for (size_t i = 0; i < key.size(); ++i) {
|
|
factorKey[factorKey.size() - i - 1] = fc.AddFactor(key[i], false);
|
|
}
|
|
|
|
m_root.insert(factorKey, LMScores(prob, backoff));
|
|
}
|
|
|
|
}
|
|
|
|
void LanguageModel::SetParameter(const std::string& key, const std::string& value)
|
|
{
|
|
if (key == "path") {
|
|
m_path = value;
|
|
}
|
|
else if (key == "factor") {
|
|
m_factorType = Moses::Scan<Moses::FactorType>(value);
|
|
}
|
|
else if (key == "order") {
|
|
m_order = Moses::Scan<size_t>(value);
|
|
}
|
|
else {
|
|
StatefulFeatureFunction::SetParameter(key, value);
|
|
}
|
|
}
|
|
|
|
Moses::FFState* LanguageModel::BlankState(const Manager &mgr, const PhraseImpl &input) const
|
|
{
|
|
MemPool &pool = mgr.GetPool();
|
|
return new (pool.Allocate<LMState>()) LMState();
|
|
}
|
|
|
|
void LanguageModel::EmptyHypothesisState(Moses::FFState &state, const Manager &mgr, const PhraseImpl &input) const
|
|
{
|
|
LMState &stateCast = static_cast<LMState&>(state);
|
|
|
|
MemPool &pool = mgr.GetPool();
|
|
stateCast.Init(pool, m_bos);
|
|
}
|
|
|
|
void
|
|
LanguageModel::EvaluateInIsolation(const System &system,
|
|
const Phrase &source, const TargetPhrase &targetPhrase,
|
|
Scores &scores,
|
|
Scores *estimatedScores) const
|
|
{
|
|
if (targetPhrase.GetSize() == 0) {
|
|
return;
|
|
}
|
|
|
|
SCORE score = 0;
|
|
SCORE nonFullScore = 0;
|
|
vector<const Moses::Factor*> context;
|
|
// context.push_back(m_bos);
|
|
|
|
context.reserve(m_order);
|
|
for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
|
|
const Moses::Factor *factor = targetPhrase[i][m_factorType];
|
|
ShiftOrPush(context, factor);
|
|
|
|
if (context.size() == m_order) {
|
|
std::pair<SCORE, void*> fromScoring = Score(context);
|
|
score += fromScoring.first;
|
|
}
|
|
else if (estimatedScores) {
|
|
std::pair<SCORE, void*> fromScoring = Score(context);
|
|
nonFullScore += fromScoring.first;
|
|
}
|
|
}
|
|
|
|
scores.PlusEquals(system, *this, score);
|
|
if (estimatedScores) {
|
|
estimatedScores->PlusEquals(system, *this, nonFullScore);
|
|
}
|
|
}
|
|
|
|
void LanguageModel::EvaluateWhenApplied(const Manager &mgr,
|
|
const Hypothesis &hypo,
|
|
const Moses::FFState &prevState,
|
|
Scores &scores,
|
|
Moses::FFState &state) const
|
|
{
|
|
const LMState &prevLMState = static_cast<const LMState &>(prevState);
|
|
size_t numWords = prevLMState.numWords;
|
|
|
|
// context is held backwards
|
|
vector<const Moses::Factor*> context(numWords);
|
|
for (size_t i = 0; i < numWords; ++i) {
|
|
context[i] = prevLMState.lastWords[i];
|
|
}
|
|
//DebugContext(context);
|
|
|
|
SCORE score = 0;
|
|
std::pair<SCORE, void*> fromScoring;
|
|
const PhraseImpl &tp = hypo.GetTargetPhrase();
|
|
for (size_t i = 0; i < tp.GetSize(); ++i) {
|
|
const Word &word = tp[i];
|
|
const Moses::Factor *factor = word[m_factorType];
|
|
ShiftOrPush(context, factor);
|
|
fromScoring = Score(context);
|
|
score += fromScoring.first;
|
|
}
|
|
|
|
const Moses::Bitmap &bm = hypo.GetBitmap();
|
|
if (bm.IsComplete()) {
|
|
// everything translated
|
|
ShiftOrPush(context, m_eos);
|
|
fromScoring = Score(context);
|
|
score += fromScoring.first;
|
|
fromScoring.second = NULL;
|
|
context.clear();
|
|
}
|
|
else {
|
|
assert(context.size());
|
|
if (context.size() == m_order) {
|
|
context.resize(context.size() - 1);
|
|
}
|
|
}
|
|
|
|
scores.PlusEquals(mgr.GetSystem(), *this, score);
|
|
|
|
// return state
|
|
//DebugContext(context);
|
|
|
|
LMState &stateCast = static_cast<LMState&>(state);
|
|
MemPool &pool = mgr.GetPool();
|
|
stateCast.Set(pool, fromScoring.second, context);
|
|
}
|
|
|
|
void LanguageModel::ShiftOrPush(std::vector<const Moses::Factor*> &context, const Moses::Factor *factor) const
|
|
{
|
|
if (context.size() < m_order) {
|
|
context.resize(context.size() + 1);
|
|
}
|
|
assert(context.size());
|
|
|
|
for (size_t i = context.size() - 1; i > 0; --i) {
|
|
context[i] = context[i - 1];
|
|
}
|
|
|
|
context[0] = factor;
|
|
}
|
|
|
|
std::pair<SCORE, void*> LanguageModel::Score(const std::vector<const Moses::Factor*> &context) const
|
|
{
|
|
//cerr << "context=";
|
|
//DebugContext(context);
|
|
|
|
std::pair<SCORE, void*> ret;
|
|
|
|
typedef Node<const Moses::Factor*, LMScores> LMNode;
|
|
const LMNode *node = m_root.getNode(context);
|
|
if (node) {
|
|
ret.first = node->getValue().prob;
|
|
ret.second = (void*) node;
|
|
}
|
|
else {
|
|
SCORE backoff = 0;
|
|
std::vector<const Moses::Factor*> backOffContext(context.begin() + 1, context.end());
|
|
node = m_root.getNode(backOffContext);
|
|
if (node) {
|
|
backoff = node->getValue().backoff;
|
|
}
|
|
|
|
std::vector<const Moses::Factor*> newContext(context.begin(), context.end() - 1);
|
|
std::pair<SCORE, void*> newRet = Score(newContext);
|
|
|
|
ret.first = backoff + newRet.first;
|
|
ret.second = newRet.second;
|
|
}
|
|
|
|
//cerr << "score=" << ret.first << endl;
|
|
return ret;
|
|
}
|
|
|
|
SCORE LanguageModel::BackoffScore(const std::vector<const Moses::Factor*> &context) const
|
|
{
|
|
//cerr << "backoff=";
|
|
//DebugContext(context);
|
|
|
|
SCORE ret;
|
|
size_t stoppedAtInd;
|
|
const Node<const Moses::Factor*, LMScores> &node = m_root.getNode(context, stoppedAtInd);
|
|
|
|
if (stoppedAtInd == context.size()) {
|
|
// found entire ngram
|
|
ret = node.getValue().backoff;
|
|
}
|
|
else {
|
|
if (stoppedAtInd == 0) {
|
|
ret = m_oov;
|
|
stoppedAtInd = 1;
|
|
}
|
|
else {
|
|
ret = node.getValue().backoff;
|
|
}
|
|
|
|
// recursive
|
|
std::vector<const Moses::Factor*> backoff(context.begin() + stoppedAtInd, context.end());
|
|
ret += BackoffScore(backoff);
|
|
}
|
|
|
|
}
|
|
|
|
void LanguageModel::DebugContext(const std::vector<const Moses::Factor*> &context) const
|
|
{
|
|
for (size_t i = 0; i < context.size(); ++i) {
|
|
cerr << context[i]->ToString() << " ";
|
|
}
|
|
cerr << endl;
|
|
}
|