2008-06-11 14:52:57 +04:00
|
|
|
// $Id$
|
|
|
|
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
2011-11-18 16:07:41 +04:00
|
|
|
#include "util/check.hh"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include <limits>
|
|
|
|
#include <iostream>
|
|
|
|
#include <fstream>
|
|
|
|
|
2012-11-14 17:43:04 +04:00
|
|
|
#include "SRI.h"
|
|
|
|
#include "moses/TypeDef.h"
|
|
|
|
#include "moses/Util.h"
|
|
|
|
#include "moses/FactorCollection.h"
|
|
|
|
#include "moses/Phrase.h"
|
|
|
|
#include "moses/StaticData.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-11-18 17:40:20 +04:00
|
|
|
#include "Vocab.h"
|
|
|
|
#include "Ngram.h"
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
namespace Moses
|
|
|
|
{
|
2010-11-17 17:06:21 +03:00
|
|
|
/**
 * Construct a language model wrapper that does not yet own an SRILM
 * vocabulary or n-gram model; both pointers start out null.
 */
LanguageModelSRI::LanguageModelSRI()
  : m_srilmVocab(NULL)
  , m_srilmModel(NULL)
{}
|
|
|
|
|
|
|
|
/**
 * Free the owned SRILM objects: first the n-gram model, then the vocabulary.
 */
LanguageModelSRI::~LanguageModelSRI()
{
  // model first, vocabulary second
  delete m_srilmModel;
  delete m_srilmVocab;
}
|
|
|
|
|
|
|
|
bool LanguageModelSRI::Load(const std::string &filePath
|
2011-02-24 16:14:42 +03:00
|
|
|
, FactorType factorType
|
|
|
|
, size_t nGramOrder)
|
2008-06-11 14:52:57 +04:00
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
m_srilmVocab = new ::Vocab();
|
2008-06-11 14:52:57 +04:00
|
|
|
m_srilmModel = new Ngram(*m_srilmVocab, nGramOrder);
|
2011-02-24 16:14:42 +03:00
|
|
|
m_factorType = factorType;
|
|
|
|
m_nGramOrder = nGramOrder;
|
|
|
|
m_filePath = filePath;
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
m_srilmModel->skipOOVs() = false;
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
File file( filePath.c_str(), "r" );
|
|
|
|
m_srilmModel->read(file);
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// LM can be ok, just outputs warnings
|
|
|
|
CreateFactors();
|
2008-06-11 14:52:57 +04:00
|
|
|
m_unknownId = m_srilmVocab->unkIndex();
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void LanguageModelSRI::CreateFactors()
|
2011-02-24 16:14:42 +03:00
|
|
|
{
|
|
|
|
// add factors which have srilm id
|
|
|
|
FactorCollection &factorCollection = FactorCollection::Instance();
|
|
|
|
|
|
|
|
std::map<size_t, VocabIndex> lmIdMap;
|
|
|
|
size_t maxFactorId = 0; // to create lookup vector later on
|
|
|
|
|
|
|
|
VocabString str;
|
|
|
|
VocabIter iter(*m_srilmVocab);
|
|
|
|
while ( (str = iter.next()) != NULL) {
|
|
|
|
VocabIndex lmId = GetLmID(str);
|
|
|
|
size_t factorId = factorCollection.AddFactor(Output, m_factorType, str)->GetId();
|
|
|
|
lmIdMap[factorId] = lmId;
|
|
|
|
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t factorId;
|
|
|
|
|
|
|
|
m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
|
|
|
|
factorId = m_sentenceStart->GetId();
|
|
|
|
lmIdMap[factorId] = GetLmID(BOS_);
|
|
|
|
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
|
|
|
m_sentenceStartArray[m_factorType] = m_sentenceStart;
|
|
|
|
|
|
|
|
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
|
|
|
|
factorId = m_sentenceEnd->GetId();
|
|
|
|
lmIdMap[factorId] = GetLmID(EOS_);
|
|
|
|
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
|
|
|
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
|
|
|
|
|
|
|
|
// add to lookup vector in object
|
|
|
|
m_lmIdLookup.resize(maxFactorId+1);
|
|
|
|
|
|
|
|
fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);
|
|
|
|
|
|
|
|
map<size_t, VocabIndex>::iterator iterMap;
|
|
|
|
for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap) {
|
|
|
|
m_lmIdLookup[iterMap->first] = iterMap->second;
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Look a word up in the SRILM vocabulary by its surface string.
 *
 * @param str word to look up
 * @return the value of Vocab::getIndex() for the word, called with
 *         m_unknownId as its second argument
 */
VocabIndex LanguageModelSRI::GetLmID( const std::string &str ) const
{
  const char *word = str.c_str();
  return m_srilmVocab->getIndex(word, m_unknownId);
}
|
|
|
|
/**
 * Translate a factor into its SRILM vocabulary index via m_lmIdLookup.
 *
 * @param factor factor to translate; dereferenced unconditionally
 * @return the lookup entry for the factor's id, or m_unknownId when the id
 *         lies beyond the end of the lookup vector
 */
VocabIndex LanguageModelSRI::GetLmID( const Factor *factor ) const
{
  const size_t id = factor->GetId();
  if (id < m_lmIdLookup.size()) {
    return m_lmIdLookup[id];
  }
  return m_unknownId;
}
|
|
|
|
|
2011-03-08 02:21:09 +03:00
|
|
|
/**
 * Score a single word given an already prepared context array.
 *
 * @param wordId  SRILM vocabulary index of the word to score
 * @param context context array handed unchanged to Ngram::wordProb()
 * @return result whose score is the model's wordProb() value passed through
 *         TransformLMScore() and FloorScore(), and whose unknown flag is set
 *         when wordId equals m_unknownId
 */
LMResult LanguageModelSRI::GetValue(VocabIndex wordId, VocabIndex *context) const
{
  LMResult result;
  result.unknown = (m_unknownId == wordId);
  result.score = FloorScore(
                   TransformLMScore(
                     m_srilmModel->wordProb(wordId, context)));
  return result;
}
|
|
|
|
|
2011-03-08 02:21:09 +03:00
|
|
|
/**
 * Score the last word of contextFactor given the words that precede it.
 *
 * @param contextFactor words of the n-gram; the final entry is the word being
 *                      scored and the earlier entries form its context
 * @param finalState    optional output; when non-NULL it receives the value
 *                      returned by the model's contextID() for the scored word
 *                      followed by its context, or NULL for an empty input
 * @return the result of GetValue(VocabIndex, VocabIndex*) for the scored word;
 *         score 0.0 and unknown == false when contextFactor is empty
 */
LMResult LanguageModelSRI::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult result;
  const FactorType factorType = GetFactorType();
  const size_t numWords = contextFactor.size();

  // nothing to score
  if (numWords == 0) {
    if (finalState != NULL) {
      *finalState = NULL;
    }
    result.score = 0.0;
    result.unknown = false;
    return result;
  }

  // Buffer layout: slot 0 is reserved for the scored word (written only when
  // the state is requested), slots 1..numWords-1 hold the context with the
  // word closest to the scored one first, and the last slot is Vocab_None.
  // NOTE: runtime-sized array, i.e. relies on the compiler's VLA extension.
  VocabIndex ngram[numWords + 1];
  for (size_t slot = 1; slot < numWords; ++slot) {
    const Word &contextWord = *contextFactor[numWords - 1 - slot];
    ngram[slot] = GetLmID(contextWord[factorType]);
  }
  ngram[numWords] = Vocab_None;

  const Word &scoredWord = *contextFactor[numWords - 1];
  CHECK(scoredWord[factorType] != NULL);

  // query the SRILM model; the context starts one slot past the reserved one
  const VocabIndex lmId = GetLmID(scoredWord[factorType]);
  result = GetValue(lmId, ngram + 1);

  if (finalState != NULL) {
    // prepend the scored word so the state covers word + context
    ngram[0] = lmId;
    unsigned int dummy;
    *finalState = m_srilmModel->contextID(ngram, dummy);
  }
  return result;
}
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
|