2008-11-04 21:03:03 +03:00
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
2013-09-27 12:35:24 +04:00
|
|
|
// This file should be compiled only when the LM_RAND flag is enabled.
|
2013-08-29 20:56:25 +04:00
|
|
|
//
|
2013-09-27 12:35:24 +04:00
|
|
|
// The following ifdef prevents XCode and other non-bjam build systems
|
2013-08-29 20:56:25 +04:00
|
|
|
// from attempting to compile this file when LM_RAND is disabled.
|
|
|
|
//
|
|
|
|
|
2008-11-04 21:03:03 +03:00
|
|
|
#include <limits>
|
|
|
|
#include <iostream>
|
|
|
|
#include <fstream>
|
2011-11-18 19:40:56 +04:00
|
|
|
|
2012-11-14 18:18:53 +04:00
|
|
|
#include "Rand.h"
|
|
|
|
#include "moses/Factor.h"
|
|
|
|
#include "moses/Util.h"
|
|
|
|
#include "moses/FactorCollection.h"
|
|
|
|
#include "moses/Phrase.h"
|
|
|
|
#include "moses/InputFileStream.h"
|
|
|
|
#include "moses/StaticData.h"
|
2013-07-23 15:56:35 +04:00
|
|
|
#include "RandLM.h"
|
2008-11-04 21:03:03 +03:00
|
|
|
|
2013-07-20 03:19:04 +04:00
|
|
|
using namespace std;
|
2011-11-18 16:07:41 +04:00
|
|
|
|
2008-11-04 21:03:03 +03:00
|
|
|
namespace Moses
|
|
|
|
{
|
|
|
|
|
2013-07-20 03:19:04 +04:00
|
|
|
// Construct the feature from its configuration line.
// The line is forwarded unchanged to LanguageModelSingleFactor; the RandLM
// handle starts out null and is only created later by Load().
LanguageModelRandLM::LanguageModelRandLM(const std::string &line)
  : LanguageModelSingleFactor(line),
    m_lm(NULL)
{
}
|
|
|
|
|
2013-07-25 18:23:05 +04:00
|
|
|
// Release the RandLM handle obtained in Load().
// m_lm is still null if Load() was never called; deleting null is a no-op.
LanguageModelRandLM::~LanguageModelRandLM()
{
  delete m_lm;
}
|
|
|
|
|
|
|
|
void LanguageModelRandLM::Load()
|
2011-02-24 16:14:42 +03:00
|
|
|
{
|
2008-11-04 21:03:03 +03:00
|
|
|
cerr << "Loading LanguageModelRandLM..." << endl;
|
|
|
|
FactorCollection &factorCollection = FactorCollection::Instance();
|
|
|
|
int cache_MB = 50; // increase cache size
|
2013-07-20 03:19:04 +04:00
|
|
|
m_lm = randlm::RandLM::initRandLM(m_filePath, m_nGramOrder, cache_MB);
|
2013-11-23 00:27:46 +04:00
|
|
|
UTIL_THROW_IF2(m_lm == NULL, "RandLM object not created");
|
2008-11-04 21:03:03 +03:00
|
|
|
// get special word ids
|
|
|
|
m_oov_id = m_lm->getWordID(m_lm->getOOV());
|
|
|
|
CreateFactors(factorCollection);
|
2011-03-17 00:04:34 +03:00
|
|
|
m_lm->initThreadSpecificData();
|
2008-11-04 21:03:03 +03:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Register every RandLM vocabulary word as an output factor and build
// m_randlm_ids_vec, the lookup table from factor id to RandLM word id.
//
// Factor ids that have no RandLM counterpart are mapped to m_oov_id, so
// m_oov_id must already be set when this is called. Also creates the
// sentence-start and sentence-end factors and stores them in
// m_sentenceStart / m_sentenceEnd and the corresponding words.
//
// (Logic mirrors the SRI LM class; a shared template would be preferable.)
void LanguageModelRandLM::CreateFactors(FactorCollection &factorCollection)
{
  typedef std::map<randlm::Word, randlm::WordID>::const_iterator VocabIterator;
  typedef std::map<size_t, randlm::WordID> FactorIdToLmId;

  // Pass 1: collect factor id -> RandLM id pairs. The final table size is not
  // known until all ids have been seen, hence the temporary map.
  FactorIdToLmId lmIdByFactorId;
  size_t highestFactorId = 0;

  for (VocabIterator entry = m_lm->vocabStart(); entry != m_lm->vocabEnd(); ++entry) {
    // Look up (or create) the factor for this vocabulary word.
    const size_t id = factorCollection.AddFactor(Output, m_factorType, entry->first)->GetId();
    lmIdByFactorId[id] = entry->second;
    if (id > highestFactorId) {
      highestFactorId = id;
    }
  }

  // Sentence-start marker.
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, m_lm->getBOS());
  const size_t bosFactorId = m_sentenceStart->GetId();
  if (bosFactorId > highestFactorId) {
    highestFactorId = bosFactorId;
  }
  m_sentenceStartWord[m_factorType] = m_sentenceStart;

  // Sentence-end marker.
  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, m_lm->getEOS());
  const size_t eosFactorId = m_sentenceEnd->GetId();
  if (eosFactorId > highestFactorId) {
    highestFactorId = eosFactorId;
  }
  m_sentenceEndWord[m_factorType] = m_sentenceEnd;

  // Pass 2: size the lookup vector, default every slot to OOV, then overwrite
  // the slots for which a RandLM id is known.
  m_randlm_ids_vec.assign(highestFactorId + 1, m_oov_id);
  for (FactorIdToLmId::const_iterator pair = lmIdByFactorId.begin();
       pair != lmIdByFactorId.end(); ++pair) {
    m_randlm_ids_vec[pair->first] = pair->second;
  }
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Return the RandLM word id for a surface string by asking the model directly.
randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
{
  const randlm::WordID lmId = m_lm->getWordID(str);
  return lmId;
}
|
|
|
|
|
2013-07-25 18:23:05 +04:00
|
|
|
// Return the RandLM word id for a factor via the lookup vector.
// Factor ids beyond the end of the vector are reported as the OOV id.
randlm::WordID LanguageModelRandLM::GetLmID( const Factor *factor ) const
{
  const size_t id = factor->GetId();
  if (id < m_randlm_ids_vec.size()) {
    return m_randlm_ids_vec[id];
  }
  return m_oov_id;
}
|
|
|
|
|
2011-03-08 02:21:09 +03:00
|
|
|
// Score the n-gram given by contextFactor with the RandLM model.
//
// contextFactor  words of the n-gram, in order; each word is mapped to a
//                RandLM id through this LM's factor type.
// finalState     passed straight through to RandLM::getProb (may be used by
//                RandLM to report the resulting LM state).
//
// Returns an LMResult whose score is the floored, transformed RandLM
// probability and whose `unknown` flag is set when the last word of a
// non-empty context maps to the OOV id.
//
// Throws (via UTIL_THROW_IF2) if the context holds more than MAX_NGRAM_SIZE
// words, because the ids are staged in a fixed-size stack buffer.
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
                                       State* finalState) const
{
  const size_t count = contextFactor.size();

  // Guard the fixed-size buffer below: without this check an over-long
  // context would write past the end of `ngram`.
  UTIL_THROW_IF2(count > static_cast<size_t>(MAX_NGRAM_SIZE),
                 "RandLM context length exceeds MAX_NGRAM_SIZE");

  FactorType factorType = GetFactorType();

  // Translate each context word into its RandLM id.
  randlm::WordID ngram[MAX_NGRAM_SIZE];
  for (size_t i = 0; i < count; ++i) {
    ngram[i] = GetLmID((*contextFactor[i])[factorType]);
  }

  int found = 0; // out-parameter filled by getProb; not used further here
  LMResult ret;
  ret.score = FloorScore(TransformLMScore(
                           m_lm->getProb(&ngram[0], static_cast<int>(count), &found, finalState)));
  // An empty context is never "unknown"; otherwise look at the predicted (last) word.
  ret.unknown = count && (ngram[count - 1] == m_oov_id);
  return ret;
}
|
|
|
|
|
2013-07-25 18:23:05 +04:00
|
|
|
// Per-input hook: make sure RandLM's thread-specific data exists before the
// input is processed. The `source` argument is not inspected.
void LanguageModelRandLM::InitializeForInput(InputType const& source)
{
  // Creates thread specific data iff compiled with multithreading.
  m_lm->initThreadSpecificData();
}
|
2013-07-25 18:23:05 +04:00
|
|
|
void LanguageModelRandLM::CleanUpAfterSentenceProcessing(const InputType& source)
|
|
|
|
{
|
2013-07-20 03:19:04 +04:00
|
|
|
m_lm->clearCaches(); // clear caches
|
2008-11-04 21:03:03 +03:00
|
|
|
}
|
|
|
|
|
2011-11-18 19:40:56 +04:00
|
|
|
}
|
2008-11-04 21:03:03 +03:00
|
|
|
|