2013-05-29 21:16:15 +04:00
|
|
|
// $Id$
|
|
|
|
// vim:tabstop=2
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
2012-08-02 20:32:55 +04:00
|
|
|
|
|
|
|
#include <fstream>
|
|
|
|
#include <string>
|
|
|
|
#include <iterator>
|
|
|
|
#include <queue>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
|
|
|
|
#include "PhraseDictionaryCompact.h"
|
2012-11-12 23:56:18 +04:00
|
|
|
#include "moses/FactorCollection.h"
|
|
|
|
#include "moses/Word.h"
|
|
|
|
#include "moses/Util.h"
|
|
|
|
#include "moses/InputFileStream.h"
|
|
|
|
#include "moses/StaticData.h"
|
|
|
|
#include "moses/WordsRange.h"
|
|
|
|
#include "moses/UserMessage.h"
|
|
|
|
#include "moses/ThreadPool.h"
|
2013-11-20 18:02:38 +04:00
|
|
|
#include "util/exception.hh"
|
2012-08-02 20:32:55 +04:00
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
namespace Moses
|
|
|
|
{
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2013-06-20 15:50:41 +04:00
|
|
|
// Build the dictionary from its configuration line. The members start out
// with the defaults listed below, then ReadParameters() is invoked.
PhraseDictionaryCompact::PhraseDictionaryCompact(const std::string &line)
  : PhraseDictionary(line),
    m_inMemory(true),
    m_useAlignmentInfo(true),
    m_hash(10, 16),
    m_phraseDecoder(NULL),  // allocated later by Load()
    m_weight(0)
{
  ReadParameters();
}
|
|
|
|
|
2013-05-31 23:21:02 +04:00
|
|
|
void PhraseDictionaryCompact::Load()
|
2012-08-02 20:32:55 +04:00
|
|
|
{
|
2013-02-25 22:52:58 +04:00
|
|
|
const StaticData &staticData = StaticData::Instance();
|
|
|
|
|
2014-08-03 21:44:43 +04:00
|
|
|
SetFeaturesToApply();
|
|
|
|
|
2013-02-25 22:52:58 +04:00
|
|
|
m_weight = staticData.GetWeights(this);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2013-02-25 22:52:58 +04:00
|
|
|
std::string tFilePath = m_filePath;
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-11-17 13:07:33 +04:00
|
|
|
std::string suffix = ".minphr";
|
2013-05-29 21:16:15 +04:00
|
|
|
if(tFilePath.substr(tFilePath.length() - suffix.length(), suffix.length()) == suffix) {
|
|
|
|
if(!FileExists(tFilePath)) {
|
2013-06-03 15:33:18 +04:00
|
|
|
throw runtime_error("Error: File " + tFilePath + " does not exit.");
|
2012-11-17 13:07:33 +04:00
|
|
|
exit(1);
|
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
} else {
|
|
|
|
if(FileExists(tFilePath + suffix)) {
|
2012-11-17 13:07:33 +04:00
|
|
|
tFilePath += suffix;
|
2013-05-29 21:16:15 +04:00
|
|
|
} else {
|
2013-05-31 23:21:02 +04:00
|
|
|
throw runtime_error("Error: File " + tFilePath + ".minphr does not exit.");
|
2012-08-03 18:38:45 +04:00
|
|
|
}
|
|
|
|
}
|
2012-08-02 20:32:55 +04:00
|
|
|
|
2013-02-25 22:52:58 +04:00
|
|
|
m_phraseDecoder = new PhraseDecoder(*this, &m_input, &m_output,
|
2013-05-29 21:16:15 +04:00
|
|
|
m_numScoreComponents, &m_weight);
|
2012-08-02 20:32:55 +04:00
|
|
|
|
2012-08-03 18:38:45 +04:00
|
|
|
std::FILE* pFile = std::fopen(tFilePath.c_str() , "r");
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-11-17 03:46:22 +04:00
|
|
|
size_t indexSize;
|
2012-08-02 20:32:55 +04:00
|
|
|
if(m_inMemory)
|
|
|
|
// Load source phrase index into memory
|
|
|
|
indexSize = m_hash.Load(pFile);
|
|
|
|
else
|
|
|
|
// Keep source phrase index on disk
|
|
|
|
indexSize = m_hash.LoadIndex(pFile);
|
2012-11-17 03:46:22 +04:00
|
|
|
|
|
|
|
size_t coderSize = m_phraseDecoder->Load(pFile);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-11-17 03:46:22 +04:00
|
|
|
size_t phraseSize;
|
2012-08-02 20:32:55 +04:00
|
|
|
if(m_inMemory)
|
|
|
|
// Load target phrase collections into memory
|
|
|
|
phraseSize = m_targetPhrasesMemory.load(pFile, false);
|
|
|
|
else
|
|
|
|
// Keep target phrase collections on disk
|
|
|
|
phraseSize = m_targetPhrasesMapped.load(pFile, true);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2013-11-23 00:27:46 +04:00
|
|
|
UTIL_THROW_IF2(indexSize == 0 || coderSize == 0 || phraseSize == 0,
|
2014-01-15 19:49:57 +04:00
|
|
|
"Not successfully loaded");
|
2012-08-02 20:32:55 +04:00
|
|
|
}
|
|
|
|
|
2013-09-27 12:35:24 +04:00
|
|
|
// now properly declared in TargetPhraseCollection.h
|
|
|
|
// and defined in TargetPhraseCollection.cpp
|
|
|
|
// struct CompareTargetPhrase {
|
|
|
|
// bool operator() (const TargetPhrase &a, const TargetPhrase &b) {
|
|
|
|
// return a.GetFutureScore() > b.GetFutureScore();
|
|
|
|
// }
|
|
|
|
// };
|
2012-08-02 20:32:55 +04:00
|
|
|
|
|
|
|
// Look up the target phrases for sourcePhrase without consulting a cache.
//
// Returns NULL when the source phrase is longer than the decoder's maximum
// source phrase length, or when the decoder yields no entries. Otherwise
// returns a newly allocated collection holding copies of the entries that
// CompareTargetPhrase ranks first (at most m_tableLimit of them when that
// limit is non-zero). The collection is handed to CacheForCleanup() before
// it is returned.
const TargetPhraseCollection*
PhraseDictionaryCompact::GetTargetPhraseCollectionNonCacheLEGACY(const Phrase &sourcePhrase) const
{
  // There is no such source phrase if the source phrase is longer than the
  // longest source phrase observed during compilation
  if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength()) {
    return NULL;
  }

  // Retrieve target phrase collection from phrase table
  TargetPhraseVectorPtr decoded
    = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true);

  if(!decoded || decoded->size() == 0) {
    return NULL;
  }

  // Reorder a private copy rather than the vector returned by the decoder
  TargetPhraseVectorPtr candidates(new TargetPhraseVector(*decoded));

  // Score phrases and if possible apply ttable_limit: the cut-off only moves
  // in from end() when a limit is set and enough candidates are available
  TargetPhraseVector::iterator cutoff = candidates->end();
  if(m_tableLimit != 0 && candidates->size() >= m_tableLimit) {
    cutoff = candidates->begin() + m_tableLimit;
  }
  NTH_ELEMENT4(candidates->begin(), cutoff, candidates->end(), CompareTargetPhrase());

  TargetPhraseCollection* result = new TargetPhraseCollection();
  for(TargetPhraseVector::iterator cur = candidates->begin(); cur != cutoff; ++cur) {
    result->Add(new TargetPhrase(*cur));
  }

  // Cache phrase pair for clean-up or retrieval with PREnc
  const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(result);

  return result;
}
|
|
|
|
|
|
|
|
// Fetch the target phrase vector for sourcePhrase straight from the phrase
// decoder. An empty TargetPhraseVectorPtr is returned when the source phrase
// exceeds the decoder's maximum source phrase length.
TargetPhraseVectorPtr
PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const
{
  // Only phrases no longer than the longest source phrase observed during
  // compilation can be present in the phrase table
  if(sourcePhrase.GetSize() <= m_phraseDecoder->GetMaxSourcePhraseLength()) {
    // Retrieve target phrase collection from phrase table
    return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, false);
  }

  return TargetPhraseVectorPtr();
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Release the phrase decoder created in Load(), if any.
PhraseDictionaryCompact::~PhraseDictionaryCompact()
{
  // delete on a null pointer is a no-op, so no guard is required
  delete m_phraseDecoder;
}
|
|
|
|
|
|
|
|
//TO_STRING_BODY(PhraseDictionaryCompact)
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
// Append tpc to the sentence cache so that
// CleanUpAfterSentenceProcessing() can delete it later. In threaded builds
// the cache is keyed by the calling thread's id and guarded by
// m_sentenceMutex.
void PhraseDictionaryCompact::CacheForCleanup(TargetPhraseCollection* tpc)
{
#ifdef WITH_THREADS
  boost::mutex::scoped_lock guard(m_sentenceMutex);
  m_sentenceCache[boost::this_thread::get_id()].push_back(tpc);
#else
  m_sentenceCache.push_back(tpc);
#endif
}
|
|
|
|
|
|
|
|
void PhraseDictionaryCompact::AddEquivPhrase(const Phrase &source,
|
2013-05-29 21:16:15 +04:00
|
|
|
const TargetPhrase &targetPhrase) { }
|
2012-08-02 20:32:55 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
void PhraseDictionaryCompact::CleanUpAfterSentenceProcessing(const InputType &source)
|
|
|
|
{
|
2012-08-02 20:32:55 +04:00
|
|
|
if(!m_inMemory)
|
|
|
|
m_hash.KeepNLastRanges(0.01, 0.2);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-08-02 20:32:55 +04:00
|
|
|
m_phraseDecoder->PruneCache();
|
2013-05-29 21:16:15 +04:00
|
|
|
|
2012-08-02 20:32:55 +04:00
|
|
|
#ifdef WITH_THREADS
|
|
|
|
boost::mutex::scoped_lock lock(m_sentenceMutex);
|
2013-05-29 21:16:15 +04:00
|
|
|
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
|
2012-08-02 20:32:55 +04:00
|
|
|
#else
|
2013-05-29 21:16:15 +04:00
|
|
|
PhraseCache &ref = m_sentenceCache;
|
2012-08-02 20:32:55 +04:00
|
|
|
#endif
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
|
|
for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++)
|
|
|
|
delete *it;
|
|
|
|
|
2012-08-02 20:32:55 +04:00
|
|
|
PhraseCache temp;
|
|
|
|
temp.swap(ref);
|
2013-08-20 19:06:48 +04:00
|
|
|
|
|
|
|
ReduceCache();
|
2012-08-02 20:32:55 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|