2008-06-11 14:52:57 +04:00
|
|
|
// vim:tabstop=2
|
|
|
|
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
2012-11-27 19:08:31 +04:00
|
|
|
#include "moses/TranslationModel/PhraseDictionary.h"
|
|
|
|
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
|
2012-11-27 20:57:23 +04:00
|
|
|
#include "moses/TranslationModel/RuleTable/PhraseDictionarySCFG.h"
|
|
|
|
#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
|
|
|
|
#include "moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h"
|
|
|
|
#include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h"
|
2012-07-18 20:59:21 +04:00
|
|
|
|
2010-04-12 17:50:11 +04:00
|
|
|
#ifndef WIN32
|
2012-11-27 19:08:31 +04:00
|
|
|
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
|
2012-11-27 22:04:01 +04:00
|
|
|
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
|
2010-04-12 17:50:11 +04:00
|
|
|
#endif
|
2012-11-27 20:57:23 +04:00
|
|
|
#include "moses/TranslationModel/RuleTable/UTrie.h"
|
2011-11-04 19:43:42 +04:00
|
|
|
|
2012-11-27 19:08:31 +04:00
|
|
|
#include "moses/StaticData.h"
|
|
|
|
#include "moses/InputType.h"
|
|
|
|
#include "moses/TranslationOption.h"
|
|
|
|
#include "moses/UserMessage.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2010-08-10 17:51:20 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
namespace Moses
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
/**
 * Span-based lookup: takes the part of \p src covered by \p range (via
 * InputType::GetSubString) and forwards it to the single-argument
 * GetTargetPhraseCollection overload.
 *
 * \param src    the input being translated
 * \param range  the span of \p src to look up
 * \return whatever the single-argument overload returns for that sub-string
 */
const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollection(
  InputType const& src,
  WordsRange const& range) const
{
  return GetTargetPhraseCollection( src.GetSubString(range) );
}
|
|
|
|
|
2012-06-01 04:49:42 +04:00
|
|
|
size_t PhraseDictionary::GetDictIndex() const
|
|
|
|
{
|
|
|
|
return m_feature->GetDictIndex();
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
/**
 * Record the configuration of one phrase table; the table itself is loaded
 * later (see InitDictionary / LoadPhraseTable).
 *
 * \param implementation     which phrase-table backend to use
 * \param spdf               sparse phrase dictionary feature, stored as given
 * \param numScoreComponent  forwarded to the DecodeFeature base
 * \param numInputScores     stored in m_numInputScores
 * \param input              input factors, forwarded to the DecodeFeature base
 * \param output             output factors, forwarded to the DecodeFeature base
 * \param filePath           location of the phrase table
 * \param weight             not used by this constructor
 * \param dictIndex          stored in m_dictIndex
 * \param tableLimit         stored in m_tableLimit
 * \param targetFile         stored in m_targetFile
 * \param alignmentsFile     stored in m_alignmentsFile
 *
 * Memory, SCFG, SuffixArray, Compact and FuzzyMatch are flagged as using the
 * thread-safe dictionary slot; every other implementation uses the
 * thread-unsafe slot. Constructing a SuffixArray feature prints a message and
 * terminates the process with exit(1).
 */
PhraseDictionaryFeature::PhraseDictionaryFeature(
  PhraseTableImplementation implementation,
  SparsePhraseDictionaryFeature* spdf,
  size_t numScoreComponent,
  unsigned numInputScores,
  const std::vector<FactorType> &input,
  const std::vector<FactorType> &output,
  const std::string &filePath,
  const std::vector<float> &weight,
  size_t dictIndex,
  size_t tableLimit,
  const std::string &targetFile,      // default param
  const std::string &alignmentsFile)  // default param
  : DecodeFeature("PhraseModel", numScoreComponent, input, output)
  , m_dictIndex(dictIndex)
  , m_numInputScores(numInputScores)
  , m_filePath(filePath)
  , m_tableLimit(tableLimit)
  , m_implementation(implementation)
  , m_targetFile(targetFile)
  , m_alignmentsFile(alignmentsFile)
  , m_sparsePhraseDictionaryFeature(spdf)
{
  const bool sharedAcrossThreads =
    implementation == Memory
    || implementation == SCFG
    || implementation == SuffixArray
    || implementation == Compact
    || implementation == FuzzyMatch;

  m_useThreadSafePhraseDictionary = sharedAcrossThreads;

  // The suffix-array backend is refused outright: report and terminate.
  if (implementation == SuffixArray) {
    cerr << "Warning: implementation holds cached weights!" << endl;
    exit(1);
  }
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
/**
 * Create and load the phrase table selected by m_implementation.
 *
 * The weights are fetched from StaticData for this feature and handed to the
 * backend's Load() together with the input/output factors, the file path, the
 * table limit, the LM list and either the word-penalty weight or the
 * word-penalty producer (depending on the backend).
 *
 * For the Memory, SCFG/Hiero and ALSuffixArray backends, m_filePath is
 * switched to "<path>.gz" when the plain file does not exist but the gzipped
 * one does.
 *
 * \param system  not used by this function
 * \return a newly allocated dictionary; within this file the result is handed
 *         to m_threadSafePhraseDictionary / m_threadUnsafePhraseDictionary via
 *         reset(). NULL is returned when the suffix-array table fails to load,
 *         or when an unsupported/unknown implementation is requested and
 *         CHECK(false) does not stop the program.
 *
 * NOTE: the ALSuffixArray and SuffixArray branches call exit(1) immediately;
 * the loading code after that call is currently unreachable and is kept only
 * so the backends can be re-enabled.
 */
PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSystem* system)
{
  const StaticData& staticData = StaticData::Instance();
  std::vector<float> weightT = staticData.GetWeights(this);

  if (m_implementation == Memory) {
    // memory phrase table
    VERBOSE(2,"using standard phrase tables" << std::endl);
    if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }
    // The in-memory table is only accepted for plain sentence input.
    if (staticData.GetInputType() != SentenceInput) {
      UserMessage::Add("Must use binary phrase table for this input type");
      CHECK(false);
    }

    PhraseDictionaryMemory* pdm = new PhraseDictionaryMemory(GetNumScoreComponents(),this);
    bool ret = pdm->Load(GetInput(), GetOutput()
                         , m_filePath
                         , weightT
                         , m_tableLimit
                         , staticData.GetLMList()
                         , staticData.GetWeightWordPenalty());
    CHECK(ret);
    return pdm;
  } else if (m_implementation == Binary) {
    PhraseDictionaryTreeAdaptor* pdta = new PhraseDictionaryTreeAdaptor(GetNumScoreComponents(), m_numInputScores,this);
    bool ret = pdta->Load(                    GetInput()
               , GetOutput()
               , m_filePath
               , weightT
               , m_tableLimit
               , staticData.GetLMList()
               , staticData.GetWeightWordPenalty());
    CHECK(ret);
    return pdta;
  } else if (m_implementation == SCFG || m_implementation == Hiero) {
    // rule table (Moses SCFG or Hiero format)
    if (m_implementation == Hiero) {
      VERBOSE(2,"using Hiero format phrase tables" << std::endl);
    } else {
      VERBOSE(2,"using Moses-formatted SCFG phrase tables" << std::endl);
    }
    if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }

    // The Scope-3 parsing algorithm uses a different trie implementation.
    RuleTableTrie *dict;
    if (staticData.GetParsingAlgorithm() == ParseScope3) {
      dict = new RuleTableUTrie(GetNumScoreComponents(), this);
    } else {
      dict = new PhraseDictionarySCFG(GetNumScoreComponents(), this);
    }
    bool ret = dict->Load(GetInput()
                          , GetOutput()
                          , m_filePath
                          , weightT
                          , m_tableLimit
                          , staticData.GetLMList()
                          , staticData.GetWordPenaltyProducer());
    CHECK(ret);
    return dict;
  } else if (m_implementation == ALSuffixArray) {
    // AL suffix-array phrase table: refused outright, terminate.
    cerr << "Warning: Implementation holds cached weights!" << endl;
    exit(1);

    // Unreachable while the exit(1) above is in place.
    VERBOSE(2,"using Hiero format phrase tables" << std::endl);
    if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }

    PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(GetNumScoreComponents(),this);
    bool ret = pdm->Load(GetInput()
                         , GetOutput()
                         , m_filePath
                         , weightT
                         , m_tableLimit
                         , staticData.GetLMList()
                         , staticData.GetWordPenaltyProducer());
    CHECK(ret);
    return pdm;
  } else if (m_implementation == OnDisk) {

    PhraseDictionaryOnDisk* pdta = new PhraseDictionaryOnDisk(GetNumScoreComponents(), this);
    bool ret = pdta->Load(GetInput()
                          , GetOutput()
                          , m_filePath
                          , weightT
                          , m_tableLimit
                          , staticData.GetLMList()
                          , staticData.GetWordPenaltyProducer());
    CHECK(ret);
    return pdta;
  } else if (m_implementation == SuffixArray) {
    // Dynamic suffix-array phrase table: refused outright, terminate.
    cerr << "Warning: Implementation holds cached weights!" << endl;
    exit(1);

    // Unreachable while the exit(1) above is in place.
#ifndef WIN32
    PhraseDictionaryDynSuffixArray *pd = new PhraseDictionaryDynSuffixArray(GetNumScoreComponents(), this);
    if(!(pd->Load(
           GetInput()
           ,GetOutput()
           ,m_filePath
           ,m_targetFile
           ,m_alignmentsFile
           ,weightT, m_tableLimit
           ,staticData.GetLMList()
           ,staticData.GetWeightWordPenalty()))) {
      std::cerr << "FAILED TO LOAD\n" << endl;
      delete pd;
      // Report the failure to the caller without also claiming success.
      return NULL;
    }
    std::cerr << "Suffix array phrase table loaded" << std::endl;
    return pd;
#else
    CHECK(false);
#endif
  } else if (m_implementation == FuzzyMatch) {

    PhraseDictionaryFuzzyMatch *dict = new PhraseDictionaryFuzzyMatch(GetNumScoreComponents(), this);

    bool ret = dict->Load(GetInput()
                          , GetOutput()
                          , m_filePath
                          , weightT
                          , m_tableLimit
                          , staticData.GetLMList()
                          , staticData.GetWordPenaltyProducer());
    CHECK(ret);

    return dict;
  } else if (m_implementation == Compact) {
#ifndef WIN32
    VERBOSE(2,"Using compact phrase table" << std::endl);

    PhraseDictionaryCompact* pd = new PhraseDictionaryCompact(GetNumScoreComponents(), m_implementation, this);
    bool ret = pd->Load(GetInput(), GetOutput()
                        , m_filePath
                        , weightT
                        , m_tableLimit
                        , staticData.GetLMList()
                        , staticData.GetWeightWordPenalty());
    CHECK(ret);
    return pd;
#else
    CHECK(false);
#endif
  } else {
    std::cerr << "Unknown phrase table type " << m_implementation << endl;
    CHECK(false);
  }

  // Only reached from the unsupported/unknown branches above. CHECK(false) is
  // presumably fatal, but an explicit return keeps the function from flowing
  // off the end of a non-void function if it ever is not.
  return NULL;
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system)
|
2010-08-10 17:12:00 +04:00
|
|
|
{
|
|
|
|
//Thread-safe phrase dictionaries get loaded now
|
|
|
|
if (m_useThreadSafePhraseDictionary && !m_threadSafePhraseDictionary.get()) {
|
|
|
|
IFVERBOSE(1)
|
2011-02-24 16:14:42 +03:00
|
|
|
PrintUserTime("Start loading phrase table from " + m_filePath);
|
2010-08-10 17:12:00 +04:00
|
|
|
m_threadSafePhraseDictionary.reset(LoadPhraseTable(system));
|
|
|
|
IFVERBOSE(1)
|
2011-02-24 16:14:42 +03:00
|
|
|
PrintUserTime("Finished loading phrase tables");
|
2010-08-10 17:12:00 +04:00
|
|
|
}
|
|
|
|
//Other types will be lazy loaded
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2010-08-10 17:12:00 +04:00
|
|
|
//Called when we start translating a new sentence
|
|
|
|
void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system, const InputType& source)
|
2010-04-08 21:16:10 +04:00
|
|
|
{
|
2010-08-10 17:12:00 +04:00
|
|
|
PhraseDictionary* dict;
|
|
|
|
if (m_useThreadSafePhraseDictionary) {
|
|
|
|
//thread safe dictionary should already be loaded
|
|
|
|
dict = m_threadSafePhraseDictionary.get();
|
|
|
|
} else {
|
|
|
|
//thread-unsafe dictionary may need to be loaded if this is a new thread.
|
|
|
|
if (!m_threadUnsafePhraseDictionary.get()) {
|
|
|
|
m_threadUnsafePhraseDictionary.reset(LoadPhraseTable(system));
|
|
|
|
}
|
|
|
|
dict = m_threadUnsafePhraseDictionary.get();
|
|
|
|
}
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(dict);
|
2010-08-10 17:12:00 +04:00
|
|
|
dict->InitializeForInput(source);
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2009-08-07 20:47:54 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
/**
 * Read-only access to the dictionary backing this feature: the thread-safe
 * slot when m_useThreadSafePhraseDictionary is set, the thread-unsafe slot
 * otherwise. The chosen pointer is CHECKed to be non-null, so the dictionary
 * must already have been loaded.
 */
const PhraseDictionary* PhraseDictionaryFeature::GetDictionary() const
{
  PhraseDictionary* const current = m_useThreadSafePhraseDictionary
                                    ? m_threadSafePhraseDictionary.get()
                                    : m_threadUnsafePhraseDictionary.get();
  CHECK(current);
  return current;
}
|
|
|
|
|
2011-08-19 20:09:36 +04:00
|
|
|
/**
 * Mutable access to the dictionary backing this feature: the thread-safe
 * slot when m_useThreadSafePhraseDictionary is set, the thread-unsafe slot
 * otherwise. The chosen pointer is CHECKed to be non-null, so the dictionary
 * must already have been loaded.
 */
PhraseDictionary* PhraseDictionaryFeature::GetDictionary()
{
  PhraseDictionary* const current = m_useThreadSafePhraseDictionary
                                    ? m_threadSafePhraseDictionary.get()
                                    : m_threadUnsafePhraseDictionary.get();
  CHECK(current);
  return current;
}
|
2010-04-08 21:16:10 +04:00
|
|
|
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
/** Destructor; performs no explicit work of its own. */
PhraseDictionaryFeature::~PhraseDictionaryFeature()
{
  // intentionally empty
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
bool PhraseDictionaryFeature::ComputeValueInTranslationOption() const
|
|
|
|
{
|
|
|
|
return true;
|
2009-02-06 18:43:06 +03:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
/** Accessor for the feature this dictionary belongs to (m_feature). */
const PhraseDictionaryFeature* PhraseDictionary::GetFeature() const
{
  const PhraseDictionaryFeature* owner = m_feature;
  return owner;
}
|
2009-08-07 20:47:54 +04:00
|
|
|
|
2012-06-01 04:49:42 +04:00
|
|
|
size_t PhraseDictionaryFeature::GetDictIndex() const
|
|
|
|
{
|
|
|
|
return m_dictIndex;
|
|
|
|
}
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
}
|
|
|
|
|