2008-06-11 14:52:57 +04:00
|
|
|
// $Id$
|
|
|
|
// vim:tabstop=2
|
|
|
|
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
|
|
|
#include "PhraseDictionary.h"
|
2009-08-07 20:47:54 +04:00
|
|
|
#include "PhraseDictionaryTreeAdaptor.h"
|
2010-07-18 02:29:06 +04:00
|
|
|
#include "PhraseDictionarySCFG.h"
|
2010-04-08 21:16:10 +04:00
|
|
|
#include "PhraseDictionaryOnDisk.h"
|
2011-11-04 19:43:42 +04:00
|
|
|
#include "PhraseDictionaryHiero.h"
|
2011-11-06 12:35:05 +04:00
|
|
|
#include "PhraseDictionaryALSuffixArray.h"
|
2010-04-12 17:50:11 +04:00
|
|
|
#ifndef WIN32
|
2010-04-08 21:57:38 +04:00
|
|
|
#include "PhraseDictionaryDynSuffixArray.h"
|
2010-04-12 17:50:11 +04:00
|
|
|
#endif
|
2011-11-04 19:43:42 +04:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "StaticData.h"
|
|
|
|
#include "InputType.h"
|
2009-02-06 18:43:06 +03:00
|
|
|
#include "TranslationOption.h"
|
2010-01-28 15:12:57 +03:00
|
|
|
#include "UserMessage.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2010-08-10 17:51:20 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
namespace Moses
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
// Look up the target phrases for the part of `src` covered by `range`.
// The covered span is extracted with InputType::GetSubString() and the
// result is handed to the GetTargetPhraseCollection overload that accepts
// it; whatever that overload returns is passed straight back to the caller.
const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollection(
  InputType const& src,
  WordsRange const& range) const
{
  return this->GetTargetPhraseCollection(src.GetSubString(range));
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Construct the feature that describes one phrase table.
//
// The arguments are only recorded here; no phrase table is loaded by the
// constructor itself.
//
//   implementation    - which phrase table implementation to use
//   spdf              - stored in m_sparsePhraseDictionaryFeature
//   numScoreComponent - forwarded to the DecodeFeature base
//   numInputScores    - stored in m_numInputScores
//   input / output    - factor lists, forwarded to the DecodeFeature base
//   filePath          - location of the phrase table
//   weight            - stored in m_weight
//   tableLimit        - stored in m_tableLimit
//   targetFile        - stored in m_targetFile (default param)
//   alignmentsFile    - stored in m_alignmentsFile (default param)
PhraseDictionaryFeature::PhraseDictionaryFeature(
  PhraseTableImplementation implementation,
  SparsePhraseDictionaryFeature* spdf,
  size_t numScoreComponent,
  unsigned numInputScores,
  const std::vector<FactorType> &input,
  const std::vector<FactorType> &output,
  const std::string &filePath,
  const std::vector<float> &weight,
  size_t tableLimit,
  const std::string &targetFile,      // default param
  const std::string &alignmentsFile)  // default param
  : DecodeFeature("PhraseModel", numScoreComponent, input, output)
  , m_numInputScores(numInputScores)
  , m_filePath(filePath)
  , m_weight(weight)
  , m_tableLimit(tableLimit)
  , m_implementation(implementation)
  , m_targetFile(targetFile)
  , m_alignmentsFile(alignmentsFile)
  , m_sparsePhraseDictionaryFeature(spdf)
{
  // Memory, SCFG and SuffixArray tables use the thread-safe dictionary slot;
  // every other implementation uses the thread-unsafe one.
  m_useThreadSafePhraseDictionary =
    (implementation == Memory
     || implementation == SCFG
     || implementation == SuffixArray);
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
// Create and load the phrase table selected by m_implementation.
//
// For the Memory, SCFG, Hiero and ALSuffixArray implementations, if
// m_filePath does not exist but m_filePath + ".gz" does, m_filePath is
// switched to the gzipped file before loading.
//
//   system - supplies the language models and the word penalty
//            (weight or producer, depending on the implementation)
//            passed to the table's Load() call.
//
// Returns the newly allocated dictionary; the caller receives the raw
// pointer. A failed load trips CHECK for every implementation except
// SuffixArray, which reports the failure on stderr and returns NULL.
// NULL is also returned if control gets past CHECK(false) for an
// unsupported or unknown implementation.
PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSystem* system)
{
  const StaticData& staticData = StaticData::Instance();
  if (m_implementation == Memory) {
    // memory phrase table
    VERBOSE(2,"using standard phrase tables" << std::endl);
    if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }
    // Only plain sentence input is accepted by this implementation.
    if (staticData.GetInputType() != SentenceInput) {
      UserMessage::Add("Must use binary phrase table for this input type");
      CHECK(false);
    }

    PhraseDictionaryMemory* pdm = new PhraseDictionaryMemory(GetNumScoreComponents(),this);
    bool ret = pdm->Load(GetInput(), GetOutput()
                         , m_filePath
                         , m_weight
                         , m_tableLimit
                         , system->GetLanguageModels()
                         , system->GetWeightWordPenalty());
    CHECK(ret);
    return pdm;
  } else if (m_implementation == Binary) {
    // binary phrase table, also given the number of input scores
    PhraseDictionaryTreeAdaptor* pdta = new PhraseDictionaryTreeAdaptor(GetNumScoreComponents(), m_numInputScores,this);
    bool ret = pdta->Load( GetInput()
                           , GetOutput()
                           , m_filePath
                           , m_weight
                           , m_tableLimit
                           , system->GetLanguageModels()
                           , system->GetWeightWordPenalty());
    CHECK(ret);
    return pdta;
  } else if (m_implementation == SCFG) {
    // SCFG ("New Format") phrase table
    VERBOSE(2,"using New Format phrase tables" << std::endl);
    if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }

    PhraseDictionarySCFG* pdm = new PhraseDictionarySCFG(GetNumScoreComponents(),this);
    bool ret = pdm->Load(GetInput()
                         , GetOutput()
                         , m_filePath
                         , m_weight
                         , m_tableLimit
                         , system->GetLanguageModels()
                         , system->GetWordPenaltyProducer());
    CHECK(ret);
    return pdm;
  } else if (m_implementation == Hiero) {
    // Hiero format phrase table
    VERBOSE(2,"using Hiero format phrase tables" << std::endl);
    if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }

    PhraseDictionaryHiero* pdm = new PhraseDictionaryHiero(GetNumScoreComponents(),this);
    bool ret = pdm->Load(GetInput()
                         , GetOutput()
                         , m_filePath
                         , m_weight
                         , m_tableLimit
                         , system->GetLanguageModels()
                         , system->GetWordPenaltyProducer());
    CHECK(ret);
    return pdm;
  } else if (m_implementation == ALSuffixArray) {
    // ALSuffixArray phrase table
    // NOTE(review): the log message below is identical to the Hiero
    // branch's - confirm that is intended rather than a copy/paste.
    VERBOSE(2,"using Hiero format phrase tables" << std::endl);
    if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }

    PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(GetNumScoreComponents(),this);
    bool ret = pdm->Load(GetInput()
                         , GetOutput()
                         , m_filePath
                         , m_weight
                         , m_tableLimit
                         , system->GetLanguageModels()
                         , system->GetWordPenaltyProducer());
    CHECK(ret);
    return pdm;
  } else if (m_implementation == OnDisk) {
    // on-disk phrase table
    PhraseDictionaryOnDisk* pdta = new PhraseDictionaryOnDisk(GetNumScoreComponents(), this);
    bool ret = pdta->Load(GetInput()
                          , GetOutput()
                          , m_filePath
                          , m_weight
                          , m_tableLimit
                          , system->GetLanguageModels()
                          , system->GetWordPenaltyProducer());
    CHECK(ret);
    return pdta;
  } else if (m_implementation == SuffixArray) {
#ifndef WIN32
    // dynamic suffix array table; additionally needs the target and
    // alignment files
    PhraseDictionaryDynSuffixArray *pd = new PhraseDictionaryDynSuffixArray(GetNumScoreComponents(), this);
    if(!(pd->Load(
           GetInput()
           ,GetOutput()
           ,m_filePath
           ,m_targetFile
           , m_alignmentsFile
           , m_weight, m_tableLimit
           , system->GetLanguageModels()
           , system->GetWeightWordPenalty()))) {
      std::cerr << "FAILED TO LOAD\n" << endl;
      delete pd;
      // Report the failure only: do not fall through to the "loaded"
      // message below for a table that was just discarded.
      return NULL;
    }
    std::cerr << "Suffix array phrase table loaded" << std::endl;
    return pd;
#else
    // This implementation is not compiled in on WIN32.
    CHECK(false);
#endif
  } else {
    std::cerr << "Unknown phrase table type " << m_implementation << endl;
    CHECK(false);
  }

  // Reached only if CHECK(false) above returns (or is compiled out).
  // An explicit return keeps the function from running off its end.
  return NULL;
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system)
|
2010-08-10 17:12:00 +04:00
|
|
|
{
|
|
|
|
//Thread-safe phrase dictionaries get loaded now
|
|
|
|
if (m_useThreadSafePhraseDictionary && !m_threadSafePhraseDictionary.get()) {
|
|
|
|
IFVERBOSE(1)
|
2011-02-24 16:14:42 +03:00
|
|
|
PrintUserTime("Start loading phrase table from " + m_filePath);
|
2010-08-10 17:12:00 +04:00
|
|
|
m_threadSafePhraseDictionary.reset(LoadPhraseTable(system));
|
|
|
|
IFVERBOSE(1)
|
2011-02-24 16:14:42 +03:00
|
|
|
PrintUserTime("Finished loading phrase tables");
|
2010-08-10 17:12:00 +04:00
|
|
|
}
|
|
|
|
//Other types will be lazy loaded
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2010-08-10 17:12:00 +04:00
|
|
|
//Called when we start translating a new sentence
|
|
|
|
void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system, const InputType& source)
|
2010-04-08 21:16:10 +04:00
|
|
|
{
|
2010-08-10 17:12:00 +04:00
|
|
|
PhraseDictionary* dict;
|
|
|
|
if (m_useThreadSafePhraseDictionary) {
|
|
|
|
//thread safe dictionary should already be loaded
|
|
|
|
dict = m_threadSafePhraseDictionary.get();
|
|
|
|
} else {
|
|
|
|
//thread-unsafe dictionary may need to be loaded if this is a new thread.
|
|
|
|
if (!m_threadUnsafePhraseDictionary.get()) {
|
|
|
|
m_threadUnsafePhraseDictionary.reset(LoadPhraseTable(system));
|
|
|
|
}
|
|
|
|
dict = m_threadUnsafePhraseDictionary.get();
|
|
|
|
}
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(dict);
|
2010-08-10 17:12:00 +04:00
|
|
|
dict->InitializeForInput(source);
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2009-08-07 20:47:54 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Read-only access to the dictionary currently held by this feature.
// The thread-safe or thread-unsafe holder is consulted depending on
// m_useThreadSafePhraseDictionary; CHECK fires if that holder is empty.
const PhraseDictionary* PhraseDictionaryFeature::GetDictionary() const
{
  PhraseDictionary* held = NULL;
  if (!m_useThreadSafePhraseDictionary) {
    held = m_threadUnsafePhraseDictionary.get();
  } else {
    held = m_threadSafePhraseDictionary.get();
  }

  CHECK(held);
  return held;
}
|
|
|
|
|
2011-08-19 20:09:36 +04:00
|
|
|
// Mutable access to the dictionary currently held by this feature.
// Chooses between the thread-safe and thread-unsafe holder according to
// m_useThreadSafePhraseDictionary; CHECK fires if the chosen holder is empty.
PhraseDictionary* PhraseDictionaryFeature::GetDictionary()
{
  PhraseDictionary* const held = m_useThreadSafePhraseDictionary
                                 ? m_threadSafePhraseDictionary.get()
                                 : m_threadUnsafePhraseDictionary.get();
  CHECK(held);
  return held;
}
|
2010-04-08 21:16:10 +04:00
|
|
|
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Destructor: performs no explicit work.
// NOTE(review): presumably the dictionary holders release what they own
// themselves - confirm against the member declarations in the header.
PhraseDictionaryFeature::~PhraseDictionaryFeature()
{
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
|
2011-08-30 16:25:50 +04:00
|
|
|
std::string PhraseDictionaryFeature::GetScoreProducerWeightShortName(unsigned idx) const
|
|
|
|
{
|
|
|
|
if (idx < GetNumInputScores()){
|
|
|
|
return "I";
|
|
|
|
}else{
|
|
|
|
return "tm";
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
size_t PhraseDictionaryFeature::GetNumInputScores() const
|
|
|
|
{
|
|
|
|
return m_numInputScores;
|
2009-08-07 20:47:54 +04:00
|
|
|
}
|
2009-02-06 18:43:06 +03:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
bool PhraseDictionaryFeature::ComputeValueInTranslationOption() const
|
|
|
|
{
|
|
|
|
return true;
|
2009-02-06 18:43:06 +03:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Accessor for the feature stored in m_feature for this dictionary.
const PhraseDictionaryFeature* PhraseDictionary::GetFeature() const
{
  return this->m_feature;
}
|
2009-08-07 20:47:54 +04:00
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
}
|
|
|
|
|