mosesdecoder/moses/TranslationModel/PhraseDictionary.cpp

362 lines
13 KiB
C++
Raw Normal View History

// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
2012-11-27 20:57:23 +04:00
#include "moses/TranslationModel/RuleTable/PhraseDictionarySCFG.h"
#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
#include "moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h"
#include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h"
#ifndef WIN32
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
2012-11-27 22:04:01 +04:00
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
#endif
2012-11-27 20:57:23 +04:00
#include "moses/TranslationModel/RuleTable/UTrie.h"
#include "moses/StaticData.h"
#include "moses/InputType.h"
#include "moses/TranslationOption.h"
#include "moses/UserMessage.h"
using namespace std;
namespace Moses
{
/**
 * Convenience overload: looks up the target phrases for the part of the
 * input `src` that is covered by `range`.
 *
 * The sub-string is obtained from InputType::GetSubString and handed to the
 * GetTargetPhraseCollection overload that accepts that result.
 */
const TargetPhraseCollection *
PhraseDictionary::GetTargetPhraseCollection(InputType const& src,
                                            WordsRange const& range) const
{
  return this->GetTargetPhraseCollection(src.GetSubString(range));
}
2013-02-01 19:50:57 +04:00
/**
 * Builds a phrase-table feature from a single configuration line.
 *
 * The line is split with Tokenize(); the first token is skipped and every
 * following token must have the form `key=value` (enforced with CHECK).
 * Recognised keys:
 *   implementation, input-factor, output-factor, num-input-features,
 *   path, table-limit, target-path, alignment-path.
 * Any other key is reported through UserMessage and the process aborts.
 *
 * @param line  the configuration line describing this phrase table.
 */
PhraseDictionaryFeature::PhraseDictionaryFeature(const std::string &line)
  :DecodeFeature("PhraseModel",5, line) // TODO not always 5
  ,m_tableLimit(20) // TODO default?
{
  // Give the optional settings a defined value so that they are never read
  // uninitialised when the corresponding key is missing from the line.
  // NOTE(review): Memory is used as the fallback implementation only to
  // avoid an indeterminate value -- confirm this is the desired default.
  m_numInputScores = 0;
  m_implementation = Memory;

  cerr << "line=" << line << endl;

  const vector<string> toks = Tokenize(line);
  // Token 0 is skipped; only the key=value arguments are parsed here.
  for (size_t i = 1; i < toks.size(); ++i) {
    const vector<string> args = Tokenize(toks[i], "=");
    CHECK(args.size() == 2);

    const string &key = args[0];
    const string &value = args[1];

    if (key == "implementation") {
      m_implementation = (PhraseTableImplementation) Scan<size_t>(value);
    } else if (key == "input-factor") {
      m_input = Tokenize<FactorType>(value);
    } else if (key == "output-factor") {
      m_output = Tokenize<FactorType>(value);
    } else if (key == "num-input-features") {
      m_numInputScores = Scan<unsigned>(value);
    } else if (key == "path") {
      m_filePath = value;
    } else if (key == "table-limit") {
      m_tableLimit = Scan<size_t>(value);
    } else if (key == "target-path") {
      m_targetFile = value;
    } else if (key == "alignment-path") {
      m_alignmentsFile = value;
    } else {
      UserMessage::Add("Unknown argument " + key);
      abort();
    }
  } // for (size_t i = 1; i < toks.size(); ++i)

  // Same rule as the explicit-argument constructor: these implementations
  // use the thread-safe dictionary slot, all others the thread-unsafe one.
  // Previously this flag was left unset by this constructor.
  m_useThreadSafePhraseDictionary =
    (m_implementation == Memory || m_implementation == SCFG ||
     m_implementation == SuffixArray || m_implementation == Compact ||
     m_implementation == FuzzyMatch);
}
/**
 * Builds a phrase-table feature from explicitly supplied settings.
 *
 * All arguments are stored in the corresponding members. The thread-safety
 * flag is derived from `implementation`: Memory, SCFG, SuffixArray, Compact
 * and FuzzyMatch select the thread-safe dictionary slot; every other value
 * selects the thread-unsafe one.
 */
PhraseDictionaryFeature::PhraseDictionaryFeature
(PhraseTableImplementation implementation
 , size_t numScoreComponent
 , unsigned numInputScores
 , const std::vector<FactorType> &input
 , const std::vector<FactorType> &output
 , const std::string &filePath
 , size_t tableLimit
 , const std::string &targetFile  // default param
 , const std::string &alignmentsFile) // default param
  :DecodeFeature("PhraseModel",numScoreComponent,input,output, "PhraseModel"),
   m_numInputScores(numInputScores),
   m_filePath(filePath),
   m_tableLimit(tableLimit),
   m_implementation(implementation),
   m_targetFile(targetFile),
   m_alignmentsFile(alignmentsFile)
{
  switch (implementation) {
  case Memory:
  case SCFG:
  case SuffixArray:
  case Compact:
  case FuzzyMatch:
    m_useThreadSafePhraseDictionary = true;
    break;
  default:
    m_useThreadSafePhraseDictionary = false;
    break;
  }
}
PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSystem* system)
{
const StaticData& staticData = StaticData::Instance();
std::vector<float> weightT = staticData.GetWeights(this);
2012-04-24 08:21:18 +04:00
if (m_implementation == Memory) {
// memory phrase table
VERBOSE(2,"using standard phrase tables" << std::endl);
if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
m_filePath += ".gz";
VERBOSE(2,"Using gzipped file" << std::endl);
}
if (staticData.GetInputType() != SentenceInput) {
UserMessage::Add("Must use binary phrase table for this input type");
CHECK(false);
}
PhraseDictionaryMemory* pdm = new PhraseDictionaryMemory(GetNumScoreComponents(),this);
bool ret = pdm->Load(GetInput(), GetOutput()
, m_filePath
2012-04-24 08:21:18 +04:00
, weightT
, m_tableLimit
, staticData.GetLMList()
2012-12-19 19:38:57 +04:00
, staticData.GetWeightWordPenalty());
CHECK(ret);
return pdm;
} else if (m_implementation == Binary) {
PhraseDictionaryTreeAdaptor* pdta = new PhraseDictionaryTreeAdaptor(GetNumScoreComponents(), m_numInputScores,this);
bool ret = pdta->Load( GetInput()
, GetOutput()
, m_filePath
2012-04-24 08:21:18 +04:00
, weightT
, m_tableLimit
, staticData.GetLMList()
2012-12-19 19:38:57 +04:00
, staticData.GetWeightWordPenalty());
CHECK(ret);
return pdta;
} else if (m_implementation == SCFG || m_implementation == Hiero) {
// memory phrase table
if (m_implementation == Hiero) {
VERBOSE(2,"using Hiero format phrase tables" << std::endl);
} else {
VERBOSE(2,"using Moses-formatted SCFG phrase tables" << std::endl);
}
if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
m_filePath += ".gz";
VERBOSE(2,"Using gzipped file" << std::endl);
}
RuleTableTrie *dict;
if (staticData.GetParsingAlgorithm() == ParseScope3) {
2012-05-25 00:34:06 +04:00
dict = new RuleTableUTrie(GetNumScoreComponents(), this);
} else {
2012-05-25 00:34:06 +04:00
dict = new PhraseDictionarySCFG(GetNumScoreComponents(), this);
}
bool ret = dict->Load(GetInput()
, GetOutput()
, m_filePath
2012-04-24 08:21:18 +04:00
, weightT
, m_tableLimit
, staticData.GetLMList()
2012-12-19 19:38:57 +04:00
, staticData.GetWordPenaltyProducer());
CHECK(ret);
return dict;
} else if (m_implementation == ALSuffixArray) {
// memory phrase table
VERBOSE(2,"using Hiero format phrase tables" << std::endl);
if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) {
m_filePath += ".gz";
VERBOSE(2,"Using gzipped file" << std::endl);
}
PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(GetNumScoreComponents(),this);
bool ret = pdm->Load(GetInput()
, GetOutput()
, m_filePath
, weightT
, m_tableLimit
, staticData.GetLMList()
2012-12-19 19:38:57 +04:00
, staticData.GetWordPenaltyProducer());
CHECK(ret);
return pdm;
} else if (m_implementation == OnDisk) {
PhraseDictionaryOnDisk* pdta = new PhraseDictionaryOnDisk(GetNumScoreComponents(), this);
bool ret = pdta->Load(GetInput()
, GetOutput()
, m_filePath
, weightT
, m_tableLimit
, staticData.GetLMList()
2012-12-19 19:38:57 +04:00
, staticData.GetWordPenaltyProducer());
CHECK(ret);
return pdta;
} else if (m_implementation == SuffixArray) {
#ifndef WIN32
PhraseDictionaryDynSuffixArray *pd = new PhraseDictionaryDynSuffixArray(GetNumScoreComponents(), this);
if(!(pd->Load(
GetInput()
,GetOutput()
,m_filePath
,m_targetFile
,m_alignmentsFile
,weightT, m_tableLimit
,staticData.GetLMList()
2012-12-19 19:38:57 +04:00
,staticData.GetWeightWordPenalty()))) {
std::cerr << "FAILED TO LOAD\n" << endl;
delete pd;
pd = NULL;
}
std::cerr << "Suffix array phrase table loaded" << std::endl;
return pd;
#else
CHECK(false);
#endif
2012-08-14 02:53:14 +04:00
} else if (m_implementation == FuzzyMatch) {
PhraseDictionaryFuzzyMatch *dict = new PhraseDictionaryFuzzyMatch(GetNumScoreComponents(), this);
bool ret = dict->Load(GetInput()
, GetOutput()
, m_filePath
, weightT
, m_tableLimit
, staticData.GetLMList()
2012-12-19 19:38:57 +04:00
, staticData.GetWordPenaltyProducer());
CHECK(ret);
return dict;
} else if (m_implementation == Compact) {
#ifndef WIN32
VERBOSE(2,"Using compact phrase table" << std::endl);
PhraseDictionaryCompact* pd = new PhraseDictionaryCompact(GetNumScoreComponents(), m_implementation, this);
bool ret = pd->Load(GetInput(), GetOutput()
, m_filePath
, weightT
, m_tableLimit
, staticData.GetLMList()
2012-12-19 19:38:57 +04:00
, staticData.GetWeightWordPenalty());
CHECK(ret);
return pd;
#else
CHECK(false);
#endif
}
else {
std::cerr << "Unknown phrase table type " << m_implementation << endl;
CHECK(false);
}
}
/**
 * Eagerly loads the dictionary when this feature uses the thread-safe slot
 * and nothing has been loaded into it yet. Does nothing otherwise; the
 * other dictionary types are loaded lazily.
 *
 * @param system  forwarded to LoadPhraseTable().
 */
void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system)
{
  // Nothing to do for lazily loaded types or when already loaded.
  if (!m_useThreadSafePhraseDictionary || m_threadSafePhraseDictionary.get()) {
    return;
  }

  IFVERBOSE(1)
  PrintUserTime("Start loading phrase table from " + m_filePath);

  m_threadSafePhraseDictionary.reset(LoadPhraseTable(system));

  IFVERBOSE(1)
  PrintUserTime("Finished loading phrase tables");
}
/**
 * Read-only access to the dictionary held by this feature.
 *
 * Picks the thread-safe or the thread-unsafe slot according to
 * m_useThreadSafePhraseDictionary; CHECKs that the chosen slot is non-null.
 */
const PhraseDictionary* PhraseDictionaryFeature::GetDictionary() const
{
  PhraseDictionary* const selected = m_useThreadSafePhraseDictionary
                                     ? m_threadSafePhraseDictionary.get()
                                     : m_threadUnsafePhraseDictionary.get();
  CHECK(selected);
  return selected;
}
Merge commit 'a0b6abdfd3599e7fbdc6aac76fcd2cb4483d63ce' into miramerge Conflicts: moses/src/ConfusionNet.cpp moses/src/DecodeFeature.cpp moses/src/DecodeFeature.h moses/src/DotChartOnDisk.cpp moses/src/DummyScoreProducers.cpp moses/src/DummyScoreProducers.h moses/src/DynSAInclude/vocab.h moses/src/FeatureFunction.h moses/src/GenerationDictionary.cpp moses/src/GenerationDictionary.h moses/src/GlobalLexicalModel.cpp moses/src/GlobalLexicalModel.h moses/src/LMList.cpp moses/src/LMList.h moses/src/LanguageModel.cpp moses/src/LanguageModel.h moses/src/LanguageModelFactory.cpp moses/src/LanguageModelFactory.h moses/src/LanguageModelImplementation.h moses/src/LanguageModelKen.h moses/src/LanguageModelMultiFactor.cpp moses/src/LanguageModelMultiFactor.h moses/src/LanguageModelParallelBackoff.h moses/src/LanguageModelRemote.h moses/src/LanguageModelSingleFactor.cpp moses/src/LanguageModelSingleFactor.h moses/src/LexicalReordering.cpp moses/src/LexicalReordering.h moses/src/LexicalReorderingState.cpp moses/src/LexicalReorderingState.h moses/src/Manager.cpp moses/src/PDTAimp.h moses/src/Parameter.cpp moses/src/Parameter.h moses/src/Phrase.h moses/src/PhraseDictionary.cpp moses/src/PhraseDictionary.h moses/src/PhraseDictionaryMemory.cpp moses/src/ScoreComponentCollection.cpp moses/src/ScoreComponentCollection.h moses/src/ScoreIndexManager.cpp moses/src/ScoreIndexManager.h moses/src/ScoreProducer.h moses/src/StaticData.cpp moses/src/StaticData.h moses/src/TargetPhrase.cpp moses/src/TargetPhrase.h moses/src/TranslationOption.cpp moses/src/TranslationOptionCollection.cpp moses/src/TranslationSystem.cpp moses/src/TranslationSystem.h moses/src/TrellisPath.h
2011-08-19 20:09:36 +04:00
/**
 * Mutable access to the dictionary held by this feature.
 *
 * Picks the thread-safe or the thread-unsafe slot according to
 * m_useThreadSafePhraseDictionary; CHECKs that the chosen slot is non-null.
 */
PhraseDictionary* PhraseDictionaryFeature::GetDictionary()
{
  PhraseDictionary* const selected = m_useThreadSafePhraseDictionary
                                     ? m_threadSafePhraseDictionary.get()
                                     : m_threadUnsafePhraseDictionary.get();
  CHECK(selected);
  return selected;
}
/** Destructor; performs no explicit work of its own. */
PhraseDictionaryFeature::~PhraseDictionaryFeature()
{
}
bool PhraseDictionaryFeature::ComputeValueInTranslationOption() const
{
return true;
Feature function overhaul. Each feature function is computed in one of three ways: 1) Stateless feature functions from the phrase table/generation table: these are computed when the TranslationOption is created. They become part of the ScoreBreakdown object contained in the TranslationOption and are added to the feature value vector when a hypothesis is extended. 2) Stateless feature functions that are computed during state exploration. Currently, only WordPenalty falls into this category, but these functions implement a method Evaluate which do does not receive a Hypothesis or any contextual information. 3) Stateful feature functions: these features receive the arc information (translation option), compute some value and then return some context information. The context information created by a particular feature function is passed back to it as the previous context when a hypothesis originating at the node where the previous edge terminates is created. States in the search space may be recombined if the context information is identical. The context information must be stored in an object implementing the FFState interface. TODO: 1) the command line interface / MERT interface needs to go to named parameters that are otherwise opaque 2) StatefulFeatureFunction's Evaluate method should just take a TranslationOption and a context object. It is not good that it takes a hypothesis, because then people may be tempted to access information about the "previous" hypothesis without "declaring" this dependency. 3) Future cost estimates should be handled using feature functions. All stateful feature functions need some kind of future cost estimate. 4) Philipp's poor-man's cube pruning is broken. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2087 1f5c12ca-751b-0410-a591-d2e778427230
2009-02-06 18:43:06 +03:00
}
/** Returns the feature pointer stored in m_feature. */
const PhraseDictionaryFeature*
PhraseDictionary::GetFeature() const
{
  return this->m_feature;
}
void PhraseDictionaryFeature::InitializeForInput(const InputType& source)
{
PhraseDictionary* dict;
if (m_useThreadSafePhraseDictionary) {
//thread safe dictionary should already be loaded
dict = m_threadSafePhraseDictionary.get();
} else {
//thread-unsafe dictionary may need to be loaded if this is a new thread.
if (!m_threadUnsafePhraseDictionary.get()) {
m_threadUnsafePhraseDictionary.reset(LoadPhraseTable(NULL));
}
dict = m_threadUnsafePhraseDictionary.get();
}
CHECK(dict);
dict->InitializeForInput(source);
}
void PhraseDictionaryFeature::CleanUpAfterSentenceProcessing(const InputType& source)
{
PhraseDictionary* dict;
if (m_useThreadSafePhraseDictionary) {
//thread safe dictionary should already be loaded
dict = m_threadSafePhraseDictionary.get();
} else {
dict = m_threadUnsafePhraseDictionary.get();
}
CHECK(dict);
dict->CleanUpAfterSentenceProcessing(source);
}
}