From de45d7076a4d7dc312b63c2e99e4bb5170284095 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Tue, 1 Jan 2013 17:27:26 +0000
Subject: [PATCH] add [feature] arg. Use for GlobalLexicalModelUnlimited. Not
 tested

---
 moses/GlobalLexicalModelUnlimited.cpp |  55 +++++++++++++
 moses/GlobalLexicalModelUnlimited.h   |   1 +
 moses/StaticData.cpp                  | 107 +++++++-------------
 moses/StaticData.h                    |   2 -
 4 files changed, 84 insertions(+), 81 deletions(-)

diff --git a/moses/GlobalLexicalModelUnlimited.cpp b/moses/GlobalLexicalModelUnlimited.cpp
index f1de65bd0..d0aad8986 100644
--- a/moses/GlobalLexicalModelUnlimited.cpp
+++ b/moses/GlobalLexicalModelUnlimited.cpp
@@ -8,6 +8,61 @@ using namespace std;
 namespace Moses
 {
 
+GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line)
+:StatelessFeatureFunction("glm",ScoreProducer::unlimited)
+{
+  const vector<string> modelSpec = Tokenize(line);
+
+  for (size_t i = 0; i < modelSpec.size(); i++ ) {
+    bool ignorePunctuation = true, biasFeature = false, restricted = false;
+    size_t context = 0;
+    string filenameSource, filenameTarget;
+    vector< string > factors;
+    vector< string > spec = Tokenize(modelSpec[i]," ");
+
+    // read optional punctuation and bias specifications
+    if (spec.size() > 0) {
+      if (spec.size() != 2 && spec.size() != 3 && spec.size() != 4 && spec.size() != 6) {
+        UserMessage::Add("Format of glm feature is <factor-src>-<factor-tgt> [ignore-punct] [use-bias] "
+                         "[context-type] [filename-src filename-tgt]");
+        //return false;
+      }
+
+      factors = Tokenize(spec[0],"-");
+      if (spec.size() >= 2)
+        ignorePunctuation = Scan<bool>(spec[1]);
+      if (spec.size() >= 3)
+        biasFeature = Scan<bool>(spec[2]);
+      if (spec.size() >= 4)
+        context = Scan<size_t>(spec[3]);
+      if (spec.size() == 6) {
+        filenameSource = spec[4];
+        filenameTarget = spec[5];
+        restricted = true;
+      }
+    }
+    else
+      factors = Tokenize(modelSpec[i],"-");
+
+    if ( factors.size() != 2 ) {
+      UserMessage::Add("Wrong factor definition for global lexical model unlimited: " + modelSpec[i]);
+      //return false;
+    }
+
+    const vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
+    const vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");
+    throw runtime_error("GlobalLexicalModelUnlimited should be reimplemented as a stateful feature");
+    GlobalLexicalModelUnlimited* glmu = NULL; // new GlobalLexicalModelUnlimited(inputFactors, outputFactors, biasFeature, ignorePunctuation, context);
+
+    if (restricted) {
+      cerr << "loading word translation word lists from " << filenameSource << " and " << filenameTarget << endl;
+      if (!glmu->Load(filenameSource, filenameTarget)) {
+        UserMessage::Add("Unable to load word lists for word translation feature from files " + filenameSource + " and " + filenameTarget);
+        //return false;
+      }
+    }
+  }
+}
 
 bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource,
     const std::string &filePathTarget)
diff --git a/moses/GlobalLexicalModelUnlimited.h b/moses/GlobalLexicalModelUnlimited.h
index 747031102..af93559c6 100644
--- a/moses/GlobalLexicalModelUnlimited.h
+++ b/moses/GlobalLexicalModelUnlimited.h
@@ -68,6 +68,7 @@ private:
   std::set<std::string> m_vocabTarget;
 
 public:
+  GlobalLexicalModelUnlimited(const std::string &line);
   GlobalLexicalModelUnlimited(const std::vector< FactorType >& inFactors, const std::vector< FactorType >& outFactors,
       bool biasFeature, bool ignorePunctuation, size_t context):
     StatelessFeatureFunction("glm",ScoreProducer::unlimited),
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index 750cdf23c..34294fa98 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -83,6 +83,21 @@ static size_t CalcMax(size_t x, const vector<size_t>& y, const vector<size_t>& z
   return max;
 }
 
+int GetFeatureIndex(std::map<string, int> &map, const string &featureName)
+{
+  std::map<string, int>::iterator iter;
+  iter = map.find(featureName);
+  if (iter == map.end()) {
+    map[featureName] = 0;
+    return 0;
+  }
+  else {
+    int &index = iter->second;
+    ++index;
+    return index;
+  }
+}
+
 StaticData StaticData::s_instance;
 
 StaticData::StaticData()
@@ -530,14 +545,24 @@ SetWeight(m_unknownWordPenaltyProducer, weightUnknownWord);
   }
 
   // all features
+  map<string, int> featureIndexMap;
+
   const vector<string> &features = m_parameter->GetParam("feature");
   for (size_t i = 0; i < features.size(); ++i) {
     const string &line = features[i];
     vector<string> toks = Tokenize(line);
 
-    if (toks[0] == "GlobalLexicalModel") {
+    const string &feature = toks[0];
+    int featureIndex = GetFeatureIndex(featureIndexMap, feature);
+
+    if (feature == "GlobalLexicalModel") {
       GlobalLexicalModel *model = new GlobalLexicalModel(line);
-      const vector<float> &weights = m_parameter->GetWeights(toks[0], 0);
+      const vector<float> &weights = m_parameter->GetWeights(feature, featureIndex);
+      SetWeights(model, weights);
+    }
+    else if (feature == "glm") {
+      GlobalLexicalModelUnlimited *model = NULL; //new GlobalLexicalModelUnlimited(line);
+      const vector<float> &weights = m_parameter->GetWeights(feature, featureIndex);
       SetWeights(model, weights);
     }
 
@@ -553,7 +578,6 @@ SetWeight(m_unknownWordPenaltyProducer, weightUnknownWord);
   if (!LoadLanguageModels()) return false;
   if (!LoadGenerationTables()) return false;
   if (!LoadPhraseTables()) return false;
-  if (!LoadGlobalLexicalModelUnlimited()) return false;
   if (!LoadDecodeGraphs()) return false;
   if (!LoadReferences()) return false;
   if (!LoadDiscrimLMFeature()) return false;
@@ -601,16 +625,7 @@ SetWeight(m_unknownWordPenaltyProducer, weightUnknownWord);
       UserMessage::Add("Unable to load weights from " + extraWeightConfig[0]);
       return false;
     }
-
-    // GLM: apply additional weight to sparse features if applicable
-    for (size_t i = 0; i < m_globalLexicalModelsUnlimited.size(); ++i) {
-      float weight = m_globalLexicalModelsUnlimited[i]->GetSparseProducerWeight();
-      if (weight != 1) {
-        AddSparseProducer(m_globalLexicalModelsUnlimited[i]);
-        cerr << "glm sparse producer weight: " << weight << endl;
-      }
-    }
-
+
     m_allWeights.PlusEquals(extraWeights);
   }
 
@@ -778,72 +793,6 @@ bool StaticData::LoadLexicalReorderingModel()
   return true;
 }
 
-bool StaticData::LoadGlobalLexicalModelUnlimited()
-{
-  const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-glm"));
-  const vector<string> &modelSpec = m_parameter->GetParam("glm-feature");
-
-  if (weight.size() != 0 && weight.size() != modelSpec.size()) {
-    std::cerr << "number of sparse producer weights and model specs for the global lexical model unlimited "
-              "does not match (" << weight.size() << " != " << modelSpec.size() << ")" << std::endl;
-    return false;
-  }
-
-  for (size_t i = 0; i < modelSpec.size(); i++ ) {
-    bool ignorePunctuation = true, biasFeature = false, restricted = false;
-    size_t context = 0;
-    string filenameSource, filenameTarget;
-    vector< string > factors;
-    vector< string > spec = Tokenize(modelSpec[i]," ");
-
-    // read optional punctuation and bias specifications
-    if (spec.size() > 0) {
-      if (spec.size() != 2 && spec.size() != 3 && spec.size() != 4 && spec.size() != 6) {
-        UserMessage::Add("Format of glm feature is <factor-src>-<factor-tgt> [ignore-punct] [use-bias] "
-                         "[context-type] [filename-src filename-tgt]");
-        return false;
-      }
-
-      factors = Tokenize(spec[0],"-");
-      if (spec.size() >= 2)
-        ignorePunctuation = Scan<bool>(spec[1]);
-      if (spec.size() >= 3)
-        biasFeature = Scan<bool>(spec[2]);
-      if (spec.size() >= 4)
-        context = Scan<size_t>(spec[3]);
-      if (spec.size() == 6) {
-        filenameSource = spec[4];
-        filenameTarget = spec[5];
-        restricted = true;
-      }
-    }
-    else
-      factors = Tokenize(modelSpec[i],"-");
-
-    if ( factors.size() != 2 ) {
-      UserMessage::Add("Wrong factor definition for global lexical model unlimited: " + modelSpec[i]);
-      return false;
-    }
-
-    const vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
-    const vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");
-    throw runtime_error("GlobalLexicalModelUnlimited should be reimplemented as a stateful feature");
-    GlobalLexicalModelUnlimited* glmu = NULL; // new GlobalLexicalModelUnlimited(inputFactors, outputFactors, biasFeature, ignorePunctuation, context);
-
-    m_globalLexicalModelsUnlimited.push_back(glmu);
-    if (restricted) {
-      cerr << "loading word translation word lists from " << filenameSource << " and " << filenameTarget << endl;
-      if (!glmu->Load(filenameSource, filenameTarget)) {
-        UserMessage::Add("Unable to load word lists for word translation feature from files " + filenameSource + " and " + filenameTarget);
-        return false;
-      }
-    }
-    if (weight.size() > i)
-      m_globalLexicalModelsUnlimited[i]->SetSparseProducerWeight(weight[i]);
-  }
-  return true;
-}
-
 bool StaticData::LoadLanguageModels()
 {
   if (m_parameter->GetParam("lmodel-file").size() > 0) {
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 86ae58543..f23845658 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -89,7 +89,6 @@ protected:
   LMList m_languageModel;
   ScoreComponentCollection m_allWeights;
   std::vector<LexicalReordering*> m_reorderModels;
-  std::vector<GlobalLexicalModelUnlimited*> m_globalLexicalModelsUnlimited;
 #ifdef HAVE_SYNLM
   SyntacticLanguageModel* m_syntacticLanguageModel;
 #endif
@@ -249,7 +248,6 @@ protected:
   //! load decoding steps
   bool LoadDecodeGraphs();
   bool LoadLexicalReorderingModel();
-  bool LoadGlobalLexicalModelUnlimited();
   //References used for scoring feature (eg BleuScoreFeature) for online training
   bool LoadReferences();
   bool LoadDiscrimLMFeature();