From ffd7af1168be694f2416597c60dbb965c1a3f70e Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 6 Feb 2014 20:15:25 +0000 Subject: [PATCH 01/23] Fixes to hypergraph and htk outputs, better cmd line help. Mixing boost paths and strings is bad. Leaks in htk output. assert that should be an exception --- moses-cmd/Main.cpp | 9 ++++----- moses/Parameter.cpp | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index f5f52583c..eb5b5c35a 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -180,6 +180,7 @@ public: } else { TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); } + delete file; } // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder @@ -233,7 +234,7 @@ public: } else { stringstream hypergraphDirName; - hypergraphDirName << boost::filesystem::current_path() << "/hypergraph"; + hypergraphDirName << boost::filesystem::current_path().string() << "/hypergraph"; hypergraphDir = hypergraphDirName.str(); } } @@ -527,9 +528,7 @@ size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff } return index+numScoreComps; } else { - cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl; - assert(false); - return 0; + UTIL_THROW2("Sparse features are not yet supported when outputting hypergraph format"); } } @@ -641,7 +640,7 @@ int main(int argc, char** argv) boost::filesystem::path nbestPath(nbestFile); weightsFilename << nbestPath.parent_path().filename() << "/weights"; } else { - weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights"; + weightsFilename << boost::filesystem::current_path().string() << "/hypergraph/weights"; } } boost::filesystem::path weightsFilePath(weightsFilename.str()); diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 1b4683dc0..d4accb3fc 100644 --- 
a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -102,8 +102,8 @@ Parameter::Parameter() AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename"); AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format"); AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses"); - AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)"); - AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)"); + AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF) - the flag should be followed byy a directory name, which must exist"); + AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder). 
This flag is followed by 3 values: 'true (gz|txt|bz) directory-name'"); AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)"); #ifdef HAVE_PROTOBUF AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path."); From 91d6bfe0d5205c77adaf9ebcf8d5da2d8b171862 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 10 Apr 2014 21:34:34 +0100 Subject: [PATCH 02/23] start on FF ReferenceComparison --- contrib/other-builds/moses/.project | 10 ++++++++++ moses/FF/Factory.cpp | 2 ++ moses/FF/SetSourcePhrase.h | 1 + 3 files changed, 13 insertions(+) diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index d8679eb44..c7fd19dcf 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1251,6 +1251,16 @@ 1 PARENT-3-PROJECT_LOC/moses/FF/PhrasePenalty.h + + FF/ReferenceComparison.cpp + 1 + PARENT-3-PROJECT_LOC/moses/FF/ReferenceComparison.cpp + + + FF/ReferenceComparison.h + 1 + PARENT-3-PROJECT_LOC/moses/FF/ReferenceComparison.h + FF/SetSourcePhrase.cpp 1 diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index c42d737cc..731a86047 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -39,6 +39,7 @@ #include "moses/FF/HyperParameterAsWeight.h" #include "moses/FF/SetSourcePhrase.h" #include "CountNonTerms.h" +#include "ReferenceComparison.h" #include "moses/FF/SkeletonStatelessFF.h" #include "moses/FF/SkeletonStatefulFF.h" @@ -181,6 +182,7 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(HyperParameterAsWeight); MOSES_FNAME(SetSourcePhrase); MOSES_FNAME(CountNonTerms); + MOSES_FNAME(ReferenceComparison); MOSES_FNAME(SkeletonStatelessFF); MOSES_FNAME(SkeletonStatefulFF); diff --git a/moses/FF/SetSourcePhrase.h b/moses/FF/SetSourcePhrase.h index bd07ffbd4..6b391baa4 100644 --- a/moses/FF/SetSourcePhrase.h +++ b/moses/FF/SetSourcePhrase.h @@ 
-5,6 +5,7 @@ namespace Moses { +// the only thing this FF does is set TargetPhrase::m_ruleSource so that other FF can use it in Evaluate(Search). class SetSourcePhrase : public StatelessFeatureFunction { public: From 1686686e65e861d257de9d8dfaea3b1db1ac1081 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 10 Apr 2014 21:47:06 +0100 Subject: [PATCH 03/23] start on FF ReferenceComparison --- moses/FF/ReferenceComparison.cpp | 11 ++++++++ moses/FF/ReferenceComparison.h | 45 ++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 moses/FF/ReferenceComparison.cpp create mode 100644 moses/FF/ReferenceComparison.h diff --git a/moses/FF/ReferenceComparison.cpp b/moses/FF/ReferenceComparison.cpp new file mode 100644 index 000000000..b11d133c2 --- /dev/null +++ b/moses/FF/ReferenceComparison.cpp @@ -0,0 +1,11 @@ +#include "ReferenceComparison.h" + +namespace Moses +{ +ReferenceComparison::ReferenceComparison(const std::string &line) +:StatelessFeatureFunction(0, line) +{ +} + +} + diff --git a/moses/FF/ReferenceComparison.h b/moses/FF/ReferenceComparison.h new file mode 100644 index 000000000..d3db29ddd --- /dev/null +++ b/moses/FF/ReferenceComparison.h @@ -0,0 +1,45 @@ +#pragma once +#include +#include "StatelessFeatureFunction.h" + +namespace Moses +{ + +// the only thing this FF does is set TargetPhrase::m_ruleSource so that other FF can use it in Evaluate(Search). 
+class ReferenceComparison : public StatelessFeatureFunction +{ +public: + ReferenceComparison(const std::string &line); + + virtual bool IsUseable(const FactorMask &mask) const + { return true; } + + virtual void Evaluate(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const + {} + + virtual void Evaluate(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore = NULL) const + {} + + virtual void Evaluate(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const + {} + + virtual void EvaluateChart(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const + {} + + std::vector DefaultWeights() const + { return std::vector(); } + + +}; + +} + From 9644a308587689906c7eb630c0a8a425e43dd282 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 11 Apr 2014 10:22:03 +0100 Subject: [PATCH 04/23] add FF RuleAmbiguity --- contrib/other-builds/moses/.project | 10 +++++ moses/FF/Factory.cpp | 2 + moses/FF/RuleAmbiguity.cpp | 61 +++++++++++++++++++++++++++++ moses/FF/RuleAmbiguity.h | 44 +++++++++++++++++++++ moses/Phrase.h | 8 ++++ 5 files changed, 125 insertions(+) create mode 100644 moses/FF/RuleAmbiguity.cpp create mode 100644 moses/FF/RuleAmbiguity.h diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index c7fd19dcf..1a9939c51 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1261,6 +1261,16 @@ 1 PARENT-3-PROJECT_LOC/moses/FF/ReferenceComparison.h + + FF/RuleAmbiguity.cpp + 1 + PARENT-3-PROJECT_LOC/moses/FF/RuleAmbiguity.cpp + + + FF/RuleAmbiguity.h + 1 + PARENT-3-PROJECT_LOC/moses/FF/RuleAmbiguity.h + FF/SetSourcePhrase.cpp 1 diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 731a86047..ddab3df72 100644 --- 
a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -40,6 +40,7 @@ #include "moses/FF/SetSourcePhrase.h" #include "CountNonTerms.h" #include "ReferenceComparison.h" +#include "RuleAmbiguity.h" #include "moses/FF/SkeletonStatelessFF.h" #include "moses/FF/SkeletonStatefulFF.h" @@ -183,6 +184,7 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(SetSourcePhrase); MOSES_FNAME(CountNonTerms); MOSES_FNAME(ReferenceComparison); + MOSES_FNAME(RuleAmbiguity); MOSES_FNAME(SkeletonStatelessFF); MOSES_FNAME(SkeletonStatefulFF); diff --git a/moses/FF/RuleAmbiguity.cpp b/moses/FF/RuleAmbiguity.cpp new file mode 100644 index 000000000..e447eee74 --- /dev/null +++ b/moses/FF/RuleAmbiguity.cpp @@ -0,0 +1,61 @@ +#include "RuleScope.h" +#include "moses/StaticData.h" +#include "moses/Word.h" + +namespace Moses +{ +RuleAmbiguity::RuleAmbiguity(const std::string &line) +:StatelessFeatureFunction(1, line) +,m_sourceSyntax(true) +{ +} + +bool IsAmbiguous(const Word &word, bool sourceSyntax) +{ + const Word &inputDefaultNonTerminal = StaticData::Instance().GetInputDefaultNonTerminal(); + return word.IsNonTerminal() && (!sourceSyntax || word == inputDefaultNonTerminal); +} + +void RuleAmbiguity::Evaluate(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const +{ + // source can't be empty, right? 
+ float score = 0; + + int count = 0; + for (size_t i = 0; i < source.GetSize() - 0; ++i) { + const Word &word = source.GetWord(i); + bool ambiguous = IsAmbiguous(word, m_sourceSyntax); + if (ambiguous) { + ++count; + } + else { + if (count > 0) { + score += count; + } + count = -1; + } + } + + // 1st & last always adjacent to ambiguity + ++count; + if (count > 0) { + score += count; + } + + scoreBreakdown.PlusEquals(this, score); +} + +void RuleAmbiguity::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "source-syntax") { + m_sourceSyntax = Scan(value); + } else { + StatelessFeatureFunction::SetParameter(key, value); + } +} + +} + diff --git a/moses/FF/RuleAmbiguity.h b/moses/FF/RuleAmbiguity.h new file mode 100644 index 000000000..83cd0272f --- /dev/null +++ b/moses/FF/RuleAmbiguity.h @@ -0,0 +1,44 @@ +#pragma once +#include +#include "StatelessFeatureFunction.h" + +namespace Moses +{ + +// the only thing this FF does is set TargetPhrase::m_ruleSource so that other FF can use it in Evaluate(Search). 
+class RuleAmbiguity : public StatelessFeatureFunction +{ +public: + RuleAmbiguity(const std::string &line); + + virtual bool IsUseable(const FactorMask &mask) const + { return true; } + + virtual void Evaluate(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; + + virtual void Evaluate(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore = NULL) const + {} + + virtual void Evaluate(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const + {} + + virtual void EvaluateChart(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const + {} + + void SetParameter(const std::string& key, const std::string& value); + +protected: + bool m_sourceSyntax; +}; + +} + diff --git a/moses/Phrase.h b/moses/Phrase.h index 1de00bfdf..55fb2bdf5 100644 --- a/moses/Phrase.h +++ b/moses/Phrase.h @@ -121,6 +121,14 @@ public: return m_words[GetSize() - 1]; } + inline const Word &Front() const { + return m_words[0]; + } + + inline const Word &Back() const { + return m_words[GetSize() - 1]; + } + //! particular factor at a particular position inline const Factor *GetFactor(size_t pos, FactorType factorType) const { const Word &ptr = m_words[pos]; From 1e116a21aef9db3ef26f451309aa0efe4180eecf Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 11 Apr 2014 10:29:57 +0100 Subject: [PATCH 05/23] add FF RuleAmbiguity --- moses/FF/RuleAmbiguity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/FF/RuleAmbiguity.h b/moses/FF/RuleAmbiguity.h index 83cd0272f..436e2fa58 100644 --- a/moses/FF/RuleAmbiguity.h +++ b/moses/FF/RuleAmbiguity.h @@ -5,7 +5,7 @@ namespace Moses { -// the only thing this FF does is set TargetPhrase::m_ruleSource so that other FF can use it in Evaluate(Search). 
+// similar to Scope, however, adjacent non-terms count as 1 ambiguity, rather than 2 class RuleAmbiguity : public StatelessFeatureFunction { public: From 0ec2fe016990bd3f6de51316bfada5650ea2c22f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 11 Apr 2014 10:42:38 +0100 Subject: [PATCH 06/23] add FF RuleAmbiguity --- moses/FF/RuleAmbiguity.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/FF/RuleAmbiguity.cpp b/moses/FF/RuleAmbiguity.cpp index e447eee74..8f8760d28 100644 --- a/moses/FF/RuleAmbiguity.cpp +++ b/moses/FF/RuleAmbiguity.cpp @@ -1,4 +1,4 @@ -#include "RuleScope.h" +#include "RuleAmbiguity.h" #include "moses/StaticData.h" #include "moses/Word.h" From e347020049ed4233c6ae00fbdf1ca7d5eb91519d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 11 Apr 2014 11:03:50 +0100 Subject: [PATCH 07/23] add FF ReferenceComparison --- moses/FF/ReferenceComparison.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/moses/FF/ReferenceComparison.h b/moses/FF/ReferenceComparison.h index d3db29ddd..aef7be493 100644 --- a/moses/FF/ReferenceComparison.h +++ b/moses/FF/ReferenceComparison.h @@ -5,7 +5,8 @@ namespace Moses { -// the only thing this FF does is set TargetPhrase::m_ruleSource so that other FF can use it in Evaluate(Search). +// Count how many hypotheses are in each stack, compare score with reference hypo +// NOT threadsafe. 
class ReferenceComparison : public StatelessFeatureFunction { public: @@ -38,6 +39,7 @@ public: std::vector DefaultWeights() const { return std::vector(); } +protected: }; From 46cef770b75e4a17693ab1d03b96337b4de838f4 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sat, 12 Apr 2014 17:20:34 +0200 Subject: [PATCH 08/23] add header allowOrigin * --- contrib/server/mosesserver.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 25e2cb0ed..80eab8f20 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -512,7 +512,7 @@ int main(int argc, char** argv) xmlrpc_limit_set(XMLRPC_XML_SIZE_LIMIT_ID, 512*1024*1024); xmlrpc_c::registry myRegistry; - + xmlrpc_c::methodPtr const translator(new Translator); xmlrpc_c::methodPtr const updater(new Updater); xmlrpc_c::methodPtr const optimizer(new Optimizer); @@ -522,9 +522,11 @@ int main(int argc, char** argv) myRegistry.addMethod("optimize", optimizer); xmlrpc_c::serverAbyss myAbyssServer( - myRegistry, - port, // TCP port on which to listen - logfile + xmlrpc_c::serverAbyss::constrOpt() + .registryPtr(&myRegistry) + .portNumber(port) // TCP port on which to listen + .logFileName(logfile) + .allowOrigin("*") ); cerr << "Listening on port " << port << endl; From 66d0fe81e295dfd976941b2e1112c65d7903c8b0 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 15 Apr 2014 17:34:21 +0100 Subject: [PATCH 09/23] moses_chart: add ChartKBestExtractor (not enabled yet) Implements algorithm 3 from Huang and Chiang (2005) --- moses/ChartHypothesis.cpp | 16 +++ moses/ChartHypothesis.h | 4 + moses/ChartKBestExtractor.cpp | 258 ++++++++++++++++++++++++++++++++++ moses/ChartKBestExtractor.h | 125 ++++++++++++++++ 4 files changed, 403 insertions(+) create mode 100644 moses/ChartKBestExtractor.cpp create mode 100644 moses/ChartKBestExtractor.h diff --git a/moses/ChartHypothesis.cpp b/moses/ChartHypothesis.cpp index 
01eb49ccc..212a28d23 100644 --- a/moses/ChartHypothesis.cpp +++ b/moses/ChartHypothesis.cpp @@ -66,6 +66,22 @@ ChartHypothesis::ChartHypothesis(const ChartTranslationOptions &transOpt, } } +// Intended to be used by ChartKBestExtractor only. This creates a mock +// ChartHypothesis for use by the extractor's top-level target vertex. +ChartHypothesis::ChartHypothesis(const ChartHypothesis &pred, + const ChartKBestExtractor & /*unused*/) + :m_currSourceWordsRange(pred.m_currSourceWordsRange) + ,m_scoreBreakdown(pred.m_scoreBreakdown) + ,m_totalScore(pred.m_totalScore) + ,m_arcList(NULL) + ,m_winningHypo(NULL) + ,m_manager(pred.m_manager) + ,m_id(pred.m_manager.GetNextHypoId()) +{ + // One predecessor, which is an existing top-level ChartHypothesis. + m_prevHypos.push_back(&pred); +} + ChartHypothesis::~ChartHypothesis() { // delete feature function states diff --git a/moses/ChartHypothesis.h b/moses/ChartHypothesis.h index 6e2facb9c..532d757c9 100644 --- a/moses/ChartHypothesis.h +++ b/moses/ChartHypothesis.h @@ -31,6 +31,7 @@ namespace Moses { +class ChartKBestExtractor; class ChartHypothesis; class ChartManager; class RuleCubeItem; @@ -74,6 +75,9 @@ protected: //! not implemented ChartHypothesis(const ChartHypothesis &copy); + //! 
only used by ChartKBestExtractor + ChartHypothesis(const ChartHypothesis &, const ChartKBestExtractor &); + public: #ifdef USE_HYPO_POOL void *operator new(size_t /* num_bytes */) { diff --git a/moses/ChartKBestExtractor.cpp b/moses/ChartKBestExtractor.cpp new file mode 100644 index 000000000..60c066191 --- /dev/null +++ b/moses/ChartKBestExtractor.cpp @@ -0,0 +1,258 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2014 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "ChartKBestExtractor.h" + +#include "ChartHypothesis.h" +#include "ScoreComponentCollection.h" +#include "StaticData.h" + +#include + +#include + +namespace Moses +{ + +// Extract the k-best list from the search graph. +void ChartKBestExtractor::Extract( + const std::vector &topHypos, std::size_t k, + KBestVec &kBestList) +{ + typedef std::vector HypoVec; + + kBestList.clear(); + if (topHypos.empty()) { + return; + } + + // Create a new top-level ChartHypothesis that has the best hypothesis as its + // predecessor. This is the search hypergraph's target vertex. 
+ HypoVec::const_iterator iter = topHypos.begin(); + boost::scoped_ptr<ChartHypothesis> supremeHypo( + new ChartHypothesis(**iter, *this)); + + // Do the same for each alternative top-level hypothesis, but add the new + // ChartHypothesis objects as arcs from supremeHypo, as if they had been + // recombined. + float prevScore = (*iter)->GetTotalScore(); + for (++iter; iter != topHypos.end(); ++iter) { + // Check that the first item in topHypos really was the best (ties are OK). + UTIL_THROW_IF2((*iter)->GetTotalScore() > prevScore, + "top-level vertices are not correctly sorted"); + // Note: there's no need for a smart pointer here: supremeHypo will take + // ownership of altHypo. + ChartHypothesis *altHypo = new ChartHypothesis(**iter, *this); + supremeHypo->AddArc(altHypo); + } + + // Create the target vertex corresponding to supremeHypo then generate + // its k-best list. + boost::shared_ptr<Vertex> top = FindOrCreateVertex(*supremeHypo); + LazyKthBest(*top, k, k); + + // Copy the k-best list from the target vertex, but drop the top edge from + // each derivation. + kBestList.reserve(top->kBestList.size()); + for (KBestVec::const_iterator p = top->kBestList.begin(); + p != top->kBestList.end(); ++p) { + const Derivation &d = **p; + assert(d.edge.tail.size() == 1); // d should have exactly one predecessor. + assert(d.backPointers.size() == 1); + std::size_t i = d.backPointers[0]; + boost::shared_ptr<Derivation> pred = d.edge.tail[0]->kBestList[i]; + kBestList.push_back(pred); + } +} + +// Generate the target-side yield of the derivation d. 
+Phrase ChartKBestExtractor::GetOutputPhrase(const Derivation &d) +{ + FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor(); + + Phrase ret(ARRAY_SIZE_INCR); + + const ChartHypothesis &hypo = d.edge.head->hypothesis; + const TargetPhrase &phrase = hypo.GetCurrTargetPhrase(); + const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = + phrase.GetAlignNonTerm().GetNonTermIndexMap(); + for (std::size_t pos = 0; pos < phrase.GetSize(); ++pos) { + const Word &word = phrase.GetWord(pos); + if (word.IsNonTerminal()) { + std::size_t nonTermInd = nonTermIndexMap[pos]; + const Derivation &subderivation = + *d.edge.tail[nonTermInd]->kBestList[d.backPointers[nonTermInd]]; + Phrase subPhrase = GetOutputPhrase(subderivation); + ret.Append(subPhrase); + } else { + ret.AddWord(word); + if (placeholderFactor == NOT_FOUND) { + continue; + } + std::set sourcePosSet = + phrase.GetAlignTerm().GetAlignmentsForTarget(pos); + if (sourcePosSet.size() == 1) { + const std::vector *ruleSourceFromInputPath = + hypo.GetTranslationOption().GetSourceRuleFromInputPath(); + UTIL_THROW_IF2(ruleSourceFromInputPath == NULL, + "Source Words in of the rules hasn't been filled out"); + std::size_t sourcePos = *sourcePosSet.begin(); + const Word *sourceWord = ruleSourceFromInputPath->at(sourcePos); + UTIL_THROW_IF2(sourceWord == NULL, + "Null source word at position " << sourcePos); + const Factor *factor = sourceWord->GetFactor(placeholderFactor); + if (factor) { + ret.Back()[0] = factor; + } + } + } + } + + return ret; +} + +// Create an unweighted hyperarc corresponding to the given ChartHypothesis. 
+ChartKBestExtractor::UnweightedHyperarc ChartKBestExtractor::CreateEdge( + const ChartHypothesis &h) +{ + UnweightedHyperarc edge; + edge.head = FindOrCreateVertex(h); + const std::vector &prevHypos = h.GetPrevHypos(); + edge.tail.resize(prevHypos.size()); + for (std::size_t i = 0; i < prevHypos.size(); ++i) { + const ChartHypothesis *prevHypo = prevHypos[i]; + edge.tail[i] = FindOrCreateVertex(*prevHypo); + } + return edge; +} + +void ChartKBestExtractor::GetCandidates(Vertex &v, std::size_t k) +{ + // Create a derivation for v's best incoming edge. + UnweightedHyperarc bestEdge = CreateEdge(v.hypothesis); + boost::shared_ptr d(new Derivation(bestEdge)); + v.candidates.push(d); + v.seen.insert(d); + // Create derivations for the rest of v's incoming edges. + const ChartArcList *arcList = v.hypothesis.GetArcList(); + if (arcList) { + for (std::size_t i = 0; i < arcList->size(); ++i) { + const ChartHypothesis &recombinedHypo = *(*arcList)[i]; + UnweightedHyperarc edge = CreateEdge(recombinedHypo); + boost::shared_ptr d(new Derivation(edge)); + v.candidates.push(d); + v.seen.insert(d); + } + } +} + +// Look for the vertex corresponding to a given ChartHypothesis, creating +// a new one if necessary. +boost::shared_ptr +ChartKBestExtractor::FindOrCreateVertex(const ChartHypothesis &h) +{ + VertexMap::value_type element(&h, boost::shared_ptr()); + std::pair p = m_vertexMap.insert(element); + boost::shared_ptr &sp = p.first->second; + if (!p.second) { + return sp; // Vertex was already in m_vertexMap. + } + sp.reset(new Vertex(h)); + return sp; +} + +void ChartKBestExtractor::LazyKthBest(Vertex &v, std::size_t k, + std::size_t globalK) +{ + // If this is the first visit to vertex v then initialize the priority queue. + if (v.visited == false) { + GetCandidates(v, globalK); + v.visited = true; + } + // Add derivations to the k-best list until it contains k or there are none + // left to add. 
+ while (v.kBestList.size() < k) { + if (!v.kBestList.empty()) { + // Update the priority queue by adding the successors of the last + // derivation (unless they've been seen before). + const Derivation &d = *v.kBestList.back(); + LazyNext(v, d, globalK); + } + // Check if there are any derivations left in the queue. + if (v.candidates.empty()) { + break; + } + // Get the next best derivation and delete it from the queue. + boost::shared_ptr d = v.candidates.top(); + v.candidates.pop(); + // Add it to the k-best list. + v.kBestList.push_back(d); + } +} + +void ChartKBestExtractor::LazyNext(Vertex &v, const Derivation &d, + std::size_t globalK) +{ + // Create the neighbours of Derivation d. + for (std::size_t i = 0; i < d.backPointers.size(); ++i) { + Vertex &predVertex = *d.edge.tail[i]; + // Ensure that predVertex's k-best list contains enough derivations. + std::size_t k = d.backPointers[i] + 2; + LazyKthBest(predVertex, k, globalK); + if (predVertex.kBestList.size() < k) { + // predVertex's derivations have been exhausted. + continue; + } + // Create the neighbour. + boost::shared_ptr next(new Derivation(d, i)); + // Check if it has been created before. + std::pair p = v.seen.insert(next); + if (p.second) { + v.candidates.push(next); // Haven't previously seen it. + } + } +} + +// Construct a Derivation corresponding to a ChartHypothesis. +ChartKBestExtractor::Derivation::Derivation(const UnweightedHyperarc &e) +{ + edge = e; + backPointers.resize(edge.tail.size(), 0); + scoreBreakdown = edge.head->hypothesis.GetScoreBreakdown(); + score = edge.head->hypothesis.GetTotalScore(); +} + +// Construct a Derivation that neighbours an existing Derivation. +ChartKBestExtractor::Derivation::Derivation(const Derivation &d, std::size_t i) +{ + edge.head = d.edge.head; + edge.tail = d.edge.tail; + backPointers = d.backPointers; + std::size_t j = ++backPointers[i]; + scoreBreakdown = d.scoreBreakdown; + // Deduct the score of the old subderivation. 
+ const Derivation &oldSubderivation = *(edge.tail[i]->kBestList[j-1]); + scoreBreakdown.MinusEquals(oldSubderivation.scoreBreakdown); + // Add the score of the new subderivation. + const Derivation &newSubderivation = *(edge.tail[i]->kBestList[j]); + scoreBreakdown.PlusEquals(newSubderivation.scoreBreakdown); + score = scoreBreakdown.GetWeightedScore(); +} + +} // namespace Moses diff --git a/moses/ChartKBestExtractor.h b/moses/ChartKBestExtractor.h new file mode 100644 index 000000000..07df7eacb --- /dev/null +++ b/moses/ChartKBestExtractor.h @@ -0,0 +1,125 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2014 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include "ChartHypothesis.h" +#include "ScoreComponentCollection.h" + +#include + +#include +#include + +namespace Moses +{ + +// k-best list extractor that implements algorithm 3 from this paper: +// +// Liang Huang and David Chiang +// "Better k-best parsing" +// In Proceedings of IWPT 2005 +// +class ChartKBestExtractor +{ +public: + struct Vertex; + + struct UnweightedHyperarc { + boost::shared_ptr head; + std::vector > tail; + }; + + struct Derivation { + Derivation(const UnweightedHyperarc &); + Derivation(const Derivation &, std::size_t); + + UnweightedHyperarc edge; + std::vector backPointers; + ScoreComponentCollection scoreBreakdown; + float score; + }; + + struct DerivationOrderer { + bool operator()(const boost::shared_ptr &d1, + const boost::shared_ptr &d2) const { + return d1->score < d2->score; + } + }; + + struct DerivationHasher { + std::size_t operator()(const boost::shared_ptr &d) const { + std::size_t seed = 0; + boost::hash_combine(seed, d->edge.head); + boost::hash_combine(seed, d->edge.tail); + boost::hash_combine(seed, d->backPointers); + return seed; + } + }; + + struct DerivationEqualityPred { + bool operator()(const boost::shared_ptr &d1, + const boost::shared_ptr &d2) const { + return d1->edge.head == d2->edge.head && + d1->edge.tail == d2->edge.tail && + d1->backPointers == d2->backPointers; + } + }; + + struct Vertex { + typedef std::priority_queue, + std::vector >, + DerivationOrderer> DerivationQueue; + + typedef boost::unordered_set, + DerivationHasher, + DerivationEqualityPred> DerivationSet; + + Vertex(const ChartHypothesis &h) : hypothesis(h), visited(false) {} + + const ChartHypothesis &hypothesis; + std::vector > 
kBestList; + DerivationQueue candidates; + DerivationSet seen; + bool visited; + }; + + typedef std::vector > KBestVec; + + // Extract the k-best list from the search hypergraph given the full, sorted + // list of top-level vertices. + void Extract(const std::vector &topHypos, + std::size_t k, KBestVec &); + + static Phrase GetOutputPhrase(const Derivation &); + +private: + typedef boost::unordered_map > VertexMap; + + UnweightedHyperarc CreateEdge(const ChartHypothesis &); + boost::shared_ptr FindOrCreateVertex(const ChartHypothesis &); + void GetCandidates(Vertex &, std::size_t); + void LazyKthBest(Vertex &, std::size_t, std::size_t); + void LazyNext(Vertex &, const Derivation &, std::size_t); + + VertexMap m_vertexMap; +}; + +} // namespace Moses From 5e3e50d4ec922c119a972387bfc1c2fe3c0ca9fb Mon Sep 17 00:00:00 2001 From: Nadir Durrani Date: Wed, 16 Apr 2014 17:28:49 +0100 Subject: [PATCH 10/23] In-Decoding Transliteration Module --- .../in-decoding-transliteration.pl | 230 ++++++++++++++++++ scripts/ems/experiment.meta | 19 +- scripts/ems/experiment.perl | 15 +- scripts/training/train-model.perl | 17 +- 4 files changed, 270 insertions(+), 11 deletions(-) create mode 100755 scripts/Transliteration/in-decoding-transliteration.pl diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl new file mode 100755 index 000000000..e4f0503a8 --- /dev/null +++ b/scripts/Transliteration/in-decoding-transliteration.pl @@ -0,0 +1,230 @@ +#!/usr/bin/perl -w + +use strict; + +use utf8; +use File::Basename; +use Getopt::Long "GetOptions"; +use FindBin qw($RealBin); +use Scalar::Util qw(looks_like_number); +use IO::Handle; +binmode(STDIN, ':utf8'); +binmode(STDOUT, ':utf8'); +binmode(STDERR, ':utf8'); + +my $___FACTOR_DELIMITER = "|"; +my $OUT_FILE = "/tmp/transliteration-phrase-table.$$"; + +my ($MOSES_SRC_DIR,$TRANSLIT_MODEL,$OOV_FILE, $OOV_FILE_NAME, $EXTERNAL_BIN_DIR, $LM_FILE, $INPUT_EXTENSION, 
$OUTPUT_EXTENSION); +die("ERROR: wrong syntax when invoking postDecodingTransliteration.perl") + unless &GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR, + 'external-bin-dir=s' => \$EXTERNAL_BIN_DIR, + 'transliteration-model-dir=s' => \$TRANSLIT_MODEL, + 'input-extension=s' => \$INPUT_EXTENSION, + 'output-extension=s' => \$OUTPUT_EXTENSION, + 'transliteration-file=s' => \$OOV_FILE, + 'out-file=s' => \$OUT_FILE); + +# check if the files are in place +die("ERROR: you need to define --moses-src-dir --external-bin-dir, --transliteration-model-dir, --transliteration-file, --input-extension, and --output-extension") + unless (defined($MOSES_SRC_DIR) && + defined($TRANSLIT_MODEL) && + defined($OOV_FILE) && + defined($INPUT_EXTENSION)&& + defined($OUTPUT_EXTENSION)&& + defined($EXTERNAL_BIN_DIR)); + +die("ERROR: could not find Transliteration Model '$TRANSLIT_MODEL'") + unless -e $TRANSLIT_MODEL; +die("ERROR: could not find Transliteration file $OOV_FILE'") + unless -e $OOV_FILE; + +$OOV_FILE_NAME = basename ($OOV_FILE); + +`mkdir $TRANSLIT_MODEL/evaluation`; +`cp $OOV_FILE $TRANSLIT_MODEL/evaluation/`; +my $translitFile = $TRANSLIT_MODEL . "/evaluation/" . 
$OOV_FILE_NAME; + +print "Preparing for Transliteration\n"; +prepare_for_transliteration ($OOV_FILE, $translitFile); +print "Run Transliteration\n"; +run_transliteration ($MOSES_SRC_DIR , $EXTERNAL_BIN_DIR , $TRANSLIT_MODEL , $OOV_FILE_NAME); +print "Pick Best Transliteration\n"; +form_corpus ($translitFile , $translitFile.".op.nBest" , $OUT_FILE); + + +################### Read the UNK word file and prepare for Transliteration ############################### + +sub prepare_for_transliteration +{ + my @list = @_; + my $testFile = $list[0]; + my $translitFile = $list[1]; + my %UNK; + my @words; + my $src; + my @tW; + + open MYFILE, "<:encoding(UTF-8)", $testFile or die "Can't open $testFile: $!\n"; + + while () + { + chomp; + #print "$_\n"; + @words = split(/ /, "$_"); + + foreach (@words) + { + + @tW = split /\Q$___FACTOR_DELIMITER/; + + if (defined $tW[0]) + { + + if (! ($tW[0] =~ /[0-9.,]/)) + { + $UNK{$tW[0]} = 1; + } + else + { + print "Not transliterating $tW[0] \n"; + } + } + } + } + close (MYFILE); + + open MYFILE, ">:encoding(UTF-8)", $translitFile or die "Can't open $translitFile: $!\n"; + + foreach my $key ( keys %UNK ) + { + $src=join(' ', split('',$key)); + print MYFILE "$src\n"; + } + close (MYFILE); +} + +################### Run Transliteration Module to Obtain Transliterations ############################### + +sub run_transliteration +{ + my @list = @_; + my $MOSES_SRC = $list[0]; + my $EXTERNAL_BIN_DIR = $list[1]; + my $TRANSLIT_MODEL = $list[2]; + my $eval_file = $list[3]; + + `touch $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`; + + print "Filter Table\n"; + + `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini -lm 
0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`; + + `$MOSES_SRC/scripts/training/filter-model-given-input.pl $TRANSLIT_MODEL/evaluation/$eval_file.filtered $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini $TRANSLIT_MODEL/evaluation/$eval_file -Binarizer "$MOSES_SRC/bin/processPhraseTable"`; + + `rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`; + + print "Apply Filter\n"; + + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`; + + `$MOSES_SRC/bin/moses -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 100 distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini < $TRANSLIT_MODEL/evaluation/$eval_file > $TRANSLIT_MODEL/evaluation/$eval_file.op`; + +} + +################### Read the output of Transliteration Model and Form Corpus ############################### + + +sub form_corpus +{ + + my @list = @_; + my $inp_file = $list[0]; + my $testFile = $list[1]; + my @words; + my $thisStr; + my $features; + my $prev = 0; + my $sNum; + my @UNK; + my %vocab; + + my $antLog = exp(0.2); + my $phraseTable = $list[2]; + + open MYFILE, "<:encoding(UTF-8)", $inp_file or die "Can't open $inp_file: $!\n"; + open PT, ">:encoding(UTF-8)", $phraseTable or die "Can't open $phraseTable: $!\n"; + + while () + { + chomp; + #print "$_\n"; + @words = split(/ /, "$_"); + + $thisStr = ""; + foreach (@words) + { + $thisStr = $thisStr . 
"$_"; + } + + push(@UNK, $thisStr); + $vocab{$thisStr} = 1; + } + close (MYFILE); + + open MYFILE, "<:encoding(UTF-8)", $testFile or die "Can't open $testFile: $!\n"; + my $inpCount = 0; + + while () + { + chomp; + #print "$_\n"; + @words = split(/ /, "$_"); + + $sNum = $words[0]; + + if ($prev != $sNum){ + $inpCount++; + } + + my $i = 2; + $thisStr = ""; + $features = ""; + + while ($words[$i] ne "|||") + { + $thisStr = $thisStr . $words[$i]; + $i++; + } + + $i++; + + while ($words[$i] ne "|||") + { + if ($words[$i] =~ /Penalty0/ || $words[$i] eq "Distortion0=" || $words[$i] eq "LM0=" ){ + $i++; + } + elsif (looks_like_number($words[$i])){ + $features = $features . " " . exp($words[$i]); + } + + $i++; + } + $i++; + + #$features = $features . " " . $words[$i]; + + if ($thisStr ne ""){ + print PT "$UNK[$inpCount] ||| $thisStr ||| $features ||| 0-0 ||| 0 0 0\n"; + } + $prev = $sNum; + } + close (MYFILE); + close (PT); + + + `gzip $phraseTable`; + +} + + diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index e2b21019d..83d597aa0 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -533,6 +533,13 @@ build-transliteration-model ignore-unless: transliteration-module rerun-on-change: transliteration-module training-options script giza-settings default-name: model/Transliteration +build-translit-table + in: transliteration-model + out: transliteration-table + ignore-unless: in-decoding-transliteration + rerun-on-change: in-decoding-transliteration transliteration-module + default-name: model/transliteration-phrase-table + template: $moses-script-dir/Transliteration/in-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN --input-extension $input-extension --output-extension $output-extension --transliteration-file $transliteration-file --out-file OUT extract-phrases in: corpus-mml-postfilter=OR=word-alignment scored-corpus out: extracted-phrases @@ -601,7 
+608,7 @@ build-sparse default-name: model/sparse-features template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features" create-config - in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-model generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm + in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm out: config ignore-if: use-hiero rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini @@ -863,7 +870,7 @@ split-reference-devtest multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-splitter -model IN1.$output-extension < IN > OUT filter - in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains + in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table out: filtered-dir default-name: tuning/filtered rerun-on-change: filter-settings ttable-binarizer @@ -989,8 +996,8 @@ split-input pass-unless: input-splitter template: $input-splitter -model IN1.$input-extension < IN > OUT filter - in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains - out: filtered-dir + in: input TRAINING:sigtest-filter-phrase-translation-table 
TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table + out: filtered-dir default-name: evaluation/filtered rerun-on-change: filter-settings report-precision-by-coverage ttable-binarizer pass-if: TRAINING:binarize-all @@ -1027,11 +1034,11 @@ remove-markup pass-unless: report-segmentation template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT post-decoding-transliteration - in: cleaned-output system-output TRAINING:transliteration-model LM:binlm + in: cleaned-output system-output TRAINING:transliteration-model out: transliterated-output default-name: evaluation/transliterated pass-unless: TRAINING:post-decoding-transliteration - template: $moses-script-dir/Transliteration/post-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN2 --input-extension $input-extension --output-extension $output-extension --language-model IN3 --output-file IN0 --oov-file IN1.oov + template: $moses-script-dir/Transliteration/post-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN2 --input-extension $input-extension --output-extension $output-extension --language-model $TRAINING:language-model-file --output-file IN0 --oov-file IN1.oov recase-output in: transliterated-output RECASING:recase-config out: recased-output diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 761c7a694..f6a7e4db3 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2233,11 +2233,15 @@ sub get_config_tables { sub define_training_create_config { my ($step_id) = @_; - my ($config,$reordering_table,$phrase_translation_table,$translit_model,$generation_table,$sparse_lexical_features,$domains,$osm, @LM) + my 
($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm, @LM) = &get_output_and_input($step_id); my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains); + if($transliteration_pt){ + $cmd .= "-transliteration-phrase-table $transliteration_pt "; + } + if($osm){ my $osm_settings = &get("TRAINING:operation-sequence-model-settings"); @@ -2623,7 +2627,7 @@ sub define_tuningevaluation_filter { my $tuning_flag = !defined($set); my $hierarchical = &get("TRAINING:hierarchical-rule-set"); - my ($filter_dir,$input,$phrase_translation_table,$reordering_table,$domains) = &get_output_and_input($step_id); + my ($filter_dir,$input,$phrase_translation_table,$reordering_table,$domains,$transliteration_table) = &get_output_and_input($step_id); my $binarizer; $binarizer = &backoff_and_get("EVALUATION:$set:ttable-binarizer") unless $tuning_flag; @@ -2683,7 +2687,14 @@ sub define_tuningevaluation_filter { $cmd .= &get_config_tables($config,$reordering_table,$phrase_translation_table,undef,$domains); + if (&get("TRAINING:in-decoding-transliteration")) { + + $cmd .= "-transliteration-phrase-table $dir/model/transliteration-phrase-table.$VERSION "; + } + + $cmd .= "-lm 0:3:$config:8\n"; # dummy kenlm 3-gram model on factor 0 + } # filter command diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 3764ab0c2..46a7e1fe6 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -31,7 +31,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_ $_DECODING_GRAPH_BACKOFF, $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE, @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS, - $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, + $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, 
$_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE, $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS, $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE, $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, @@ -122,7 +122,8 @@ $_HELP = 1 'config=s' => \$_CONFIG, 'osm-model=s' => \$_OSM, 'osm-setting=s' => \$_OSM_FACTORS, - 'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT, + 'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT, + 'transliteration-phrase-table=s' => \$_TRANSLITERATION_PHRASE_TABLE, 'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING, 'do-steps=s' => \$_DO_STEPS, 'memscore:s' => \$_MEMSCORE, @@ -1879,6 +1880,8 @@ sub create_ini { $path++; } print INI "1 T 1\n" if $_GLUE_GRAMMAR; + + print INI "1 T 1\n" if $_TRANSLITERATION_PHRASE_TABLE; if (defined($_DECODING_GRAPH_BACKOFF)) { $_DECODING_GRAPH_BACKOFF =~ s/\s+/ /g; @@ -1962,6 +1965,13 @@ sub create_ini { exit 1 if $i < $stepsused{"T"}; # fatal to define less } + if ($_TRANSLITERATION_PHRASE_TABLE){ + + $feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i table-limit=100 num-features=4 path=$_TRANSLITERATION_PHRASE_TABLE input-factor=0 output-factor=0\n"; + $weight_spec .= "TranslationModel$i= 0.2 0.2 0.2 0.2\n"; + $i++; + } + # glue grammar if ($_GLUE_GRAMMAR) { &full_path(\$___GLUE_GRAMMAR_FILE); @@ -2069,8 +2079,9 @@ sub create_ini { my $lm_oov_prob = 0.1; - if ($_POST_DECODING_TRANSLIT){ + if ($_POST_DECODING_TRANSLIT || $_TRANSLITERATION_PHRASE_TABLE){ $lm_oov_prob = -100.0; + $_LMODEL_OOV_FEATURE = "yes"; } $feature_spec .= "$type_name name=LM$i factor=$f path=$fn order=$o\n"; From d90aaf101839f172994142a14cd6c5908a5f962e Mon Sep 17 00:00:00 2001 From: Nadir Durrani Date: Wed, 16 Apr 2014 17:40:49 +0100 Subject: [PATCH 11/23] Z --- 
scripts/Transliteration/in-decoding-transliteration.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl index e4f0503a8..237aec587 100755 --- a/scripts/Transliteration/in-decoding-transliteration.pl +++ b/scripts/Transliteration/in-decoding-transliteration.pl @@ -16,7 +16,7 @@ my $___FACTOR_DELIMITER = "|"; my $OUT_FILE = "/tmp/transliteration-phrase-table.$$"; my ($MOSES_SRC_DIR,$TRANSLIT_MODEL,$OOV_FILE, $OOV_FILE_NAME, $EXTERNAL_BIN_DIR, $LM_FILE, $INPUT_EXTENSION, $OUTPUT_EXTENSION); -die("ERROR: wrong syntax when invoking postDecodingTransliteration.perl") +die("ERROR: wrong syntax when invoking in-decoding-transliteration.perl") unless &GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR, 'external-bin-dir=s' => \$EXTERNAL_BIN_DIR, 'transliteration-model-dir=s' => \$TRANSLIT_MODEL, From a4d32a2b090be41969b651fcad2df34fd824cbae Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 17 Apr 2014 20:04:02 +0100 Subject: [PATCH 12/23] minor compile errors in ChartKBestExtractor --- contrib/other-builds/moses/.project | 10 ++++++++++ moses/ChartHypothesis.h | 6 +++--- moses/ChartKBestExtractor.cpp | 2 +- moses/ChartKBestExtractor.h | 1 + 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 1a9939c51..cf311ed9c 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -166,6 +166,16 @@ 1 PARENT-3-PROJECT_LOC/moses/ChartHypothesisCollection.h + + ChartKBestExtractor.cpp + 1 + PARENT-3-PROJECT_LOC/moses/ChartKBestExtractor.cpp + + + ChartKBestExtractor.h + 1 + PARENT-3-PROJECT_LOC/moses/ChartKBestExtractor.h + ChartManager.cpp 1 diff --git a/moses/ChartHypothesis.h b/moses/ChartHypothesis.h index 532d757c9..150b53fd0 100644 --- a/moses/ChartHypothesis.h +++ b/moses/ChartHypothesis.h @@ -75,9 +75,6 @@ protected: //! 
not implemented ChartHypothesis(const ChartHypothesis ©); - //! only used by ChartKBestExtractor - ChartHypothesis(const ChartHypothesis &, const ChartKBestExtractor &); - public: #ifdef USE_HYPO_POOL void *operator new(size_t /* num_bytes */) { @@ -96,6 +93,9 @@ public: } #endif + //! only used by ChartKBestExtractor + ChartHypothesis(const ChartHypothesis &, const ChartKBestExtractor &); + ChartHypothesis(const ChartTranslationOptions &, const RuleCubeItem &item, ChartManager &manager); diff --git a/moses/ChartKBestExtractor.cpp b/moses/ChartKBestExtractor.cpp index 60c066191..05f8920c5 100644 --- a/moses/ChartKBestExtractor.cpp +++ b/moses/ChartKBestExtractor.cpp @@ -73,7 +73,7 @@ void ChartKBestExtractor::Extract( for (KBestVec::const_iterator p = top->kBestList.begin(); p != top->kBestList.end(); ++p) { const Derivation &d = **p; - assert(d.edge->tail.size() == 1); // d should have exactly one predecessor. + assert(d.edge.tail.size() == 1); // d should have exactly one predecessor. assert(d.backPointers.size() == 1); std::size_t i = d.backPointers[0]; boost::shared_ptr pred = d.edge.tail[0]->kBestList[i]; diff --git a/moses/ChartKBestExtractor.h b/moses/ChartKBestExtractor.h index 07df7eacb..66430ec1e 100644 --- a/moses/ChartKBestExtractor.h +++ b/moses/ChartKBestExtractor.h @@ -19,6 +19,7 @@ #pragma once +#include #include "ChartHypothesis.h" #include "ScoreComponentCollection.h" From e22b68e2fde3d0bc7b345cf9b395de05353de2f0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 17 Apr 2014 20:15:06 +0100 Subject: [PATCH 13/23] roll back change in mosesserver. Doesn't work with xmlrpc-c v. 
1.16.33 - ie very old lib on Ubuntu 12.04 --- contrib/server/mosesserver.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 80eab8f20..03b02ef41 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -521,6 +521,12 @@ int main(int argc, char** argv) myRegistry.addMethod("updater", updater); myRegistry.addMethod("optimize", optimizer); + xmlrpc_c::serverAbyss myAbyssServer( + myRegistry, + port, // TCP port on which to listen + logfile + ); + /* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04 xmlrpc_c::serverAbyss myAbyssServer( xmlrpc_c::serverAbyss::constrOpt() .registryPtr(&myRegistry) @@ -528,6 +534,7 @@ int main(int argc, char** argv) .logFileName(logfile) .allowOrigin("*") ); + */ cerr << "Listening on port " << port << endl; if (isSerial) { From 00a2bd395aa0f8e5b9d36dfce3651ecf55cbf234 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 17 Apr 2014 21:22:30 +0100 Subject: [PATCH 14/23] word alignment from server, thanks to Jyotesh Choudhari --- contrib/server/Jamfile | 2 +- contrib/server/mosesserver.cpp | 36 +++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/contrib/server/Jamfile b/contrib/server/Jamfile index 445c07ae8..6e641a2f7 100644 --- a/contrib/server/Jamfile +++ b/contrib/server/Jamfile @@ -35,7 +35,7 @@ if $(build-moses-server) = true xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --libs" ] ; xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --cflags" ] ; - exe mosesserver : mosesserver.cpp ../../moses//moses ../../OnDiskPt//OnDiskPt : $(xmlrpc-linkflags) $(xmlrpc-cxxflags) ; + exe mosesserver : mosesserver.cpp ../../moses//moses ../../moses-cmd/IOWrapper.cpp ../../OnDiskPt//OnDiskPt : $(xmlrpc-linkflags) $(xmlrpc-cxxflags) ; } else { alias mosesserver ; } diff --git a/contrib/server/mosesserver.cpp 
b/contrib/server/mosesserver.cpp index 03b02ef41..105f09c13 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -12,6 +12,7 @@ #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h" #include "moses/TreeInput.h" #include "moses/LM/ORLM.h" +#include "moses-cmd/IOWrapper.h" #ifdef WITH_THREADS #include @@ -22,6 +23,7 @@ #include using namespace Moses; +using namespace MosesCmd; using namespace std; typedef std::map params_t; @@ -215,6 +217,8 @@ public: cerr << "Input: " << source << endl; si = params.find("align"); bool addAlignInfo = (si != params.end()); + si = params.find("word-align"); + bool addWordAlignInfo = (si != params.end()); si = params.find("sg"); bool addGraphInfo = (si != params.end()); si = params.find("topt"); @@ -278,6 +282,20 @@ public: if (addAlignInfo) { retData.insert(pair("align", xmlrpc_c::value_array(alignInfo))); } + if (addWordAlignInfo) { + stringstream wordAlignment; + OutputAlignment(wordAlignment, hypo); + vector alignments; + string alignmentPair; + while (wordAlignment >> alignmentPair) { + int pos = alignmentPair.find('-'); + map wordAlignInfo; + wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str())); + wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str())); + alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo)); + } + retData.insert(pair("word-align", alignments)); + } if(addGraphInfo) { insertGraphInfo(manager,retData); @@ -415,9 +433,25 @@ public: } nBestXMLItem["hyp"] = xmlrpc_c::value_string(out.str()); - if (addAlignmentInfo) + if (addAlignmentInfo) { nBestXMLItem["align"] = xmlrpc_c::value_array(alignInfo); + if ((int)edges.size() > 0) { + stringstream wordAlignment; + OutputAlignment(wordAlignment, edges[0]); + vector alignments; + string alignmentPair; + while (wordAlignment >> alignmentPair) { + int pos = alignmentPair.find('-'); + map wordAlignInfo; + wordAlignInfo["source-word"] = 
xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str())); + wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str())); + alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo)); + } + nBestXMLItem["word-align"] = xmlrpc_c::value_array(alignments); + } + } + // weighted score nBestXMLItem["totalScore"] = xmlrpc_c::value_double(path.GetTotalScore()); nBestXml.push_back(xmlrpc_c::value_struct(nBestXMLItem)); From 568685cb66287dc0af72315df5095567a1854853 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Sat, 19 Apr 2014 10:29:41 +0100 Subject: [PATCH 15/23] ChartKBestExtractor: fix memory leak, clean-up code --- moses/ChartKBestExtractor.cpp | 154 +++++++++++++++++++--------------- moses/ChartKBestExtractor.h | 69 +++++++-------- 2 files changed, 124 insertions(+), 99 deletions(-) diff --git a/moses/ChartKBestExtractor.cpp b/moses/ChartKBestExtractor.cpp index 05f8920c5..72a894ba7 100644 --- a/moses/ChartKBestExtractor.cpp +++ b/moses/ChartKBestExtractor.cpp @@ -32,52 +32,48 @@ namespace Moses // Extract the k-best list from the search graph. void ChartKBestExtractor::Extract( - const std::vector &topHypos, std::size_t k, + const std::vector &topLevelHypos, std::size_t k, KBestVec &kBestList) { - typedef std::vector HypoVec; - kBestList.clear(); - if (topHypos.empty()) { + if (topLevelHypos.empty()) { return; } - // Create a new top-level ChartHypothesis that has the best hypothesis as its - // predecessor. This is the search hypergraph's target vertex. - HypoVec::const_iterator iter = topHypos.begin(); + // Create a new ChartHypothesis object, supremeHypo, that has the best + // top-level hypothesis as its predecessor and has the same score. 
+ std::vector::const_iterator p = topLevelHypos.begin(); + const ChartHypothesis &bestTopLevelHypo = **p; boost::scoped_ptr supremeHypo( - new ChartHypothesis(**iter, *this)); + new ChartHypothesis(bestTopLevelHypo, *this)); // Do the same for each alternative top-level hypothesis, but add the new // ChartHypothesis objects as arcs from supremeHypo, as if they had been // recombined. - float prevScore = (*iter)->GetTotalScore(); - for (++iter; iter != topHypos.end(); ++iter) { - // Check that the first item in topHypos really was the best. - UTIL_THROW_IF2((*iter)->GetTotalScore() <= prevScore, - "top-level vertices are not correctly sorted"); + for (++p; p != topLevelHypos.end(); ++p) { + // Check that the first item in topLevelHypos really was the best. + UTIL_THROW_IF2((*p)->GetTotalScore() <= bestTopLevelHypo.GetTotalScore(), + "top-level hypotheses are not correctly sorted"); // Note: there's no need for a smart pointer here: supremeHypo will take // ownership of altHypo. - ChartHypothesis *altHypo = new ChartHypothesis(**iter, *this); + ChartHypothesis *altHypo = new ChartHypothesis(**p, *this); supremeHypo->AddArc(altHypo); } - // Create the target vertex corresponding to supremeHypo then generate - // it's k-best list. - boost::shared_ptr top = FindOrCreateVertex(*supremeHypo); - LazyKthBest(*top, k, k); + // Create the target vertex then lazily fill its k-best list. + boost::shared_ptr targetVertex = FindOrCreateVertex(*supremeHypo); + LazyKthBest(*targetVertex, k, k); // Copy the k-best list from the target vertex, but drop the top edge from // each derivation. - kBestList.reserve(top->kBestList.size()); - for (KBestVec::const_iterator p = top->kBestList.begin(); - p != top->kBestList.end(); ++p) { - const Derivation &d = **p; - assert(d.edge.tail.size() == 1); // d should have exactly one predecessor. 
- assert(d.backPointers.size() == 1); - std::size_t i = d.backPointers[0]; - boost::shared_ptr pred = d.edge.tail[0]->kBestList[i]; - kBestList.push_back(pred); + kBestList.reserve(targetVertex->kBestList.size()); + for (std::vector >::const_iterator + q = targetVertex->kBestList.begin(); + q != targetVertex->kBestList.end(); ++q) { + const boost::shared_ptr d(*q); + assert(d); + assert(d->subderivations.size() == 1); + kBestList.push_back(d->subderivations[0]); } } @@ -96,8 +92,7 @@ Phrase ChartKBestExtractor::GetOutputPhrase(const Derivation &d) const Word &word = phrase.GetWord(pos); if (word.IsNonTerminal()) { std::size_t nonTermInd = nonTermIndexMap[pos]; - const Derivation &subderivation = - *d.edge.tail[nonTermInd]->kBestList[d.backPointers[nonTermInd]]; + const Derivation &subderivation = *d.subderivations[nonTermInd]; Phrase subPhrase = GetOutputPhrase(subderivation); ret.Append(subPhrase); } else { @@ -142,26 +137,6 @@ ChartKBestExtractor::UnweightedHyperarc ChartKBestExtractor::CreateEdge( return edge; } -void ChartKBestExtractor::GetCandidates(Vertex &v, std::size_t k) -{ - // Create a derivation for v's best incoming edge. - UnweightedHyperarc bestEdge = CreateEdge(v.hypothesis); - boost::shared_ptr d(new Derivation(bestEdge)); - v.candidates.push(d); - v.seen.insert(d); - // Create derivations for the rest of v's incoming edges. - const ChartArcList *arcList = v.hypothesis.GetArcList(); - if (arcList) { - for (std::size_t i = 0; i < arcList->size(); ++i) { - const ChartHypothesis &recombinedHypo = *(*arcList)[i]; - UnweightedHyperarc edge = CreateEdge(recombinedHypo); - boost::shared_ptr d(new Derivation(edge)); - v.candidates.push(d); - v.seen.insert(d); - } - } -} - // Look for the vertex corresponding to a given ChartHypothesis, creating // a new one if necessary. boost::shared_ptr @@ -174,14 +149,51 @@ ChartKBestExtractor::FindOrCreateVertex(const ChartHypothesis &h) return sp; // Vertex was already in m_vertexMap. 
} sp.reset(new Vertex(h)); + // Create the 1-best derivation and add it to the vertex's kBestList. + UnweightedHyperarc bestEdge; + bestEdge.head = sp; + const std::vector &prevHypos = h.GetPrevHypos(); + bestEdge.tail.resize(prevHypos.size()); + for (std::size_t i = 0; i < prevHypos.size(); ++i) { + const ChartHypothesis *prevHypo = prevHypos[i]; + bestEdge.tail[i] = FindOrCreateVertex(*prevHypo); + } + boost::shared_ptr bestDerivation(new Derivation(bestEdge)); + std::pair q = + m_derivations.insert(bestDerivation); + assert(q.second); + sp->kBestList.push_back(bestDerivation); return sp; } +// Create the 1-best derivation for each edge in BS(v) (except the best one) +// and add it to v's candidate queue. +void ChartKBestExtractor::GetCandidates(Vertex &v, std::size_t k) +{ + // Create derivations for all of v's incoming edges except the best. This + // means everything in v.hypothesis.GetArcList() and not the edge defined + // by v.hypothesis itself. The 1-best derivation for that edge will already + // have been created. + const ChartArcList *arcList = v.hypothesis.GetArcList(); + if (arcList) { + for (std::size_t i = 0; i < arcList->size(); ++i) { + const ChartHypothesis &recombinedHypo = *(*arcList)[i]; + boost::shared_ptr w = FindOrCreateVertex(recombinedHypo); + assert(w->kBestList.size() == 1); + v.candidates.push(w->kBestList[0]); + } + } +} + +// Lazily fill v's k-best list. void ChartKBestExtractor::LazyKthBest(Vertex &v, std::size_t k, std::size_t globalK) { // If this is the first visit to vertex v then initialize the priority queue. if (v.visited == false) { + // The 1-best derivation should already be in v's k-best list. + assert(v.kBestList.size() == 1); + // Initialize v's priority queue. 
GetCandidates(v, globalK); v.visited = true; } @@ -191,49 +203,57 @@ void ChartKBestExtractor::LazyKthBest(Vertex &v, std::size_t k, if (!v.kBestList.empty()) { // Update the priority queue by adding the successors of the last // derivation (unless they've been seen before). - const Derivation &d = *v.kBestList.back(); - LazyNext(v, d, globalK); + boost::shared_ptr d(v.kBestList.back()); + LazyNext(v, *d, globalK); } // Check if there are any derivations left in the queue. if (v.candidates.empty()) { break; } // Get the next best derivation and delete it from the queue. - boost::shared_ptr d = v.candidates.top(); + boost::weak_ptr d = v.candidates.top(); v.candidates.pop(); // Add it to the k-best list. v.kBestList.push_back(d); } } +// Create the neighbours of Derivation d and add them to v's candidate queue. void ChartKBestExtractor::LazyNext(Vertex &v, const Derivation &d, std::size_t globalK) { - // Create the neighbours of Derivation d. - for (std::size_t i = 0; i < d.backPointers.size(); ++i) { - Vertex &predVertex = *d.edge.tail[i]; - // Ensure that predVertex's k-best list contains enough derivations. + for (std::size_t i = 0; i < d.edge.tail.size(); ++i) { + Vertex &pred = *d.edge.tail[i]; + // Ensure that pred's k-best list contains enough derivations. std::size_t k = d.backPointers[i] + 2; - LazyKthBest(predVertex, k, globalK); - if (predVertex.kBestList.size() < k) { - // predVertex's derivations have been exhausted. + LazyKthBest(pred, k, globalK); + if (pred.kBestList.size() < k) { + // pred's derivations have been exhausted. continue; } // Create the neighbour. boost::shared_ptr next(new Derivation(d, i)); // Check if it has been created before. - std::pair p = v.seen.insert(next); + std::pair p = m_derivations.insert(next); if (p.second) { v.candidates.push(next); // Haven't previously seen it. } } } -// Construct a Derivation corresponding to a ChartHypothesis. +// Construct the 1-best Derivation that ends at edge e. 
ChartKBestExtractor::Derivation::Derivation(const UnweightedHyperarc &e) { edge = e; - backPointers.resize(edge.tail.size(), 0); + std::size_t arity = edge.tail.size(); + backPointers.resize(arity, 0); + subderivations.reserve(arity); + for (std::size_t i = 0; i < arity; ++i) { + const Vertex &pred = *edge.tail[i]; + assert(pred.kBestList.size() == 1); + boost::shared_ptr sub(pred.kBestList[0]); + subderivations.push_back(sub); + } scoreBreakdown = edge.head->hypothesis.GetScoreBreakdown(); score = edge.head->hypothesis.GetTotalScore(); } @@ -244,14 +264,16 @@ ChartKBestExtractor::Derivation::Derivation(const Derivation &d, std::size_t i) edge.head = d.edge.head; edge.tail = d.edge.tail; backPointers = d.backPointers; + subderivations = d.subderivations; std::size_t j = ++backPointers[i]; scoreBreakdown = d.scoreBreakdown; // Deduct the score of the old subderivation. - const Derivation &oldSubderivation = *(edge.tail[i]->kBestList[j-1]); - scoreBreakdown.MinusEquals(oldSubderivation.scoreBreakdown); + scoreBreakdown.MinusEquals(subderivations[i]->scoreBreakdown); + // Update the subderivation pointer. + boost::shared_ptr newSub(edge.tail[i]->kBestList[j]); + subderivations[i] = newSub; // Add the score of the new subderivation. 
- const Derivation &newSubderivation = *(edge.tail[i]->kBestList[j]); - scoreBreakdown.PlusEquals(newSubderivation.scoreBreakdown); + scoreBreakdown.PlusEquals(subderivations[i]->scoreBreakdown); score = scoreBreakdown.GetWeightedScore(); } diff --git a/moses/ChartKBestExtractor.h b/moses/ChartKBestExtractor.h index 66430ec1e..05b016d50 100644 --- a/moses/ChartKBestExtractor.h +++ b/moses/ChartKBestExtractor.h @@ -24,6 +24,7 @@ #include "ScoreComponentCollection.h" #include +#include #include #include @@ -53,17 +54,46 @@ public: UnweightedHyperarc edge; std::vector backPointers; + std::vector > subderivations; ScoreComponentCollection scoreBreakdown; float score; }; struct DerivationOrderer { - bool operator()(const boost::shared_ptr &d1, - const boost::shared_ptr &d2) const { - return d1->score < d2->score; + bool operator()(const boost::weak_ptr &d1, + const boost::weak_ptr &d2) const { + boost::shared_ptr s1(d1); + boost::shared_ptr s2(d2); + return s1->score < s2->score; } }; + struct Vertex { + typedef std::priority_queue, + std::vector >, + DerivationOrderer> DerivationQueue; + + Vertex(const ChartHypothesis &h) : hypothesis(h), visited(false) {} + + const ChartHypothesis &hypothesis; + std::vector > kBestList; + DerivationQueue candidates; + bool visited; + }; + + typedef std::vector > KBestVec; + + // Extract the k-best list from the search hypergraph given the full, sorted + // list of top-level vertices. 
+ void Extract(const std::vector &topHypos, + std::size_t k, KBestVec &); + + static Phrase GetOutputPhrase(const Derivation &); + +private: + typedef boost::unordered_map > VertexMap; + struct DerivationHasher { std::size_t operator()(const boost::shared_ptr &d) const { std::size_t seed = 0; @@ -83,36 +113,8 @@ public: } }; - struct Vertex { - typedef std::priority_queue, - std::vector >, - DerivationOrderer> DerivationQueue; - - typedef boost::unordered_set, - DerivationHasher, - DerivationEqualityPred> DerivationSet; - - Vertex(const ChartHypothesis &h) : hypothesis(h), visited(false) {} - - const ChartHypothesis &hypothesis; - std::vector > kBestList; - DerivationQueue candidates; - DerivationSet seen; - bool visited; - }; - - typedef std::vector > KBestVec; - - // Extract the k-best list from the search hypergraph given the full, sorted - // list of top-level vertices. - void Extract(const std::vector &topHypos, - std::size_t k, KBestVec &); - - static Phrase GetOutputPhrase(const Derivation &); - -private: - typedef boost::unordered_map > VertexMap; + typedef boost::unordered_set, DerivationHasher, + DerivationEqualityPred> DerivationSet; UnweightedHyperarc CreateEdge(const ChartHypothesis &); boost::shared_ptr FindOrCreateVertex(const ChartHypothesis &); @@ -121,6 +123,7 @@ private: void LazyNext(Vertex &, const Derivation &, std::size_t); VertexMap m_vertexMap; + DerivationSet m_derivations; }; } // namespace Moses From 76a4609cff1b5bc9a5581e83dcddaf21d696682d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 20 Apr 2014 23:30:09 +0100 Subject: [PATCH 16/23] add script to creat pt with only certain scores --- scripts/other/delete-scores.perl | 61 ++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100755 scripts/other/delete-scores.perl diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl new file mode 100755 index 000000000..442173026 --- /dev/null +++ b/scripts/other/delete-scores.perl @@ -0,0 +1,61 
@@ +#!/usr/bin/perl + +use strict; +use Getopt::Long "GetOptions"; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +sub trim($); +sub DeleteScore; + +my $keepScoresStr; +GetOptions( + "keep-scores=s" => \$keepScoresStr +) or exit(1); + +my @keepScores = split(/,/, $keepScoresStr); + +#MAIN LOOP +while (my $line = ) { + chomp($line); + #print STDERR "line=$line\n"; + + my @toks = split(/\|/, $line); + my @scores = split(/ /, $toks[6]); + + $toks[6] = DeleteScore($toks[6], \@keepScores); + + # output + print $toks[0]; + for (my $i = 1; $i < scalar(@toks); ++$i) { + print "|" .$toks[$i]; + } + print "\n"; +} + +###################### +# Perl trim function to remove whitespace from the start and end of the string +sub trim($) { + my $string = shift; + $string =~ s/^\s+//; + $string =~ s/\s+$//; + return $string; +} + +sub DeleteScore +{ + my $string = $_[0]; + my @keepScores = @{$_[1]}; + + $string = trim($string); + my @toks = split(/ /, $string); + + $string = ""; + for (my $i = 0; $i < scalar(@keepScores); ++$i) { + $string .= $toks[ $keepScores[$i] ] ." "; + } + $string = " " .$string; + + return $string; +} From b308bd56579a6791ccd8d0d326acb0e0b74d1b20 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 21 Apr 2014 10:19:00 +0100 Subject: [PATCH 17/23] moses_chart: enable ChartKBestExtractor. 
--- moses-chart-cmd/IOWrapper.cpp | 133 +++++++++++++++++++++++++++++++++- moses-chart-cmd/IOWrapper.h | 5 +- moses-chart-cmd/Main.cpp | 4 +- moses/ChartHypothesis.h | 7 +- moses/ChartKBestExtractor.cpp | 17 ++--- moses/ChartManager.cpp | 60 +++++++++++++++ moses/ChartManager.h | 2 + 7 files changed, 212 insertions(+), 16 deletions(-) diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp index 56c166422..81612ed1f 100644 --- a/moses-chart-cmd/IOWrapper.cpp +++ b/moses-chart-cmd/IOWrapper.cpp @@ -553,7 +553,7 @@ void IOWrapper::OutputDetailedTreeFragmentsTranslationReport( //DIMw void IOWrapper::OutputDetailedAllTranslationReport( - const ChartTrellisPathList &nBestList, + const std::vector > &nBestList, const ChartManager &manager, const Sentence &sentence, long translationId) @@ -793,6 +793,58 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long tran m_nBestOutputCollector->Write(translationId, out.str()); } +void IOWrapper::OutputNBestList(const ChartKBestExtractor::KBestVec &nBestList, + long translationId) +{ + std::ostringstream out; + + if (m_nBestOutputCollector->OutputIsCout()) { + // Set precision only if we're writing the n-best list to cout. This is to + // preserve existing behaviour, but should probably be done either way. 
+ IOWrapper::FixPrecision(out); + } + + bool includeWordAlignment = + StaticData::Instance().PrintAlignmentInfoInNbest(); + + for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin(); + p != nBestList.end(); ++p) { + const ChartKBestExtractor::Derivation &derivation = **p; + + // get the derivation's target-side yield + Phrase outputPhrase = ChartKBestExtractor::GetOutputPhrase(derivation); + + // delete and + UTIL_THROW_IF2(outputPhrase.GetSize() < 2, + "Output phrase should have contained at least 2 words (beginning and end-of-sentence)"); + outputPhrase.RemoveWord(0); + outputPhrase.RemoveWord(outputPhrase.GetSize() - 1); + + // print the translation ID, surface factors, and scores + out << translationId << " ||| "; + OutputSurface(out, outputPhrase, m_outputFactorOrder, false); + out << " ||| "; + OutputAllFeatureScores(derivation.scoreBreakdown, out); + out << " ||| " << derivation.score; + + // optionally, print word alignments + if (includeWordAlignment) { + out << " ||| "; + Alignments align; + OutputAlignmentNBest(align, derivation, 0); + for (Alignments::const_iterator q = align.begin(); q != align.end(); + ++q) { + out << q->first << "-" << q->second << " "; + } + } + + out << std::endl; + } + + assert(m_nBestOutputCollector); + m_nBestOutputCollector->Write(translationId, out.str()); +} + void IOWrapper::OutputNBestList(const std::vector &nbest, long translationId) { std::ostringstream out; @@ -927,6 +979,85 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT return totalTargetSize; } +size_t IOWrapper::OutputAlignmentNBest( + Alignments &retAlign, + const Moses::ChartKBestExtractor::Derivation &derivation, + size_t startTarget) +{ + const ChartHypothesis &hypo = derivation.edge.head->hypothesis; + + size_t totalTargetSize = 0; + size_t startSource = hypo.GetCurrSourceRange().GetStartPos(); + + const TargetPhrase &tp = hypo.GetCurrTargetPhrase(); + + size_t thisSourceSize = CalcSourceSize(&hypo); + + // 
position of each terminal word in translation rule, irrespective of alignment + // if non-term, number is undefined + vector sourceOffsets(thisSourceSize, 0); + vector targetOffsets(tp.GetSize(), 0); + + const AlignmentInfo &aiNonTerm = hypo.GetCurrTargetPhrase().GetAlignNonTerm(); + vector sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap(); + const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap(); + + UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(), + "Error"); + + size_t targetInd = 0; + for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) { + if (tp.GetWord(targetPos).IsNonTerminal()) { + UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error"); + size_t sourceInd = targetPos2SourceInd[targetPos]; + size_t sourcePos = sourceInd2pos[sourceInd]; + + const Moses::ChartKBestExtractor::Derivation &subderivation = + *derivation.subderivations[sourceInd]; + + // calc source size + size_t sourceSize = subderivation.edge.head->hypothesis.GetCurrSourceRange().GetNumWordsCovered(); + sourceOffsets[sourcePos] = sourceSize; + + // calc target size. 
+ // Recursively look thru child hypos + size_t currStartTarget = startTarget + totalTargetSize; + size_t targetSize = OutputAlignmentNBest(retAlign, subderivation, + currStartTarget); + targetOffsets[targetPos] = targetSize; + + totalTargetSize += targetSize; + ++targetInd; + } else { + ++totalTargetSize; + } + } + + // convert position within translation rule to absolute position within + // source sentence / output sentence + ShiftOffsets(sourceOffsets, startSource); + ShiftOffsets(targetOffsets, startTarget); + + // get alignments from this hypo + const AlignmentInfo &aiTerm = hypo.GetCurrTargetPhrase().GetAlignTerm(); + + // add to output arg, offsetting by source & target + AlignmentInfo::const_iterator iter; + for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) { + const std::pair &align = *iter; + size_t relSource = align.first; + size_t relTarget = align.second; + size_t absSource = sourceOffsets[relSource]; + size_t absTarget = targetOffsets[relTarget]; + + pair alignPoint(absSource, absTarget); + pair ret = retAlign.insert(alignPoint); + UTIL_THROW_IF2(!ret.second, "Error"); + } + + return totalTargetSize; +} + void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothesis *hypo) { ostringstream out; diff --git a/moses-chart-cmd/IOWrapper.h b/moses-chart-cmd/IOWrapper.h index 9e09ef00f..bd8264eb6 100644 --- a/moses-chart-cmd/IOWrapper.h +++ b/moses-chart-cmd/IOWrapper.h @@ -40,6 +40,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#include "moses/TypeDef.h" #include "moses/Sentence.h" #include "moses/FactorTypeSet.h" +#include "moses/ChartKBestExtractor.h" #include "moses/ChartTrellisPathList.h" #include "moses/OutputCollector.h" #include "moses/ChartHypothesis.h" @@ -90,6 +91,7 @@ protected: typedef std::set< std::pair > Alignments; size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget); + std::size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartKBestExtractor::Derivation &derivation, std::size_t startTarget); size_t OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget); void OutputAlignment(std::vector< std::set > &retAlignmentsS2T, const Moses::AlignmentInfo &ai); void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId); @@ -129,12 +131,13 @@ public: void OutputBestHypo(const std::vector& mbrBestHypo, long translationId); void OutputBestNone(long translationId); void OutputNBestList(const Moses::ChartTrellisPathList &nBestList, long translationId); + void OutputNBestList(const std::vector > &nBestList, long translationId); void OutputNBestList(const std::vector &nbest, long translationId); void OutputDetailedTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId); void OutputDetailedTranslationReport(const search::Applied *applied, const Moses::Sentence &sentence, long translationId); void OutputDetailedTreeFragmentsTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId); void OutputDetailedTreeFragmentsTranslationReport(const search::Applied *applied, const Moses::Sentence &sentence, long translationId); - void OutputDetailedAllTranslationReport(const Moses::ChartTrellisPathList &nBestList, const Moses::ChartManager &manager, const Moses::Sentence &sentence, long 
translationId); + void OutputDetailedAllTranslationReport(const std::vector > &nBestList, const Moses::ChartManager &manager, const Moses::Sentence &sentence, long translationId); void Backtrack(const Moses::ChartHypothesis *hypo); void ResetTranslationId(); diff --git a/moses-chart-cmd/Main.cpp b/moses-chart-cmd/Main.cpp index f2baff0fa..fd82b5692 100644 --- a/moses-chart-cmd/Main.cpp +++ b/moses-chart-cmd/Main.cpp @@ -151,7 +151,7 @@ public: if (staticData.IsDetailedAllTranslationReportingEnabled()) { const Sentence &sentence = dynamic_cast(*m_source); size_t nBestSize = staticData.GetNBestSize(); - ChartTrellisPathList nBestList; + std::vector > nBestList; manager.CalcNBest(nBestSize, nBestList, staticData.GetDistinctNBest()); m_ioWrapper.OutputDetailedAllTranslationReport(nBestList, manager, sentence, translationId); } @@ -160,7 +160,7 @@ public: size_t nBestSize = staticData.GetNBestSize(); if (nBestSize > 0) { VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl); - ChartTrellisPathList nBestList; + std::vector > nBestList; manager.CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest()); m_ioWrapper.OutputNBestList(nBestList, translationId); IFVERBOSE(2) { diff --git a/moses/ChartHypothesis.h b/moses/ChartHypothesis.h index 150b53fd0..12050e764 100644 --- a/moses/ChartHypothesis.h +++ b/moses/ChartHypothesis.h @@ -45,6 +45,7 @@ typedef std::vector ChartArcList; class ChartHypothesis { friend std::ostream& operator<<(std::ostream&, const ChartHypothesis&); + friend class ChartKBestExtractor; protected: #ifdef USE_HYPO_POOL @@ -75,6 +76,9 @@ protected: //! not implemented ChartHypothesis(const ChartHypothesis ©); + //! only used by ChartKBestExtractor + ChartHypothesis(const ChartHypothesis &, const ChartKBestExtractor &); + public: #ifdef USE_HYPO_POOL void *operator new(size_t /* num_bytes */) { @@ -93,9 +97,6 @@ public: } #endif - //! 
only used by ChartKBestExtractor - ChartHypothesis(const ChartHypothesis &, const ChartKBestExtractor &); - ChartHypothesis(const ChartTranslationOptions &, const RuleCubeItem &item, ChartManager &manager); diff --git a/moses/ChartKBestExtractor.cpp b/moses/ChartKBestExtractor.cpp index 72a894ba7..3a16198fc 100644 --- a/moses/ChartKBestExtractor.cpp +++ b/moses/ChartKBestExtractor.cpp @@ -200,21 +200,20 @@ void ChartKBestExtractor::LazyKthBest(Vertex &v, std::size_t k, // Add derivations to the k-best list until it contains k or there are none // left to add. while (v.kBestList.size() < k) { - if (!v.kBestList.empty()) { - // Update the priority queue by adding the successors of the last - // derivation (unless they've been seen before). - boost::shared_ptr d(v.kBestList.back()); - LazyNext(v, *d, globalK); - } + assert(!v.kBestList.empty()); + // Update the priority queue by adding the successors of the last + // derivation (unless they've been seen before). + boost::shared_ptr d(v.kBestList.back()); + LazyNext(v, *d, globalK); // Check if there are any derivations left in the queue. if (v.candidates.empty()) { break; } // Get the next best derivation and delete it from the queue. - boost::weak_ptr d = v.candidates.top(); + boost::weak_ptr next = v.candidates.top(); v.candidates.pop(); // Add it to the k-best list. 
- v.kBestList.push_back(d); + v.kBestList.push_back(next); } } @@ -250,7 +249,7 @@ ChartKBestExtractor::Derivation::Derivation(const UnweightedHyperarc &e) subderivations.reserve(arity); for (std::size_t i = 0; i < arity; ++i) { const Vertex &pred = *edge.tail[i]; - assert(pred.kBestList.size() == 1); + assert(pred.kBestList.size() >= 1); boost::shared_ptr sub(pred.kBestList[0]); subderivations.push_back(sub); } diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp index 0e303390e..7162099d4 100644 --- a/moses/ChartManager.cpp +++ b/moses/ChartManager.cpp @@ -23,6 +23,7 @@ #include "ChartManager.h" #include "ChartCell.h" #include "ChartHypothesis.h" +#include "ChartKBestExtractor.h" #include "ChartTranslationOptions.h" #include "ChartTrellisDetourQueue.h" #include "ChartTrellisNode.h" @@ -261,6 +262,65 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi } } +/** Calculate the n-best paths through the output hypergraph. + * Return the list of paths with the variable ret + * \param n how may paths to return + * \param ret return argument + * \param onlyDistinct whether to check for distinct output sentence or not (default - don't check, just return top n-paths) + */ +void ChartManager::CalcNBest( + std::size_t n, + std::vector > &nBestList, + bool onlyDistinct) const +{ + nBestList.clear(); + if (n == 0 || m_source.GetSize() == 0) { + return; + } + + // Get the list of top-level hypotheses, sorted by score. + WordsRange range(0, m_source.GetSize()-1); + const ChartCell &lastCell = m_hypoStackColl.Get(range); + boost::scoped_ptr > topLevelHypos( + lastCell.GetAllSortedHypotheses()); + if (!topLevelHypos) { + return; + } + + ChartKBestExtractor extractor; + + if (!onlyDistinct) { + // Return the n-best list as is, including duplicate translations. + extractor.Extract(*topLevelHypos, n, nBestList); + return; + } + + // Determine how many derivations to extract. 
If the n-best list is + // restricted to distinct translations then this limit should be bigger + // than n. The n-best factor determines how much bigger the limit should be, + // with 0 being 'unlimited.' This actually sets a large-ish limit in case + // too many translations are identical. + const StaticData &staticData = StaticData::Instance(); + const std::size_t nBestFactor = staticData.GetNBestFactor(); + std::size_t numDerivations = (nBestFactor == 0) ? n*1000 : n*nBestFactor; + + // Extract the derivations. + ChartKBestExtractor::KBestVec bigList; + bigList.reserve(numDerivations); + extractor.Extract(*topLevelHypos, numDerivations, bigList); + + // Copy derivations into nBestList, skipping ones with repeated translations. + std::set distinct; + for (ChartKBestExtractor::KBestVec::const_iterator p = bigList.begin(); + p != bigList.end(); ++p) { + boost::shared_ptr derivation = *p; + Phrase translation = ChartKBestExtractor::GetOutputPhrase(*derivation); + if (distinct.insert(translation).second) { + nBestList.push_back(derivation); + } + } +} + void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const { size_t size = m_source.GetSize(); diff --git a/moses/ChartManager.h b/moses/ChartManager.h index 6beffc45e..27914e207 100644 --- a/moses/ChartManager.h +++ b/moses/ChartManager.h @@ -30,6 +30,7 @@ #include "SentenceStats.h" #include "ChartTranslationOptionList.h" #include "ChartParser.h" +#include "ChartKBestExtractor.h" #include @@ -71,6 +72,7 @@ public: void AddXmlChartOptions(); const ChartHypothesis *GetBestHypothesis() const; void CalcNBest(size_t count, ChartTrellisPathList &ret, bool onlyDistinct=0) const; + void CalcNBest(size_t n, std::vector > &nBestList, bool onlyDistinct=false) const; void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const; void FindReachableHypotheses( const ChartHypothesis *hypo, std::map &reachable ) const; /* auxilliary function for GetSearchGraph */ From 
ff8ac92be18cfb96e71669ad525fc830cdd14800 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 21 Apr 2014 14:46:22 +0100 Subject: [PATCH 18/23] moses_chart: oops, don't output more distinct translations than requested --- moses/ChartManager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp index 7162099d4..623968dfc 100644 --- a/moses/ChartManager.cpp +++ b/moses/ChartManager.cpp @@ -312,7 +312,7 @@ void ChartManager::CalcNBest( // Copy derivations into nBestList, skipping ones with repeated translations. std::set distinct; for (ChartKBestExtractor::KBestVec::const_iterator p = bigList.begin(); - p != bigList.end(); ++p) { + nBestList.size() < n && p != bigList.end(); ++p) { boost::shared_ptr derivation = *p; Phrase translation = ChartKBestExtractor::GetOutputPhrase(*derivation); if (distinct.insert(translation).second) { From 00505ba048d119f54e57e179ef40989a38143c12 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 Apr 2014 11:36:25 +0100 Subject: [PATCH 19/23] minor leak --- moses/ChartTranslationOptionList.cpp | 10 ++++++++-- moses/TranslationModel/RuleTable/LoaderStandard.cpp | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/moses/ChartTranslationOptionList.cpp b/moses/ChartTranslationOptionList.cpp index e83fbac79..b86312b9d 100644 --- a/moses/ChartTranslationOptionList.cpp +++ b/moses/ChartTranslationOptionList.cpp @@ -163,8 +163,14 @@ float ChartTranslationOptionList::GetBestScore(const ChartCellLabel *chartCell) { const HypoList *stack = chartCell->GetStack().cube; assert(stack); - assert(!stack->empty()); - const ChartHypothesis &bestHypo = **(stack->begin()); + //assert(!stack->empty()); + if (stack->empty()) { + return 0; + } + else { + const ChartHypothesis &bestHypo = **(stack->begin()); + return bestHypo.GetTotalScore(); + } const ChartHypothesis &bestHypo = **(stack->begin()); return bestHypo.GetTotalScore(); } diff --git 
a/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/moses/TranslationModel/RuleTable/LoaderStandard.cpp index 9d2e3fa20..47f7378d1 100644 --- a/moses/TranslationModel/RuleTable/LoaderStandard.cpp +++ b/moses/TranslationModel/RuleTable/LoaderStandard.cpp @@ -218,7 +218,7 @@ bool RuleTableLoaderStandard::Load(FormatType format // parse source & find pt node // constituent labels - Word *sourceLHS; + Word *sourceLHS = NULL; Word *targetLHS; // create target phrase obj @@ -251,6 +251,9 @@ bool RuleTableLoaderStandard::Load(FormatType format TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); + // not implemented correctly in memory pt. just delete it for now + delete sourceLHS; + count++; } From 4ee4e07c1bf2f1fdf56fadf67a606b8945989978 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Wed, 23 Apr 2014 13:50:08 +0100 Subject: [PATCH 20/23] minor ems fixes --- scripts/ems/web/analysis.php | 3 ++- scripts/ems/web/analysis_diff.php | 3 ++- scripts/ems/web/index.php | 3 ++- scripts/ems/web/overview.php | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php index 726e30fbd..a64d5977f 100644 --- a/scripts/ems/web/analysis.php +++ b/scripts/ems/web/analysis.php @@ -436,7 +436,8 @@ function ngram_summary() { $score_line = ""; for($i=0;$iresult[$set]); for($i=0;$i

'.$title."

\n"; } -if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) { +if (array_key_exists("setStepStatus",$_GET)) { set_step_status($_GET["setStepStatus"]); } +else if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) { load_experiment_info(); load_comment(); diff --git a/scripts/ems/web/overview.php b/scripts/ems/web/overview.php index c49e74be6..e56ed6f08 100644 --- a/scripts/ems/web/overview.php +++ b/scripts/ems/web/overview.php @@ -295,7 +295,8 @@ function output_score($id,$info) { $each_score = explode(" ; ",$score); for($i=0;$i0) { print "
"; } $opened_a_tag = 0; if ($set != "avg") { From 6a9eb6c848f12711b0a7ac2994fe06bf6491fd23 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 Apr 2014 15:12:27 +0100 Subject: [PATCH 21/23] minor leak showing for lex reordering. Just refactor --- moses/PrefixTreeMap.cpp | 30 ++++++++++++++++++++++-------- moses/PrefixTreeMap.h | 14 ++++---------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/moses/PrefixTreeMap.cpp b/moses/PrefixTreeMap.cpp index c8edce726..ee7565d8b 100644 --- a/moses/PrefixTreeMap.cpp +++ b/moses/PrefixTreeMap.cpp @@ -5,6 +5,8 @@ #include #endif +using namespace std; + namespace Moses { void GenericCandidate::readBin(FILE* f) @@ -62,6 +64,17 @@ void Candidates::readBin(FILE* f) const LabelId PrefixTreeMap::MagicWord = std::numeric_limits::max() - 1; +////////////////////////////////////////////////////////////////// +PrefixTreeMap::~PrefixTreeMap() { + if(m_FileSrc) { + fClose(m_FileSrc); + } + if(m_FileTgt) { + fClose(m_FileTgt); + } + FreeMemory(); +} + void PrefixTreeMap::FreeMemory() { @@ -75,20 +88,21 @@ void PrefixTreeMap::FreeMemory() m_PtrPool.reset(); } -static WordVoc* ReadVoc(const std::string& filename) +WordVoc &ReadVoc(std::map &vocs, const std::string& filename) { - static std::map vocs; #ifdef WITH_THREADS boost::mutex mutex; boost::mutex::scoped_lock lock(mutex); #endif - std::map::iterator vi = vocs.find(filename); + std::map::iterator vi = vocs.find(filename); if (vi == vocs.end()) { - WordVoc* voc = new WordVoc(); - voc->Read(filename); - vocs[filename] = voc; + WordVoc &voc = vocs[filename]; + voc.Read(filename); + return voc; + } + else { + return vi->second; } - return vocs[filename]; } int PrefixTreeMap::Read(const std::string& fileNameStem, int numVocs) @@ -133,7 +147,7 @@ int PrefixTreeMap::Read(const std::string& fileNameStem, int numVocs) sprintf(num, "%d", i); //m_Voc[i] = new WordVoc(); //m_Voc[i]->Read(ifv + num); - m_Voc[i] = ReadVoc(ifv + num); + m_Voc[i] = &ReadVoc(m_vocs, ifv + num); } 
TRACE_ERR("binary file loaded, default OFF_T: "<< PTF::getDefault()<<"\n"); diff --git a/moses/PrefixTreeMap.h b/moses/PrefixTreeMap.h index 06066878d..d6262ca65 100644 --- a/moses/PrefixTreeMap.h +++ b/moses/PrefixTreeMap.h @@ -99,18 +99,11 @@ public: PrefixTreeMap() : m_FileSrc(0), m_FileTgt(0) { PTF::setDefault(InvalidOffT); } - ~PrefixTreeMap() { - if(m_FileSrc) { - fClose(m_FileSrc); - } - if(m_FileTgt) { - fClose(m_FileTgt); - } - FreeMemory(); - } + ~PrefixTreeMap(); + public: static const LabelId MagicWord; -public: + void FreeMemory(); int Read(const std::string& fileNameStem, int numVocs = -1); @@ -135,6 +128,7 @@ private: std::vector m_Voc; ObjectPool m_PtrPool; + std::map m_vocs; }; } From 2c14b506b450bcdcbb903158137a51185808211f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 Apr 2014 16:11:09 +0100 Subject: [PATCH 22/23] merge problem --- moses/ChartTranslationOptionList.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/moses/ChartTranslationOptionList.cpp b/moses/ChartTranslationOptionList.cpp index b86312b9d..89955dcee 100644 --- a/moses/ChartTranslationOptionList.cpp +++ b/moses/ChartTranslationOptionList.cpp @@ -163,15 +163,9 @@ float ChartTranslationOptionList::GetBestScore(const ChartCellLabel *chartCell) { const HypoList *stack = chartCell->GetStack().cube; assert(stack); - //assert(!stack->empty()); - if (stack->empty()) { - return 0; - } - else { - const ChartHypothesis &bestHypo = **(stack->begin()); - return bestHypo.GetTotalScore(); - } const ChartHypothesis &bestHypo = **(stack->begin()); - return bestHypo.GetTotalScore(); + assert(!stack->empty()); + const ChartHypothesis &bestHypo = **(stack->begin()); + return bestHypo.GetTotalScore(); } void ChartTranslationOptionList::Evaluate(const InputType &input, const InputPath &inputPath) From d7380d6d9e0c5f88eb0ee4973f49e4c4e3900b9f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 Apr 2014 17:11:25 +0100 Subject: [PATCH 23/23] don't add label to 
m_targetLabelSet if no hypotheses. Assert error in parsing --- moses/ChartCell.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/moses/ChartCell.cpp b/moses/ChartCell.cpp index 6603139f6..125efd204 100644 --- a/moses/ChartCell.cpp +++ b/moses/ChartCell.cpp @@ -114,8 +114,11 @@ void ChartCell::SortHypotheses() MapType::iterator iter; for (iter = m_hypoColl.begin(); iter != m_hypoColl.end(); ++iter) { ChartHypothesisCollection &coll = iter->second; - coll.SortHypotheses(); - m_targetLabelSet.AddConstituent(iter->first, &coll.GetSortedHypotheses()); + + if (coll.GetSize()) { + coll.SortHypotheses(); + m_targetLabelSet.AddConstituent(iter->first, &coll.GetSortedHypotheses()); + } } }