From 6bea23357c1d5a9a50382330d14f4c734f94ac98 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 09:28:38 +0100 Subject: [PATCH 1/6] Ongoing moses/phrase-extract refactoring --- phrase-extract/pcfg-common/pcfg_tree.h | 79 ---------- phrase-extract/pcfg-common/syntax_tree.h | 93 ------------ phrase-extract/pcfg-common/typedef.h | 1 - phrase-extract/pcfg-common/xml_tree_parser.cc | 89 ------------ phrase-extract/pcfg-common/xml_tree_parser.h | 59 -------- phrase-extract/pcfg-common/xml_tree_writer.h | 135 ------------------ phrase-extract/pcfg-extract/Jamfile | 2 +- phrase-extract/pcfg-extract/pcfg_extract.cc | 34 ++--- phrase-extract/pcfg-extract/rule_extractor.cc | 16 +-- phrase-extract/pcfg-extract/rule_extractor.h | 6 +- phrase-extract/pcfg-score/pcfg_score.cc | 19 +-- phrase-extract/pcfg-score/tree_scorer.cc | 66 +++++++-- phrase-extract/pcfg-score/tree_scorer.h | 10 +- .../syntax-common/xml_tree_parser.cc | 5 +- .../syntax-common/xml_tree_parser.h | 36 ++++- .../syntax-common/xml_tree_writer.cc | 82 +++++++++++ .../syntax-common/xml_tree_writer.h | 27 ++++ 17 files changed, 245 insertions(+), 514 deletions(-) delete mode 100644 phrase-extract/pcfg-common/pcfg_tree.h delete mode 100644 phrase-extract/pcfg-common/syntax_tree.h delete mode 100644 phrase-extract/pcfg-common/xml_tree_parser.cc delete mode 100644 phrase-extract/pcfg-common/xml_tree_parser.h delete mode 100644 phrase-extract/pcfg-common/xml_tree_writer.h create mode 100644 phrase-extract/syntax-common/xml_tree_writer.cc create mode 100644 phrase-extract/syntax-common/xml_tree_writer.h diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h deleted file mode 100644 index ce28eb8dd..000000000 --- a/phrase-extract/pcfg-common/pcfg_tree.h +++ /dev/null @@ -1,79 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_PCFG_TREE_H_ -#define PCFG_PCFG_TREE_H_ - -#include - -#include "syntax_tree.h" -#include "xml_tree_writer.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -template -class PcfgTreeBase : public SyntaxTreeBase { - public: - typedef std::string LabelType; - typedef SyntaxTreeBase BaseType; - - PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {} - - double score() const { return score_; } - void set_score(double s) { score_ = s; } - - private: - double score_; -}; - -class PcfgTree : public PcfgTreeBase { - public: - typedef PcfgTreeBase BaseType; - PcfgTree(const BaseType::LabelType &label) : BaseType(label) {} -}; - -// Specialise XmlOutputHandler for PcfgTree. -template<> -class XmlOutputHandler { - public: - typedef std::map AttributeMap; - - void GetLabel(const PcfgTree &tree, std::string &label) const { - label = tree.label(); - } - - void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const { - attribute_map.clear(); - double score = tree.score(); - if (score != 0.0) { - std::ostringstream out; - out << tree.score(); - attribute_map["pcfg"] = out.str(); - } - } -}; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h deleted file mode 100644 index c0c6eaef9..000000000 --- a/phrase-extract/pcfg-common/syntax_tree.h +++ /dev/null @@ -1,93 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_SYNTAX_TREE_H_ -#define PCFG_SYNTAX_TREE_H_ - -#include -#include - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -// Base class for SyntaxTree, AgreementTree, and friends. -template -class SyntaxTreeBase { - public: - // Constructors - SyntaxTreeBase(const T &label) - : label_(label) - , children_() - , parent_(0) {} - - SyntaxTreeBase(const T &label, const std::vector &children) - : label_(label) - , children_(children) - , parent_(0) {} - - // Destructor - virtual ~SyntaxTreeBase(); - - const T &label() const { return label_; } - const DerivedType *parent() const { return parent_; } - DerivedType *parent() { return parent_; } - const std::vector &children() const { return children_; } - std::vector &children() { return children_; } - - void set_label(const T &label) { label_ = label; } - void set_parent(DerivedType *parent) { parent_ = parent; } - void set_children(const std::vector &c) { children_ = c; } - - bool IsLeaf() const { return children_.empty(); } - - bool IsPreterminal() const { - return children_.size() == 1 && children_[0]->IsLeaf(); - } - - void AddChild(DerivedType *child) { children_.push_back(child); } - - private: - T label_; - std::vector children_; - DerivedType *parent_; -}; - -template -class SyntaxTree : public SyntaxTreeBase > { - public: - typedef SyntaxTreeBase > BaseType; - SyntaxTree(const T &label) : BaseType(label) {} - SyntaxTree(const T &label, const std::vector &children) - : BaseType(label, children) {} -}; - -template -SyntaxTreeBase::~SyntaxTreeBase() { - for (std::size_t i = 0; i < children_.size(); ++i) { - delete children_[i]; - } -} - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h index e738163df..1280b89cf 100644 --- a/phrase-extract/pcfg-common/typedef.h +++ b/phrase-extract/pcfg-common/typedef.h @@ -24,7 +24,6 @@ #include #include "syntax-common/numbered_set.h" -#include "syntax_tree.h" namespace MosesTraining { namespace Syntax { diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc deleted file mode 100644 index f15a04811..000000000 --- a/phrase-extract/pcfg-common/xml_tree_parser.cc +++ /dev/null @@ -1,89 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#include "xml_tree_parser.h" - -#include -#include - -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" -#include "util/tokenize.hh" - -#include "syntax-common/exception.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -XmlTreeParser::XmlTreeParser() { -} - -std::auto_ptr XmlTreeParser::Parse(const std::string &line) { - m_line = line; - m_tree.Clear(); - try { - if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) { - throw Exception(""); - } - } catch (const XmlException &e) { - throw Exception(e.getMsg()); - } - m_tree.ConnectNodes(); - SyntaxNode *root = m_tree.GetTop(); - if (!root) { - // There is no XML tree. - return std::auto_ptr(); - } - m_words = util::tokenize(m_line); - return ConvertTree(*root, m_words); -} - -// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree. -std::auto_ptr XmlTreeParser::ConvertTree( - const SyntaxNode &tree, - const std::vector &words) { - std::auto_ptr root(new PcfgTree(tree.GetLabel())); - const std::vector &children = tree.GetChildren(); - if (children.empty()) { - if (tree.GetStart() != tree.GetEnd()) { - std::ostringstream msg; - msg << "leaf node covers multiple words (" << tree.GetStart() - << "-" << tree.GetEnd() << "): this is currently unsupported"; - throw Exception(msg.str()); - } - std::auto_ptr leaf(new PcfgTree(words[tree.GetStart()])); - leaf->set_parent(root.get()); - root->AddChild(leaf.release()); - } else { - for (std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - assert(*p); - std::auto_ptr child = ConvertTree(**p, words); - child->set_parent(root.get()); - root->AddChild(child.release()); - } - } - return root; -} - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h deleted file mode 100644 index 8605c0691..000000000 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ /dev/null @@ -1,59 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_XML_TREE_PARSER_H_ -#define PCFG_XML_TREE_PARSER_H_ - -#include -#include -#include -#include -#include - -#include "pcfg_tree.h" -#include "SyntaxNode.h" -#include "SyntaxNodeCollection.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -// Parses a string in Moses' XML parse tree format and returns a PcfgTree -// object. -class XmlTreeParser { - public: - XmlTreeParser(); - std::auto_ptr Parse(const std::string &); - private: - std::auto_ptr ConvertTree(const MosesTraining::SyntaxNode &, - const std::vector &); - - std::set m_labelSet; - std::map m_topLabelSet; - std::string m_line; - MosesTraining::SyntaxNodeCollection m_tree; - std::vector m_words; -}; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h deleted file mode 100644 index 8582e544f..000000000 --- a/phrase-extract/pcfg-common/xml_tree_writer.h +++ /dev/null @@ -1,135 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_XML_TREE_WRITER_H_ -#define PCFG_XML_TREE_WRITER_H_ - -#include -#include -#include -#include -#include -#include - -#include "XmlTree.h" - -#include "syntax_tree.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -template -class XmlOutputHandler { - public: - typedef std::map AttributeMap; - - void GetLabel(const InputTree &, std::string &) const; - void GetAttributes(const InputTree &, AttributeMap &) const; -}; - -template -class XmlTreeWriter : public XmlOutputHandler { - public: - typedef XmlOutputHandler Base; - void Write(const InputTree &, std::ostream &) const; - private: - std::string Escape(const std::string &) const; -}; - -template -void XmlTreeWriter::Write(const InputTree &tree, - std::ostream &out) const { - assert(!tree.IsLeaf()); - - // Opening tag - - std::string label; - Base::GetLabel(tree, label); - out << "first << "=\"" << p->second << "\""; - } - - out << ">"; - - // Children - - const std::vector &children = tree.children(); - for (typename std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - InputTree &child = **p; - if (child.IsLeaf()) { - Base::GetLabel(child, label); - out << " " << Escape(label); - } else { - out << " "; - Write(**p, out); - } - } - - // Closing tag - out << " "; - - if (tree.parent() == 0) { - out << std::endl; - } -} - -// Escapes XML special characters. -template -std::string XmlTreeWriter::Escape(const std::string &s) const { - std::string t; - std::size_t len = s.size(); - t.reserve(len); - for (std::size_t i = 0; i < len; ++i) { - if (s[i] == '<') { - t += "<"; - } else if (s[i] == '>') { - t += ">"; - } else if (s[i] == '[') { - t += "["; - } else if (s[i] == ']') { - t += "]"; - } else if (s[i] == '|') { - t += "|"; - } else if (s[i] == '&') { - t += "&"; - } else if (s[i] == '\'') { - t += "'"; - } else if (s[i] == '"') { - t += """; - } else { - t += s[i]; - } - } - return t; -} - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-extract/Jamfile b/phrase-extract/pcfg-extract/Jamfile index 61f056599..2442b967a 100644 --- a/phrase-extract/pcfg-extract/Jamfile +++ b/phrase-extract/pcfg-extract/Jamfile @@ -1 +1 @@ -exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : .. ; +exe pcfg-extract : [ glob *.cc ] ..//syntax-common ..//pcfg-common ../..//boost_program_options : .. ; diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 29d63b994..8e7a40e07 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -19,20 +19,6 @@ #include "pcfg_extract.h" -#include "options.h" -#include "rule_collection.h" -#include "rule_extractor.h" - -#include "syntax-common/exception.h" - -#include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" -#include "pcfg-common/syntax_tree.h" -#include "pcfg-common/typedef.h" -#include "pcfg-common/xml_tree_parser.h" - -#include - #include #include #include @@ -43,6 +29,20 @@ #include #include +#include + +#include "syntax-common/exception.h" +#include "syntax-common/xml_tree_parser.h" + +#include "SyntaxTree.h" + +#include "pcfg-common/pcfg.h" +#include "pcfg-common/typedef.h" + +#include "options.h" +#include "rule_collection.h" +#include "rule_extractor.h" + namespace MosesTraining { namespace Syntax @@ -60,10 +60,12 @@ int PcfgExtract::Main(int argc, char *argv[]) Vocabulary non_term_vocab; RuleExtractor rule_extractor(non_term_vocab); RuleCollection rule_collection; - XmlTreeParser parser; + std::set label_set; + std::map top_label_set; + XmlTreeParser parser(label_set, top_label_set); std::string line; std::size_t line_num = 0; - std::auto_ptr tree; + std::auto_ptr tree; while (std::getline(std::cin, line)) { ++line_num; try { diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc index bd2c48c8a..39da54ef2 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.cc +++ b/phrase-extract/pcfg-extract/rule_extractor.cc @@ -19,8 +19,6 @@ #include "rule_extractor.h" -#include "pcfg-common/pcfg_tree.h" - namespace MosesTraining { namespace Syntax @@ -33,21 +31,21 @@ RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab) { } -void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const +void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const { - if (tree.IsPreterminal() || tree.IsLeaf()) { + if (tree.IsLeaf() || tree.children()[0]->IsLeaf()) { return; } - std::size_t lhs = non_term_vocab_.Insert(tree.label()); + std::size_t lhs = non_term_vocab_.Insert(tree.value().GetLabel()); std::vector rhs; - const std::vector &children = tree.children(); + const std::vector &children = tree.children(); rhs.reserve(children.size()); - for (std::vector::const_iterator p(children.begin()); + for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { - const PcfgTree &child = **p; - rhs.push_back(non_term_vocab_.Insert(child.label())); + const SyntaxTree &child = **p; + rhs.push_back(non_term_vocab_.Insert(child.value().GetLabel())); Extract(child, rc); } rc.Add(lhs, rhs); diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h index f35460909..d32d76992 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.h +++ b/phrase-extract/pcfg-extract/rule_extractor.h @@ -21,6 +21,8 @@ #ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_ #define PCFG_EXTRACT_RULE_EXTRACTOR_H_ +#include "SyntaxTree.h" + #include "pcfg-common/typedef.h" #include "rule_collection.h" @@ -32,14 +34,12 @@ namespace Syntax namespace PCFG { -class PcfgTree; - // Extracts PCFG rules from syntax trees and adds them to a RuleCollection. class RuleExtractor { public: RuleExtractor(Vocabulary &); - void Extract(const PcfgTree &, RuleCollection &) const; + void Extract(const MosesTraining::SyntaxTree &, RuleCollection &) const; private: Vocabulary &non_term_vocab_; }; diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index 314e0fb38..d656d2882 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -33,13 +33,14 @@ #include +#include "SyntaxTree.h" + #include "syntax-common/exception.h" +#include "syntax-common/xml_tree_parser.h" +#include "syntax-common/xml_tree_writer.h" #include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" -#include "pcfg-common/syntax_tree.h" #include "pcfg-common/typedef.h" -#include "pcfg-common/xml_tree_parser.h" namespace MosesTraining { @@ -65,15 +66,17 @@ int PcfgScore::Main(int argc, char *argv[]) // Score corpus according to PCFG. TreeScorer scorer(pcfg, non_term_vocab); - XmlTreeParser parser; - XmlTreeWriter writer; + std::set label_set; + std::map top_label_set; + XmlTreeParser parser(label_set, top_label_set); + XmlTreeWriter writer(std::cout); std::string line; std::size_t line_num = 0; - std::auto_ptr tree; + std::auto_ptr tree; while (std::getline(std::cin, line)) { ++line_num; try { - tree = parser.Parse(line); + tree = parser.Parse(line, true); } catch (Exception &e) { std::ostringstream msg; msg << "line " << line_num << ": " << e.msg(); @@ -93,7 +96,7 @@ int PcfgScore::Main(int argc, char *argv[]) std::cout << line << std::endl; continue; } - writer.Write(*tree, std::cout); + writer.Write(*tree); } return 0; diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc index 74d6e79ef..61ae16e4c 100644 --- a/phrase-extract/pcfg-score/tree_scorer.cc +++ b/phrase-extract/pcfg-score/tree_scorer.cc @@ -20,6 +20,7 @@ #include "tree_scorer.h" #include +#include namespace MosesTraining { @@ -34,30 +35,41 @@ TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab) { } -bool TreeScorer::Score(PcfgTree &root) const +bool TreeScorer::Score(SyntaxTree &root) { - if (root.IsPreterminal() || root.IsLeaf()) { + scores_.clear(); + ZeroScores(root); + if (!CalcScores(root)) { + return false; + } + SetAttributes(root); + return true; +} + +bool TreeScorer::CalcScores(SyntaxTree &root) +{ + if (root.IsLeaf() || root.children()[0]->IsLeaf()) { return true; } - const std::vector &children = root.children(); + const std::vector &children = root.children(); double log_prob = 0.0; std::vector key; key.reserve(children.size()+1); - key.push_back(non_term_vocab_.Lookup(root.label())); + key.push_back(non_term_vocab_.Lookup(root.value().GetLabel())); - for (std::vector::const_iterator p(children.begin()); + for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { - PcfgTree *child = *p; + SyntaxTree *child = *p; assert(!child->IsLeaf()); - key.push_back(non_term_vocab_.Lookup(child->label())); - if (!Score(*child)) { + key.push_back(non_term_vocab_.Lookup(child->value().GetLabel())); + if (!CalcScores(*child)) { return false; } - if (!child->IsPreterminal()) { - log_prob += child->score(); + if (!child->children()[0]->IsLeaf()) { + log_prob += scores_[child]; } } double rule_score; @@ -66,10 +78,42 @@ bool TreeScorer::Score(PcfgTree &root) const return false; } log_prob += rule_score; - root.set_score(log_prob); + scores_[&root] = log_prob; return true; } +void TreeScorer::SetAttributes(SyntaxTree &root) +{ + // Terminals don't need attributes. + if (root.IsLeaf()) { + return; + } + // Preterminals don't need attributes (they have the implicit score 0.0). + if (root.children()[0]->IsLeaf()) { + return; + } + double score = scores_[&root]; + if (score != 0.0) { + std::ostringstream out; + out << score; + root.value().attributes["pcfg"] = out.str(); + } + for (std::vector::const_iterator p(root.children().begin()); + p != root.children().end(); ++p) { + SetAttributes(**p); + } +} + +void TreeScorer::ZeroScores(SyntaxTree &root) +{ + scores_[&root] = 0.0f; + const std::vector &children = root.children(); + for (std::vector::const_iterator p(children.begin()); + p != children.end(); ++p) { + ZeroScores(**p); + } +} + } // namespace PCFG } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h index 8b1afcc3a..cf9fdd1a3 100644 --- a/phrase-extract/pcfg-score/tree_scorer.h +++ b/phrase-extract/pcfg-score/tree_scorer.h @@ -21,8 +21,9 @@ #ifndef PCFG_SCORE_TREE_SCORER_H_ #define PCFG_SCORE_TREE_SCORER_H_ +#include "SyntaxTree.h" + #include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" #include "pcfg-common/typedef.h" namespace MosesTraining @@ -39,11 +40,16 @@ public: // Score tree according to PCFG. Returns false if unsuccessful (due to // missing rule). - bool Score(PcfgTree &) const; + bool Score(SyntaxTree &); private: const Pcfg &pcfg_; const Vocabulary &non_term_vocab_; + std::map scores_; + + bool CalcScores(SyntaxTree &); + void SetAttributes(SyntaxTree &); + void ZeroScores(SyntaxTree &); }; } // namespace PCFG diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index bf3c6d87e..6eeb110e9 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -20,13 +20,14 @@ XmlTreeParser::XmlTreeParser(std::set &labelSet, { } -std::auto_ptr XmlTreeParser::Parse(const std::string &line) +std::auto_ptr XmlTreeParser::Parse(const std::string &line, + bool unescape) { line_ = line; node_collection_.Clear(); try { if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, - top_label_set_, false)) { + top_label_set_, unescape)) { throw Exception(""); } } catch (const XmlException &e) { diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index e0b75c830..0f671c65a 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -15,18 +15,42 @@ namespace MosesTraining { namespace Syntax { -// Parses a string in Moses' XML parse tree format and returns a SyntaxTree -// object. This is a wrapper around the ProcessAndStripXMLTags function. +/** Parses string representations of parse trees in Moses' XML format and + * converts them to SyntaxTree objects. + * + * This is a thin wrapper around the ProcessAndStripXMLTags function. After + * calling Parse(), the output of the ProcessAndStripXMLTags function (the + * sentence, node collection, label set, and top label set) are available via + * accessors. + */ class XmlTreeParser { public: XmlTreeParser(std::set &, std::map &); - std::auto_ptr Parse(const std::string &); + //! Parse a single sentence and return a SyntaxTree (with words attached). + std::auto_ptr Parse(const std::string &, bool=false); - const std::vector& GetWords() { - return words_; - } + // TODO + //! Get the sentence string (see ProcessAndStripXMLTags) + //const std::string &sentence() const; + // FIXME + //! Get the sentence as a vector of tokens + const std::vector& GetWords() { return words_; } + + // TODO + //! Get the node collection (see ProcessAndStripXMLTags) + const SyntaxNodeCollection &node_collection() const; + + // TODO + //! Get the label set (see ProcessAndStripXMLTags) + const std::set &label_set() const; + + // TODO + //! Get the top label set (see ProcessAndStripXMLTags) + const std::map &top_label_set() const; + + // FIXME const SyntaxNodeCollection &GetNodeCollection() const { return node_collection_; } diff --git a/phrase-extract/syntax-common/xml_tree_writer.cc b/phrase-extract/syntax-common/xml_tree_writer.cc new file mode 100644 index 000000000..3c16cb2eb --- /dev/null +++ b/phrase-extract/syntax-common/xml_tree_writer.cc @@ -0,0 +1,82 @@ +#include "xml_tree_writer.h" + +#include +#include +#include +#include + +#include "SyntaxTree.h" +#include "XmlTree.h" + + +namespace MosesTraining { +namespace Syntax { + +void XmlTreeWriter::Write(const SyntaxTree &tree) const { + assert(!tree.IsLeaf()); + + // Opening tag + out_ << "first != "label") { + out_ << " " << p->first << "=\"" << p->second << "\""; + } + } + out_ << ">"; + + // Children + for (std::vector::const_iterator p = tree.children().begin(); + p != tree.children().end(); ++p) { + SyntaxTree &child = **p; + if (child.IsLeaf()) { + out_ << " " << Escape(child.value().GetLabel()); + } else { + out_ << " "; + Write(child); + } + } + + // Closing tag + out_ << " "; + + if (tree.parent() == 0) { + out_ << std::endl; + } +} + +// Escapes XML special characters. +std::string XmlTreeWriter::Escape(const std::string &s) const { + if (!escape_) { + return s; + } + std::string t; + std::size_t len = s.size(); + t.reserve(len); + for (std::size_t i = 0; i < len; ++i) { + if (s[i] == '<') { + t += "<"; + } else if (s[i] == '>') { + t += ">"; + } else if (s[i] == '[') { + t += "["; + } else if (s[i] == ']') { + t += "]"; + } else if (s[i] == '|') { + t += "|"; + } else if (s[i] == '&') { + t += "&"; + } else if (s[i] == '\'') { + t += "'"; + } else if (s[i] == '"') { + t += """; + } else { + t += s[i]; + } + } + return t; +} + +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/xml_tree_writer.h b/phrase-extract/syntax-common/xml_tree_writer.h new file mode 100644 index 000000000..b39d01fab --- /dev/null +++ b/phrase-extract/syntax-common/xml_tree_writer.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +#include "SyntaxTree.h" + +namespace MosesTraining { +namespace Syntax { + +class XmlTreeWriter { + public: + XmlTreeWriter(std::ostream &out, bool escape=true) + : out_(out) + , escape_(escape) {} + + void Write(const SyntaxTree &) const; + + private: + std::string Escape(const std::string &) const; + + std::ostream &out_; + bool escape_; +}; + +} // namespace Syntax +} // namespace MosesTraining From 2e21f051f217a6b835433cbc456bdcc841187ec0 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 10:05:36 +0100 Subject: [PATCH 2/6] Ongoing moses/phrase-extract refactoring --- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 43 +++++++-------- .../filter-rule-table/FilterRuleTable.cpp | 4 +- phrase-extract/pcfg-extract/pcfg_extract.cc | 4 +- phrase-extract/pcfg-score/pcfg_score.cc | 4 +- .../syntax-common/xml_tree_parser.cc | 15 ++---- .../syntax-common/xml_tree_parser.h | 53 ++++++++----------- 6 files changed, 49 insertions(+), 74 deletions(-) diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 2293371ac..c48a37367 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -119,14 +119,6 @@ int ExtractGHKM::Main(int argc, char *argv[]) OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream); } - // Target label sets for producing glue grammar. - std::set targetLabelSet; - std::map targetTopLabelSet; - - // Source label sets for producing glue grammar. - std::set sourceLabelSet; - std::map sourceTopLabelSet; - // Word count statistics for producing unknown word labels. std::map targetWordCount; std::map targetWordLabel; @@ -139,8 +131,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string sourceLine; std::string alignmentLine; Alignment alignment; - Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); - Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); + Syntax::XmlTreeParser targetXmlTreeParser; + Syntax::XmlTreeParser sourceXmlTreeParser; ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options); StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; @@ -194,7 +186,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) } Error(oss.str()); } - sourceTokens = sourceXmlTreeParser.GetWords(); + sourceTokens = sourceXmlTreeParser.words(); } // Read word alignments. @@ -240,7 +232,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Initialize phrase orientation scoring object PhraseOrientation phraseOrientation(sourceTokens.size(), - targetXmlTreeParser.GetWords().size(), alignment); + targetXmlTreeParser.words().size(), alignment); // Write the rules, subject to scope pruning. const std::vector &targetNodes = graph.GetTargetNodes(); @@ -272,7 +264,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // SCFG output. ScfgRule *r = 0; if (options.sourceLabels) { - r = new ScfgRule(**q, &sourceXmlTreeParser.GetNodeCollection()); + r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection()); } else { r = new ScfgRule(**q); } @@ -315,14 +307,14 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::map sourceLabels; if (options.sourceLabels && !options.sourceLabelSetFile.empty()) { - - sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side) - sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side) - sourceLabelSet.insert("TOPLABEL"); // as used in the glue grammar - sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar + std::set extendedLabelSet = sourceXmlTreeParser.label_set(); + extendedLabelSet.insert("XLHS"); // non-matching label (left-hand side) + extendedLabelSet.insert("XRHS"); // non-matching label (right-hand side) + extendedLabelSet.insert("TOPLABEL"); // as used in the glue grammar + extendedLabelSet.insert("SOMELABEL"); // as used in the glue grammar size_t index = 0; - for (std::set::const_iterator iter=sourceLabelSet.begin(); - iter!=sourceLabelSet.end(); ++iter, ++index) { + for (std::set::const_iterator iter=extendedLabelSet.begin(); + iter!=extendedLabelSet.end(); ++iter, ++index) { sourceLabels.insert(std::pair(*iter,index)); } WriteSourceLabelSet(sourceLabels, sourceLabelSetStream); @@ -332,14 +324,18 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::map strippedTargetTopLabelSet; if (options.stripBitParLabels && (!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) { - StripBitParLabels(targetLabelSet, targetTopLabelSet, strippedTargetLabelSet, strippedTargetTopLabelSet); + StripBitParLabels(targetXmlTreeParser.label_set(), + targetXmlTreeParser.top_label_set(), + strippedTargetLabelSet, strippedTargetTopLabelSet); } if (!options.glueGrammarFile.empty()) { if (options.stripBitParLabels) { WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream); } else { - WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream); + WriteGlueGrammar(targetXmlTreeParser.label_set(), + targetXmlTreeParser.top_label_set(), + sourceLabels, options, glueGrammarStream); } } @@ -355,7 +351,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) if (options.stripBitParLabels) { WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream); } else { - WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream); + WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(), + unknownWordSoftMatchesStream); } } diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp index 0c6f132f8..32d2019cf 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp +++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp @@ -126,9 +126,7 @@ void FilterRuleTable::ReadTestSet( void FilterRuleTable::ReadTestSet( std::istream &input, std::vector > &sentences) { - std::set labelSet; - std::map topLabelSet; - XmlTreeParser parser(labelSet, topLabelSet); + XmlTreeParser parser; int lineNum = 0; std::string line; while (std::getline(input, line)) { diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 8e7a40e07..87419edb7 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -60,9 +60,7 @@ int PcfgExtract::Main(int argc, char *argv[]) Vocabulary non_term_vocab; RuleExtractor rule_extractor(non_term_vocab); RuleCollection rule_collection; - std::set label_set; - std::map top_label_set; - XmlTreeParser parser(label_set, top_label_set); + XmlTreeParser parser; std::string line; std::size_t line_num = 0; std::auto_ptr tree; diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index d656d2882..e11f73f70 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -66,9 +66,7 @@ int PcfgScore::Main(int argc, char *argv[]) // Score corpus according to PCFG. TreeScorer scorer(pcfg, non_term_vocab); - std::set label_set; - std::map top_label_set; - XmlTreeParser parser(label_set, top_label_set); + XmlTreeParser parser; XmlTreeWriter writer(std::cout); std::string line; std::size_t line_num = 0; diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 6eeb110e9..34f566a03 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -10,23 +10,18 @@ #include "XmlException.h" #include "XmlTree.h" +#include "exception.h" + namespace MosesTraining { namespace Syntax { -XmlTreeParser::XmlTreeParser(std::set &labelSet, - std::map &topLabelSet) - : label_set_(labelSet) - , top_label_set_(topLabelSet) -{ -} - std::auto_ptr XmlTreeParser::Parse(const std::string &line, bool unescape) { - line_ = line; + sentence_ = line; node_collection_.Clear(); try { - if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, + if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_, top_label_set_, unescape)) { throw Exception(""); } @@ -34,7 +29,7 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line, throw Exception(e.getMsg()); } std::auto_ptr root = node_collection_.ExtractTree(); - words_ = util::tokenize(line_); + words_ = util::tokenize(sentence_); AttachWords(words_, *root); return root; } diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index 0f671c65a..48ea056b8 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -6,12 +6,9 @@ #include #include -#include "SyntaxNode.h" #include "SyntaxNodeCollection.h" #include "SyntaxTree.h" -#include "exception.h" - namespace MosesTraining { namespace Syntax { @@ -25,44 +22,36 @@ namespace Syntax { */ class XmlTreeParser { public: - XmlTreeParser(std::set &, std::map &); - //! Parse a single sentence and return a SyntaxTree (with words attached). - std::auto_ptr Parse(const std::string &, bool=false); + std::auto_ptr Parse(const std::string &, bool unescape=false); - // TODO - //! Get the sentence string (see ProcessAndStripXMLTags) - //const std::string &sentence() const; + //! Get the sentence string (as returned by ProcessAndStripXMLTags). + const std::string &sentence() const { return sentence_; } - // FIXME - //! Get the sentence as a vector of tokens - const std::vector& GetWords() { return words_; } + //! Get the sentence as a vector of words. + const std::vector &words() const { return words_; } - // TODO - //! Get the node collection (see ProcessAndStripXMLTags) - const SyntaxNodeCollection &node_collection() const; - - // TODO - //! Get the label set (see ProcessAndStripXMLTags) - const std::set &label_set() const; - - // TODO - //! Get the top label set (see ProcessAndStripXMLTags) - const std::map &top_label_set() const; - - // FIXME - const SyntaxNodeCollection &GetNodeCollection() const { + //! Get the node collection (as returned by ProcessAndStripXMLTags). + const SyntaxNodeCollection &node_collection() const { return node_collection_; } - private: - std::set &label_set_; - std::map &top_label_set_; - std::string line_; - SyntaxNodeCollection node_collection_; - std::vector words_; + //! Get the label set (as returned by ProcessAndStripXMLTags). + const std::set &label_set() const { return label_set_; } + //! Get the top label set (as returned by ProcessAndStripXMLTags). + const std::map &top_label_set() const { + return top_label_set_; + } + + private: void AttachWords(const std::vector &, SyntaxTree &); + + std::string sentence_; + SyntaxNodeCollection node_collection_; + std::set label_set_; + std::map top_label_set_; + std::vector words_; }; } // namespace Syntax From 5e09d3dc71ab8391c651418c01aa5c324e53683b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 10:33:46 +0100 Subject: [PATCH 3/6] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNode.h | 25 +------------- phrase-extract/SyntaxNodeCollection.cpp | 43 ------------------------- phrase-extract/SyntaxNodeCollection.h | 10 +----- phrase-extract/XmlTree.cpp | 5 --- phrase-extract/extract-rules-main.cpp | 19 ++++++++--- 5 files changed, 17 insertions(+), 85 deletions(-) diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index 5f57e1790..883f9724f 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -32,9 +32,6 @@ class SyntaxNode protected: int m_start, m_end; std::string m_label; - std::vector< SyntaxNode* > m_children; - SyntaxNode* m_parent; - float m_pcfgScore; public: typedef std::map AttributeMap; @@ -43,9 +40,7 @@ public: SyntaxNode( int startPos, int endPos, std::string label ) :m_start(startPos) ,m_end(endPos) - ,m_label(label) - ,m_parent(0) - ,m_pcfgScore(0.0f) { + ,m_label(label) { } int GetStart() const { return m_start; @@ -56,24 +51,6 @@ public: std::string GetLabel() const { return m_label; } - float GetPcfgScore() const { - return m_pcfgScore; - } - void SetPcfgScore(float score) { - m_pcfgScore = score; - } - SyntaxNode *GetParent() { - return m_parent; - } - void SetParent(SyntaxNode *parent) { - m_parent = parent; - } - void AddChild(SyntaxNode* child) { - m_children.push_back(child); - } - const std::vector< SyntaxNode* > &GetChildren() const { - return m_children; - } }; } // namespace MosesTraining diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 60a2f6c2f..e1c9c44e1 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -33,7 +33,6 @@ SyntaxNodeCollection::~SyntaxNodeCollection() void SyntaxNodeCollection::Clear() { - m_top = 0; // loop through all m_nodes, delete them for(size_t i=0; i& SyntaxNodeCollection::GetNodes( int startPos, return endIndex->second; } -void SyntaxNodeCollection::ConnectNodes() -{ - typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; - - SyntaxNode *prev = 0; - // Iterate over all start indices from lowest to highest. - for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) { - const SyntaxTreeIndex2 &inner = p->second; - // Iterate over all end indices from highest to lowest. - for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) { - const std::vector &nodes = q->second; - // Iterate over all nodes that cover the same span in order of tree - // depth, top-most first. - for (std::vector::const_reverse_iterator r = nodes.rbegin(); - r != nodes.rend(); ++r) { - SyntaxNode *node = *r; - if (!prev) { - // node is the root. - m_top = node; - node->SetParent(0); - } else if (prev->GetStart() == node->GetStart()) { - // prev is the parent of node. - assert(prev->GetEnd() >= node->GetEnd()); - node->SetParent(prev); - prev->AddChild(node); - } else { - // prev is a descendant of node's parent. The lowest common - // ancestor of prev and node will be node's parent. - SyntaxNode *ancestor = prev->GetParent(); - while (ancestor->GetEnd() < node->GetEnd()) { - ancestor = ancestor->GetParent(); - } - assert(ancestor); - node->SetParent(ancestor); - ancestor->AddChild(node); - } - prev = node; - } - } - } -} - std::auto_ptr SyntaxNodeCollection::ExtractTree() { std::map nodeToTree; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index a0d19841c..c8ca67d3d 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -38,7 +38,6 @@ class SyntaxNodeCollection { protected: std::vector< SyntaxNode* > m_nodes; - SyntaxNode* m_top; typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2; typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2; @@ -49,18 +48,12 @@ protected: std::vector< SyntaxNode* > m_emptyNode; public: - SyntaxNodeCollection() - : m_top(0) // m_top doesn't get set unless ConnectNodes is called. - , m_size(0) {} + SyntaxNodeCollection() : m_size(0) {} ~SyntaxNodeCollection(); SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); - SyntaxNode *GetTop() { - return m_top; - } - ParentNodes Parse(); bool HasNode( int startPos, int endPos ) const; const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; @@ -70,7 +63,6 @@ public: size_t GetNumWords() const { return m_size; } - void ConnectNodes(); void Clear(); std::auto_ptr ExtractTree(); diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index d3c5da900..ffbbd453a 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -398,10 +398,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, string label = ParseXmlTagAttribute(tagContent,"label"); labelCollection.insert( label ); - string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg"); - float pcfgScore = pcfgString == "" ? 0.0f - : std::atof(pcfgString.c_str()); - // report what we have processed so far if (0) { cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; @@ -409,7 +405,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); - node->SetPcfgScore(pcfgScore); ParseXmlTagAttributes(tagContent, node->attributes); } } diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 825f12d89..8f1ff758b 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -110,6 +110,8 @@ void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence ); void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection); void writeUnknownWordLabel(const string &); +double getPcfgScore(const SyntaxNode &); + int main(int argc, char* argv[]) { @@ -564,8 +566,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int } if (m_options.pcfgScore) { - double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore(); - logPCFGScore -= score; + logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]); } currPos = hole.GetEnd(1); @@ -689,7 +690,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS // target if (m_options.pcfgScore) { - double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore(); + double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]); rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS) + " [" + targetLabel + "]"; rule.pcfgScore = std::exp(logPCFGScore); @@ -973,7 +974,7 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count rule.target += "[" + targetLabel + "]"; if (m_options.pcfgScore) { - double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore(); + double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[0]); rule.pcfgScore = std::exp(logPCFGScore); } @@ -1194,3 +1195,13 @@ void writeUnknownWordLabel(const string & fileName) outFile.close(); } + +double getPcfgScore(const SyntaxNode &node) +{ + double score = 0.0f; + SyntaxNode::AttributeMap::const_iterator p = node.attributes.find("pcfg"); + if (p != node.attributes.end()) { + score = std::atof(p->second.c_str()); + } + return score; +} From ed321791a75c6177b218a0098d184c308bc9c561 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 11:10:45 +0100 Subject: [PATCH 4/6] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNode.h | 36 +++++---------- phrase-extract/SyntaxNodeCollection.cpp | 8 ++-- phrase-extract/XmlTree.cpp | 2 +- .../extract-ghkm/AlignmentGraph.cpp | 3 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 6 +-- phrase-extract/extract-ghkm/ScfgRule.cpp | 2 +- phrase-extract/extract-rules-main.cpp | 16 +++---- .../filter-rule-table/TreeTsgFilter.cpp | 2 +- phrase-extract/pcfg-extract/rule_extractor.cc | 4 +- phrase-extract/pcfg-score/tree_scorer.cc | 4 +- phrase-extract/relax-parse-main.cpp | 44 +++++++++---------- .../syntax-common/xml_tree_parser.cc | 6 +-- .../syntax-common/xml_tree_writer.cc | 4 +- 13 files changed, 62 insertions(+), 75 deletions(-) diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index 883f9724f..f38e94713 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -20,37 +20,23 @@ #pragma once #include -#include #include -#include -namespace MosesTraining -{ +namespace MosesTraining { -class SyntaxNode -{ -protected: - int m_start, m_end; - std::string m_label; -public: +struct SyntaxNode { typedef std::map AttributeMap; - AttributeMap attributes; + SyntaxNode(const std::string &label_, int start_, int end_) + : label(label_) + , start(start_) + , end(end_) { + } - SyntaxNode( int startPos, int endPos, std::string label ) - :m_start(startPos) - ,m_end(endPos) - ,m_label(label) { - } - int GetStart() const { - return m_start; - } - int GetEnd() const { - return m_end; - } - std::string GetLabel() const { - return m_label; - } + std::string label; + int start; + int end; + AttributeMap attributes; }; } // namespace MosesTraining diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index e1c9c44e1..7421cc0ed 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -44,7 +44,7 @@ void SyntaxNodeCollection::Clear() SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, const std::string &label) { - SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); + SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); m_size = std::max(endPos+1, m_size); @@ -141,16 +141,16 @@ std::auto_ptr SyntaxNodeCollection::ExtractTree() // node is the root. root = tree; tree->parent() = 0; - } else if (prevNode->GetStart() == node->GetStart()) { + } else if (prevNode->start == node->start) { // prevNode is the parent of node. - assert(prevNode->GetEnd() >= node->GetEnd()); + assert(prevNode->end >= node->end); tree->parent() = prevTree; prevTree->children().push_back(tree); } else { // prevNode is a descendant of node's parent. The lowest common // ancestor of prevNode and node will be node's parent. SyntaxTree *ancestor = prevTree->parent(); - while (ancestor->value().GetEnd() < tree->value().GetEnd()) { + while (ancestor->value().end < tree->value().end) { ancestor = ancestor->parent(); } assert(ancestor); diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index ffbbd453a..d8b77b6e6 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -419,7 +419,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 ); for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { SyntaxNode *n = *node; - const string &label = n->GetLabel(); + const string &label = n->label; if (topLabelCollection.find( label ) == topLabelCollection.end()) topLabelCollection[ label ] = 0; topLabelCollection[ label ]++; diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 1a3c23de5..7c179295f 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -213,7 +214,7 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) { NodeType nodeType = (root->IsLeaf()) ? TARGET : TREE; - std::auto_ptr n(new Node(root->value().GetLabel(), nodeType)); + std::auto_ptr n(new Node(root->value().label, nodeType)); if (nodeType == TREE) { float score = 0.0f; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index c48a37367..c96cda146 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -813,7 +813,7 @@ void ExtractGHKM::CollectWordLabelCounts( for (SyntaxTree::ConstLeafIterator p(root); p != SyntaxTree::ConstLeafIterator(); ++p) { const SyntaxTree &leaf = *p; - const std::string &word = leaf.value().GetLabel(); + const std::string &word = leaf.value().label; const SyntaxTree *ancestor = leaf.parent(); // If unary rule elimination is enabled and this word is at the end of a // chain of unary rewrites, e.g. @@ -825,7 +825,7 @@ void ExtractGHKM::CollectWordLabelCounts( ancestor->parent()->children().size() == 1) { ancestor = ancestor->parent(); } - const std::string &label = ancestor->value().GetLabel(); + const std::string &label = ancestor->value().label; ++wordCount[word]; wordLabel[word] = label; } @@ -837,7 +837,7 @@ std::vector ExtractGHKM::ReadTokens(const SyntaxTree &root) const for (SyntaxTree::ConstLeafIterator p(root); p != SyntaxTree::ConstLeafIterator(); ++p) { const SyntaxTree &leaf = *p; - const std::string &word = leaf.value().GetLabel(); + const std::string &word = leaf.value().label; tokens.push_back(word); } return tokens; diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index a6fc19dd9..1a49c862e 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -144,7 +144,7 @@ void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection, sourceNodeCollection->GetNodes(span.first,span.second); if (!sourceLabels.empty()) { // store the topmost matching label from the source syntax tree - m_sourceLabels.push_back(sourceLabels.back()->GetLabel()); + m_sourceLabels.push_back(sourceLabels.back()->label); } } else { // no matching source-side syntactic constituent: store nonMatchingLabel diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 8f1ff758b..e6fff965d 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -507,7 +507,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, int labelI = labelIndex[ 2+holeCount+holeTotal ]; string label = m_options.sourceSyntax ? - m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X"; + m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->label : "X"; hole.SetLabel(label, 0); currPos = hole.GetEnd(0); @@ -550,7 +550,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int int labelI = labelIndex[ 2+holeCount ]; string targetLabel; if (m_options.targetSyntax) { - targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel(); + targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { @@ -675,7 +675,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS // phrase labels string targetLabel; if (m_options.targetSyntax) { - targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel(); + targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { @@ -683,7 +683,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS } string sourceLabel = m_options.sourceSyntax ? - m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X"; + m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->label : "X"; // create non-terms on the source side preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex); @@ -947,13 +947,13 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count // phrase labels string targetLabel,sourceLabel; if (m_options.targetSyntax && m_options.conditionOnTargetLhs) { - sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel(); + sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label; } else { sourceLabel = m_options.sourceSyntax ? - m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X"; + m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X"; if (m_options.targetSyntax) { - targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel(); + targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { @@ -1166,7 +1166,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence ) const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti); if (labels.size() > 0) { wordCount[ word ]++; - wordLabel[ word ] = labels[0]->GetLabel(); + wordLabel[ word ] = labels[0]->label; } } } diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp index 17a8dcb22..b9c58228d 100644 --- a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp +++ b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp @@ -27,7 +27,7 @@ TreeTsgFilter::TreeTsgFilter( TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s) { - IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel())); + IdTree *t = new IdTree(m_testVocab.Insert(s.value().label)); const std::vector &sChildren = s.children(); std::vector &tChildren = t->children(); tChildren.reserve(sChildren.size()); diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc index 39da54ef2..f20f2d978 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.cc +++ b/phrase-extract/pcfg-extract/rule_extractor.cc @@ -37,7 +37,7 @@ void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const return; } - std::size_t lhs = non_term_vocab_.Insert(tree.value().GetLabel()); + std::size_t lhs = non_term_vocab_.Insert(tree.value().label); std::vector rhs; const std::vector &children = tree.children(); @@ -45,7 +45,7 @@ void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { const SyntaxTree &child = **p; - rhs.push_back(non_term_vocab_.Insert(child.value().GetLabel())); + rhs.push_back(non_term_vocab_.Insert(child.value().label)); Extract(child, rc); } rc.Add(lhs, rhs); diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc index 61ae16e4c..3c6b6b0c8 100644 --- a/phrase-extract/pcfg-score/tree_scorer.cc +++ b/phrase-extract/pcfg-score/tree_scorer.cc @@ -58,13 +58,13 @@ bool TreeScorer::CalcScores(SyntaxTree &root) std::vector key; key.reserve(children.size()+1); - key.push_back(non_term_vocab_.Lookup(root.value().GetLabel())); + key.push_back(non_term_vocab_.Lookup(root.value().label)); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { SyntaxTree *child = *p; assert(!child->IsLeaf()); - key.push_back(non_term_vocab_.Lookup(child->value().GetLabel())); + key.push_back(non_term_vocab_.Lookup(child->value().label)); if (!CalcScores(*child)) { return false; } diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 5bca886bf..4b5c2d573 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -118,9 +118,9 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words ) // output tree nodes vector< SyntaxNode* > nodes = tree.GetAllNodes(); for( size_t i=0; iGetStart() - << "-" << nodes[i]->GetEnd() - << "\" label=\"" << nodes[i]->GetLabel() + cout << " start + << "-" << nodes[i]->end + << "\" label=\"" << nodes[i]->label << "\"/>"; } cout << endl; @@ -133,7 +133,7 @@ void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) if (point.size() > 3) { const vector< SyntaxNode* >& topNodes = tree.GetNodes( point[0], point[point.size()-1]-1); - string topLabel = topNodes[0]->GetLabel(); + string topLabel = topNodes[0]->label; for(size_t i=2; i& topNodes = tree.GetNodes( point[0], endPoint); - string topLabel = topNodes[0]->GetLabel(); + string topLabel = topNodes[0]->label; for(size_t i=1; iGetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl; + // cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->label << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->label << endl; newTree.AddNode( point[i],point[i+2]-1, - tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel() + tree.GetNodes(point[i ],point[i+1]-1)[0]->label + "+" + - tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() ); + tree.GetNodes(point[i+1],point[i+2]-1)[0]->label); } } if (point.size() >= 4) { int ps = point.size(); - string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel(); + string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->label; - // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl; + // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->label << endl; newTree.AddNode( point[1],point[ps-1]-1, topLabel + "\\" + - tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() ); + tree.GetNodes(point[0],point[1]-1)[0]->label ); - // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl; + // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label << endl; newTree.AddNode( point[0],point[ps-2]-1, topLabel + "/" + - tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() ); + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label ); } } @@ -219,12 +219,12 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) for(int mid=start+1; mid<=end && !done; mid++) { if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) { - // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid, end )[0]->GetLabel() << endl; + // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->label << "++" << tree.GetNodes(mid, end )[0]->label << endl; newTree.AddNode( start, end, - tree.GetNodes(start,mid-1)[0]->GetLabel() + tree.GetNodes(start,mid-1)[0]->label + "++" + - tree.GetNodes(mid, end )[0]->GetLabel() ); + tree.GetNodes(mid, end )[0]->label ); done = true; } } @@ -234,9 +234,9 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) for(int postEnd=end+1; postEndGetLabel() + tree.GetNodes(start,postEnd)[0]->label + "//" + - tree.GetNodes(end+1,postEnd)[0]->GetLabel() ); + tree.GetNodes(end+1,postEnd)[0]->label ); done = true; } } @@ -245,11 +245,11 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) // if matching a constituent A left-minus constituent B: use A\\B for(int preStart=start-1; preStart>=0; preStart--) { if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) { - // cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->GetLabel() << "\\\\" <GetLabel() << endl; + // cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->label << "\\\\" <label << endl; newTree.AddNode( start, end, - tree.GetNodes(preStart,end )[0]->GetLabel() + tree.GetNodes(preStart,end )[0]->label + "\\\\" + - tree.GetNodes(preStart,start-1)[0]->GetLabel() ); + tree.GetNodes(preStart,start-1)[0]->label ); done = true; } } @@ -268,6 +268,6 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) // adding all new nodes vector< SyntaxNode* > nodes = newTree.GetAllNodes(); for( size_t i=0; iGetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel()); + tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label); } } diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 34f566a03..8bd511522 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -47,15 +47,15 @@ void XmlTreeParser::AttachWords(const std::vector &words, for (std::vector::iterator p = leaves.begin(); p != leaves.end(); ++p) { SyntaxTree *leaf = *p; - const int start = leaf->value().GetStart(); - const int end = leaf->value().GetEnd(); + const int start = leaf->value().start; + const int end = leaf->value().end; if (start != end) { std::ostringstream msg; msg << "leaf node covers multiple words (" << start << "-" << end << "): this is currently unsupported"; throw Exception(msg.str()); } - SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); + SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q++, start, end)); leaf->children().push_back(newLeaf); newLeaf->parent() = leaf; } diff --git a/phrase-extract/syntax-common/xml_tree_writer.cc b/phrase-extract/syntax-common/xml_tree_writer.cc index 3c16cb2eb..d17937fa8 100644 --- a/phrase-extract/syntax-common/xml_tree_writer.cc +++ b/phrase-extract/syntax-common/xml_tree_writer.cc @@ -16,7 +16,7 @@ void XmlTreeWriter::Write(const SyntaxTree &tree) const { assert(!tree.IsLeaf()); // Opening tag - out_ << " Date: Wed, 3 Jun 2015 14:09:49 +0100 Subject: [PATCH 5/6] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 20 ++++++----- phrase-extract/SyntaxNodeCollection.h | 44 ++++++++++++++++--------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 7421cc0ed..356c49bf4 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -47,7 +47,7 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); - m_size = std::max(endPos+1, m_size); + m_numWords = std::max(endPos+1, m_numWords); return newNode; } @@ -56,8 +56,8 @@ ParentNodes SyntaxNodeCollection::Parse() ParentNodes parents; // looping through all spans of size >= 2 - for( int length=2; length<=m_size; length++ ) { - for( int startPos = 0; startPos <= m_size-length; startPos++ ) { + for( int length=2; length<=m_numWords; length++ ) { + for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) { if (HasNode( startPos, startPos+length-1 )) { // processing one (parent) span @@ -96,13 +96,14 @@ bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const return GetNodes( startPos, endPos).size() > 0; } -const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( + int startPos, int endPos ) const { - SyntaxTreeIndexIterator startIndex = m_index.find( startPos ); + NodeIndex::const_iterator startIndex = m_index.find( startPos ); if (startIndex == m_index.end() ) return m_emptyNode; - SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos ); + InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos ); if (endIndex == startIndex->second.end()) return m_emptyNode; @@ -120,14 +121,15 @@ std::auto_ptr SyntaxNodeCollection::ExtractTree() } // Connect the SyntaxTrees. - typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; + typedef NodeIndex::const_iterator OuterIterator; + typedef InnerNodeIndex::const_reverse_iterator InnerIterator; SyntaxTree *root = 0; SyntaxNode *prevNode = 0; SyntaxTree *prevTree = 0; // Iterate over all start indices from lowest to highest. - for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) { - const SyntaxTreeIndex2 &inner = p->second; + for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) { + const InnerNodeIndex &inner = p->second; // Iterate over all end indices from highest to lowest. for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) { const std::vector &nodes = q->second; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index c8ca67d3d..060192980 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -34,38 +34,50 @@ namespace MosesTraining typedef std::vector< int > SplitPoints; typedef std::vector< SplitPoints > ParentNodes; +/** A collection of SyntaxNodes organized by start and end position. + * + */ class SyntaxNodeCollection { -protected: - std::vector< SyntaxNode* > m_nodes; - - typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2; - typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2; - typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex; - typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator; - SyntaxTreeIndex m_index; - int m_size; - std::vector< SyntaxNode* > m_emptyNode; - public: - SyntaxNodeCollection() : m_size(0) {} + SyntaxNodeCollection() : m_numWords(0) {} ~SyntaxNodeCollection(); + //! Construct and insert a new SyntaxNode. SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); + // TODO Rename (and move?) ParentNodes Parse(); + + //! Return true iff there are one or more SyntaxNodes with the given span. bool HasNode( int startPos, int endPos ) const; + + //! Lookup the SyntaxNodes for a given span. const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; - const std::vector< SyntaxNode* >& GetAllNodes() { - return m_nodes; - }; + + //! Get a vector of pointers to all SyntaxNodes (unordered). + const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; }; + size_t GetNumWords() const { - return m_size; + return m_numWords; } void Clear(); std::auto_ptr ExtractTree(); + +private: + typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex; + typedef std::map< int, InnerNodeIndex > NodeIndex; + + // Not copyable. + SyntaxNodeCollection(const SyntaxNodeCollection &); + SyntaxNodeCollection &operator=(const SyntaxNodeCollection &); + + std::vector< SyntaxNode* > m_nodes; + NodeIndex m_index; + int m_numWords; + std::vector< SyntaxNode* > m_emptyNode; }; } // namespace MosesTraining From 8653bd81590d1f9f658d9560458dc72d9556e197 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 14:20:00 +0100 Subject: [PATCH 6/6] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 40 ---------------------- phrase-extract/SyntaxNodeCollection.h | 6 ---- phrase-extract/relax-parse-main.cpp | 44 ++++++++++++++++++++++++- phrase-extract/relax-parse.h | 10 ++++-- 4 files changed, 50 insertions(+), 50 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 356c49bf4..0a344fcd7 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -51,46 +51,6 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, return newNode; } -ParentNodes SyntaxNodeCollection::Parse() -{ - ParentNodes parents; - - // looping through all spans of size >= 2 - for( int length=2; length<=m_numWords; length++ ) { - for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) { - if (HasNode( startPos, startPos+length-1 )) { - // processing one (parent) span - - //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":"; - SplitPoints splitPoints; - splitPoints.push_back( startPos ); - //std::cerr << " " << startPos; - - int first = 1; - int covered = 0; - int found_somehing = 1; // break loop if nothing found - while( covered < length && found_somehing ) { - // find largest covering subspan (child) - // starting at last covered position - found_somehing = 0; - for( int midPos=length-first; midPos>covered; midPos-- ) { - if( HasNode( startPos+covered, startPos+midPos-1 ) ) { - covered = midPos; - splitPoints.push_back( startPos+covered ); - // std::cerr << " " << ( startPos+covered ); - first = 0; - found_somehing = 1; - } - } - } - // std::cerr << std::endl; - parents.push_back( splitPoints ); - } - } - } - return parents; -} - bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const { return GetNodes( startPos, endPos).size() > 0; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 060192980..8de151c55 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -31,9 +31,6 @@ namespace MosesTraining { -typedef std::vector< int > SplitPoints; -typedef std::vector< SplitPoints > ParentNodes; - /** A collection of SyntaxNodes organized by start and end position. * */ @@ -47,9 +44,6 @@ public: //! Construct and insert a new SyntaxNode. SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); - // TODO Rename (and move?) - ParentNodes Parse(); - //! Return true iff there are one or more SyntaxNodes with the given span. bool HasNode( int startPos, int endPos ) const; diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 4b5c2d573..f7a2a271b 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -50,7 +50,7 @@ int main(int argc, char* argv[]) // output tree // cerr << "BEFORE:" << endl << tree; - ParentNodes parents = tree.Parse(); + ParentNodes parents = determineSplitPoints(tree); // execute selected grammar relaxation schemes if (leftBinarizeFlag) @@ -271,3 +271,45 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label); } } + +ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl) +{ + ParentNodes parents; + + const std::size_t numWords = nodeColl.GetNumWords(); + + // looping through all spans of size >= 2 + for( int length=2; length<=numWords; length++ ) { + for( int startPos = 0; startPos <= numWords-length; startPos++ ) { + if (nodeColl.HasNode( startPos, startPos+length-1 )) { + // processing one (parent) span + + //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":"; + SplitPoints splitPoints; + splitPoints.push_back( startPos ); + //std::cerr << " " << startPos; + + int first = 1; + int covered = 0; + int found_somehing = 1; // break loop if nothing found + while( covered < length && found_somehing ) { + // find largest covering subspan (child) + // starting at last covered position + found_somehing = 0; + for( int midPos=length-first; midPos>covered; midPos-- ) { + if( nodeColl.HasNode( startPos+covered, startPos+midPos-1 ) ) { + covered = midPos; + splitPoints.push_back( startPos+covered ); + // std::cerr << " " << ( startPos+covered ); + first = 0; + found_somehing = 1; + } + } + } + // std::cerr << std::endl; + parents.push_back( splitPoints ); + } + } + } + return parents; +} diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index a00aa6deb..7c412646a 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -37,10 +37,14 @@ bool leftBinarizeFlag = false; bool rightBinarizeFlag = false; char SAMTLevel = 0; +typedef std::vector< int > SplitPoints; +typedef std::vector< SplitPoints > ParentNodes; + // functions void init(int argc, char* argv[]); +ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &); void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector &words ); -void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); -void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); -void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents ); +void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents ); +void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );