From 60e56efc6bc41f08e7e7ae6251f9ae8ae93e42ad Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Sun, 7 Dec 2014 14:27:51 +0000 Subject: [PATCH] phrase-extract: add syntax-common sub-library And remove some (near-)duplicate code from pcfg-common and score-stsg. --- Jamroot | 2 +- phrase-extract/pcfg-common/Jamfile | 2 +- phrase-extract/pcfg-common/exception.h | 46 ------- phrase-extract/pcfg-common/numbered_set.h | 126 ------------------ phrase-extract/pcfg-common/pcfg.cc | 10 +- phrase-extract/pcfg-common/pcfg.h | 37 ++--- phrase-extract/pcfg-common/pcfg_tree.h | 39 +++--- phrase-extract/pcfg-common/syntax_tree.h | 65 +++------ phrase-extract/pcfg-common/tool.cc | 6 +- phrase-extract/pcfg-common/tool.h | 27 ++-- phrase-extract/pcfg-common/typedef.h | 16 +-- phrase-extract/pcfg-common/xml_tree_parser.cc | 24 ++-- phrase-extract/pcfg-common/xml_tree_parser.h | 23 ++-- phrase-extract/pcfg-common/xml_tree_writer.h | 36 +++-- phrase-extract/pcfg-extract/main.cc | 2 +- phrase-extract/pcfg-extract/options.h | 10 +- phrase-extract/pcfg-extract/pcfg_extract.cc | 9 +- phrase-extract/pcfg-extract/pcfg_extract.h | 15 +-- .../pcfg-extract/rule_collection.cc | 6 +- phrase-extract/pcfg-extract/rule_collection.h | 37 ++--- phrase-extract/pcfg-extract/rule_extractor.cc | 6 +- phrase-extract/pcfg-extract/rule_extractor.h | 21 ++- phrase-extract/pcfg-score/main.cc | 2 +- phrase-extract/pcfg-score/options.h | 10 +- phrase-extract/pcfg-score/pcfg_score.cc | 30 +++-- phrase-extract/pcfg-score/pcfg_score.h | 19 ++- phrase-extract/pcfg-score/tree_scorer.cc | 6 +- phrase-extract/pcfg-score/tree_scorer.h | 17 ++- phrase-extract/relax-parse-main.cpp | 2 +- phrase-extract/score-stsg/Exception.h | 23 ---- phrase-extract/score-stsg/Jamfile | 2 +- phrase-extract/score-stsg/LexicalTable.cpp | 7 +- phrase-extract/score-stsg/LexicalTable.h | 7 +- phrase-extract/score-stsg/Main.cpp | 2 +- phrase-extract/score-stsg/Options.h | 7 +- phrase-extract/score-stsg/RuleGroup.cpp | 7 +- 
phrase-extract/score-stsg/RuleGroup.h | 7 +- phrase-extract/score-stsg/RuleSymbol.h | 7 +- phrase-extract/score-stsg/RuleTableWriter.cpp | 8 +- phrase-extract/score-stsg/RuleTableWriter.h | 11 +- phrase-extract/score-stsg/ScoreStsg.cpp | 29 ++-- phrase-extract/score-stsg/ScoreStsg.h | 19 +-- .../score-stsg/TokenizedRuleHalf.cpp | 7 +- phrase-extract/score-stsg/TokenizedRuleHalf.h | 12 +- phrase-extract/score-stsg/Vocabulary.h | 8 +- phrase-extract/syntax-common/Jamfile | 8 ++ phrase-extract/syntax-common/exception.h | 20 +++ .../numbered_set.h} | 10 +- phrase-extract/syntax-common/string_tree.h | 13 ++ phrase-extract/syntax-common/tree-inl.h | 115 ++++++++++++++++ phrase-extract/syntax-common/tree.h | 91 +++++++++++++ .../tree_fragment_tokenizer.cc} | 10 +- .../tree_fragment_tokenizer.h} | 17 ++- .../tree_fragment_tokenizer_test.cc | 74 ++++++++++ phrase-extract/syntax-common/tree_test.cc | 66 +++++++++ .../syntax-common/xml_tree_parser.cc | 59 ++++++++ .../syntax-common/xml_tree_parser.h | 34 +++++ scripts/other/beautify.perl | 2 + 58 files changed, 805 insertions(+), 528 deletions(-) delete mode 100644 phrase-extract/pcfg-common/exception.h delete mode 100644 phrase-extract/pcfg-common/numbered_set.h delete mode 100644 phrase-extract/score-stsg/Exception.h create mode 100644 phrase-extract/syntax-common/Jamfile create mode 100644 phrase-extract/syntax-common/exception.h rename phrase-extract/{score-stsg/NumberedSet.h => syntax-common/numbered_set.h} (96%) create mode 100644 phrase-extract/syntax-common/string_tree.h create mode 100644 phrase-extract/syntax-common/tree-inl.h create mode 100644 phrase-extract/syntax-common/tree.h rename phrase-extract/{score-stsg/TreeFragmentTokenizer.cpp => syntax-common/tree_fragment_tokenizer.cc} (93%) rename phrase-extract/{score-stsg/TreeFragmentTokenizer.h => syntax-common/tree_fragment_tokenizer.h} (85%) create mode 100644 phrase-extract/syntax-common/tree_fragment_tokenizer_test.cc create mode 100644 
phrase-extract/syntax-common/tree_test.cc create mode 100644 phrase-extract/syntax-common/xml_tree_parser.cc create mode 100644 phrase-extract/syntax-common/xml_tree_parser.h diff --git a/Jamroot b/Jamroot index c20dd1e48..ce14258a5 100644 --- a/Jamroot +++ b/Jamroot @@ -173,7 +173,7 @@ project : requirements ; #Add directories here if you want their incidental targets too (i.e. tests). -build-projects lm util phrase-extract search moses moses/LM mert moses-cmd mira scripts regression-testing ; +build-projects lm util phrase-extract phrase-extract/syntax-common search moses moses/LM mert moses-cmd mira scripts regression-testing ; if [ option.get "with-mm" : : "yes" ] { diff --git a/phrase-extract/pcfg-common/Jamfile b/phrase-extract/pcfg-common/Jamfile index b74b1071d..5669b443e 100644 --- a/phrase-extract/pcfg-common/Jamfile +++ b/phrase-extract/pcfg-common/Jamfile @@ -1 +1 @@ -lib pcfg_common : [ glob *.cc ] ..//deps : .. ; +lib pcfg_common : [ glob *.cc ] ..//syntax-common ..//deps : .. ; diff --git a/phrase-extract/pcfg-common/exception.h b/phrase-extract/pcfg-common/exception.h deleted file mode 100644 index d9266ca36..000000000 --- a/phrase-extract/pcfg-common/exception.h +++ /dev/null @@ -1,46 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_EXCEPTION_H_ -#define PCFG_EXCEPTION_H_ - -#include - -namespace Moses -{ -namespace PCFG -{ - -class Exception -{ -public: - Exception(const char *msg) : msg_(msg) {} - Exception(const std::string &msg) : msg_(msg) {} - const std::string &msg() const { - return msg_; - } -private: - std::string msg_; -}; - -} // namespace PCFG -} // namespace Moses - -#endif diff --git a/phrase-extract/pcfg-common/numbered_set.h b/phrase-extract/pcfg-common/numbered_set.h deleted file mode 100644 index 66e960404..000000000 --- a/phrase-extract/pcfg-common/numbered_set.h +++ /dev/null @@ -1,126 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_NUMBERED_SET_H_ -#define PCFG_NUMBERED_SET_H_ - -#include "exception.h" - -#include - -#include -#include -#include - -namespace Moses -{ -namespace PCFG -{ - -// Stores a set of elements of type T, each of which is allocated an integral -// ID of type I. IDs are contiguous starting at 0. Individual elements cannot -// be removed once inserted (but the whole set can be cleared). -template -class NumberedSet -{ -private: - typedef boost::unordered_map ElementToIdMap; - typedef std::vector IdToElementMap; - -public: - typedef I IdType; - typedef typename IdToElementMap::const_iterator const_iterator; - - NumberedSet() {} - - const_iterator begin() const { - return id_to_element_.begin(); - } - const_iterator end() const { - return id_to_element_.end(); - } - - // Static value - static I NullId() { - return std::numeric_limits::max(); - } - - bool Empty() const { - return id_to_element_.empty(); - } - std::size_t Size() const { - return id_to_element_.size(); - } - - // Insert the given object and return its ID. - I Insert(const T &); - - I Lookup(const T &) const; - const T &Lookup(I) const; - - void Clear(); - -private: - ElementToIdMap element_to_id_; - IdToElementMap id_to_element_; -}; - -template -I NumberedSet::Lookup(const T &s) const -{ - typename ElementToIdMap::const_iterator p = element_to_id_.find(s); - return (p == element_to_id_.end()) ? 
NullId() : p->second; -} - -template -const T &NumberedSet::Lookup(I id) const -{ - if (id < 0 || id >= id_to_element_.size()) { - std::ostringstream msg; - msg << "Value not found: " << id; - throw Exception(msg.str()); - } - return *(id_to_element_[id]); -} - -template -I NumberedSet::Insert(const T &x) -{ - std::pair value(x, id_to_element_.size()); - std::pair result = - element_to_id_.insert(value); - if (result.second) { - // x is a new element. - id_to_element_.push_back(&result.first->first); - } - return result.first->second; -} - -template -void NumberedSet::Clear() -{ - element_to_id_.clear(); - id_to_element_.clear(); -} - -} // namespace PCFG -} // namespace Moses - -#endif diff --git a/phrase-extract/pcfg-common/pcfg.cc b/phrase-extract/pcfg-common/pcfg.cc index 054e20a48..cae6d4763 100644 --- a/phrase-extract/pcfg-common/pcfg.cc +++ b/phrase-extract/pcfg-common/pcfg.cc @@ -19,14 +19,15 @@ #include "pcfg.h" -#include "exception.h" +#include #include #include -#include +#include "syntax-common/exception.h" -namespace Moses { +namespace MosesTraining { +namespace Syntax { namespace PCFG { void Pcfg::Add(const Key &key, double score) { @@ -103,4 +104,5 @@ void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const { } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-common/pcfg.h b/phrase-extract/pcfg-common/pcfg.h index 5398cd97e..c5c04cba4 100644 --- a/phrase-extract/pcfg-common/pcfg.h +++ b/phrase-extract/pcfg-common/pcfg.h @@ -21,21 +21,19 @@ #ifndef PCFG_PCFG_H_ #define PCFG_PCFG_H_ -#include "typedef.h" - #include #include #include #include -namespace Moses -{ -namespace PCFG -{ +#include "typedef.h" -class Pcfg -{ -public: +namespace MosesTraining { +namespace Syntax { +namespace PCFG { + +class Pcfg { + public: typedef std::vector Key; typedef std::map Map; typedef Map::iterator iterator; @@ -43,30 +41,23 @@ public: Pcfg() {} - iterator begin() { - return 
rules_.begin(); - } - const_iterator begin() const { - return rules_.begin(); - } + iterator begin() { return rules_.begin(); } + const_iterator begin() const { return rules_.begin(); } - iterator end() { - return rules_.end(); - } - const_iterator end() const { - return rules_.end(); - } + iterator end() { return rules_.end(); } + const_iterator end() const { return rules_.end(); } void Add(const Key &, double); bool Lookup(const Key &, double &) const; void Read(std::istream &, Vocabulary &); void Write(const Vocabulary &, std::ostream &) const; -private: + private: Map rules_; }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h index d125cad16..ce28eb8dd 100644 --- a/phrase-extract/pcfg-common/pcfg_tree.h +++ b/phrase-extract/pcfg-common/pcfg_tree.h @@ -21,48 +21,40 @@ #ifndef PCFG_PCFG_TREE_H_ #define PCFG_PCFG_TREE_H_ +#include + #include "syntax_tree.h" #include "xml_tree_writer.h" -#include - -namespace Moses -{ -namespace PCFG -{ +namespace MosesTraining { +namespace Syntax { +namespace PCFG { template -class PcfgTreeBase : public SyntaxTreeBase -{ -public: +class PcfgTreeBase : public SyntaxTreeBase { + public: typedef std::string LabelType; typedef SyntaxTreeBase BaseType; PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {} - double score() const { - return score_; - } - void set_score(double s) { - score_ = s; - } + double score() const { return score_; } + void set_score(double s) { score_ = s; } -private: + private: double score_; }; -class PcfgTree : public PcfgTreeBase -{ -public: +class PcfgTree : public PcfgTreeBase { + public: typedef PcfgTreeBase BaseType; PcfgTree(const BaseType::LabelType &label) : BaseType(label) {} }; // Specialise XmlOutputHandler for PcfgTree. 
template<> -class XmlOutputHandler -{ -public: +class XmlOutputHandler { + public: typedef std::map AttributeMap; void GetLabel(const PcfgTree &tree, std::string &label) const { @@ -81,6 +73,7 @@ public: }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h index 93d9dbec9..c0c6eaef9 100644 --- a/phrase-extract/pcfg-common/syntax_tree.h +++ b/phrase-extract/pcfg-common/syntax_tree.h @@ -24,16 +24,14 @@ #include #include -namespace Moses -{ -namespace PCFG -{ +namespace MosesTraining { +namespace Syntax { +namespace PCFG { // Base class for SyntaxTree, AgreementTree, and friends. template -class SyntaxTreeBase -{ -public: +class SyntaxTreeBase { + public: // Constructors SyntaxTreeBase(const T &label) : label_(label) @@ -48,54 +46,33 @@ public: // Destructor virtual ~SyntaxTreeBase(); - const T &label() const { - return label_; - } - const DerivedType *parent() const { - return parent_; - } - DerivedType *parent() { - return parent_; - } - const std::vector &children() const { - return children_; - } - std::vector &children() { - return children_; - } + const T &label() const { return label_; } + const DerivedType *parent() const { return parent_; } + DerivedType *parent() { return parent_; } + const std::vector &children() const { return children_; } + std::vector &children() { return children_; } - void set_label(const T &label) { - label_ = label; - } - void set_parent(DerivedType *parent) { - parent_ = parent; - } - void set_children(const std::vector &c) { - children_ = c; - } + void set_label(const T &label) { label_ = label; } + void set_parent(DerivedType *parent) { parent_ = parent; } + void set_children(const std::vector &c) { children_ = c; } - bool IsLeaf() const { - return children_.empty(); - } + bool IsLeaf() const { return children_.empty(); } bool IsPreterminal() const { return children_.size() == 1 && 
children_[0]->IsLeaf(); } - void AddChild(DerivedType *child) { - children_.push_back(child); - } + void AddChild(DerivedType *child) { children_.push_back(child); } -private: + private: T label_; std::vector children_; DerivedType *parent_; }; template -class SyntaxTree : public SyntaxTreeBase > -{ -public: +class SyntaxTree : public SyntaxTreeBase > { + public: typedef SyntaxTreeBase > BaseType; SyntaxTree(const T &label) : BaseType(label) {} SyntaxTree(const T &label, const std::vector &children) @@ -103,14 +80,14 @@ public: }; template -SyntaxTreeBase::~SyntaxTreeBase() -{ +SyntaxTreeBase::~SyntaxTreeBase() { for (std::size_t i = 0; i < children_.size(); ++i) { delete children_[i]; } } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-common/tool.cc b/phrase-extract/pcfg-common/tool.cc index bebd220e1..f54e07a12 100644 --- a/phrase-extract/pcfg-common/tool.cc +++ b/phrase-extract/pcfg-common/tool.cc @@ -21,7 +21,8 @@ #include -namespace Moses { +namespace MosesTraining { +namespace Syntax { namespace PCFG { std::istream &Tool::OpenInputOrDie(const std::string &filename) { @@ -77,4 +78,5 @@ void Tool::OpenNamedOutputOrDie(const std::string &filename, } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-common/tool.h b/phrase-extract/pcfg-common/tool.h index aada036e3..2c903a11e 100644 --- a/phrase-extract/pcfg-common/tool.h +++ b/phrase-extract/pcfg-common/tool.h @@ -21,30 +21,26 @@ #ifndef PCFG_TOOL_H_ #define PCFG_TOOL_H_ -#include - #include #include #include #include -namespace Moses -{ -namespace PCFG -{ +#include -class Tool -{ -public: +namespace MosesTraining { +namespace Syntax { +namespace PCFG { + +class Tool { + public: virtual ~Tool() {} - const std::string &name() const { - return name_; - } + const std::string &name() const { return name_; } virtual int Main(int argc, char *argv[]) = 0; 
-protected: + protected: Tool(const std::string &name) : name_(name) {} // Returns the boost::program_options style that should be used by all tools. @@ -82,7 +78,7 @@ protected: // the file cannot be opened for writing. void OpenNamedOutputOrDie(const std::string &, std::ofstream &); -private: + private: std::string name_; std::istream *input_ptr_; std::ifstream input_file_stream_; @@ -91,6 +87,7 @@ private: }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h index ce3e0423b..e738163df 100644 --- a/phrase-extract/pcfg-common/typedef.h +++ b/phrase-extract/pcfg-common/typedef.h @@ -21,19 +21,19 @@ #ifndef PCFG_TYPEDEF_H_ #define PCFG_TYPEDEF_H_ -#include "numbered_set.h" -#include "syntax_tree.h" - #include -namespace Moses -{ -namespace PCFG -{ +#include "syntax-common/numbered_set.h" +#include "syntax_tree.h" + +namespace MosesTraining { +namespace Syntax { +namespace PCFG { typedef NumberedSet Vocabulary; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc index b6c1da177..3d9291994 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.cc +++ b/phrase-extract/pcfg-common/xml_tree_parser.cc @@ -19,25 +19,23 @@ #include "xml_tree_parser.h" -#include "exception.h" +#include +#include + #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" -#include -#include +#include "syntax-common/exception.h" -using namespace MosesTraining; - -namespace Moses { +namespace MosesTraining { +namespace Syntax { namespace PCFG { -XmlTreeParser::XmlTreeParser() -{ +XmlTreeParser::XmlTreeParser() { } -std::auto_ptr XmlTreeParser::Parse(const std::string &line) -{ +std::auto_ptr XmlTreeParser::Parse(const std::string &line) { m_line = line; m_tree.Clear(); try { @@ -60,8 
+58,7 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) // Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree. std::auto_ptr XmlTreeParser::ConvertTree( const SyntaxNode &tree, - const std::vector &words) -{ + const std::vector &words) { std::auto_ptr root(new PcfgTree(tree.GetLabel())); const std::vector &children = tree.GetChildren(); if (children.empty()) { @@ -87,4 +84,5 @@ std::auto_ptr XmlTreeParser::ConvertTree( } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h index 7eec14033..675a112d8 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ b/phrase-extract/pcfg-common/xml_tree_parser.h @@ -21,28 +21,26 @@ #ifndef PCFG_XML_TREE_PARSER_H_ #define PCFG_XML_TREE_PARSER_H_ -#include "pcfg_tree.h" -#include "SyntaxTree.h" - #include #include #include #include #include -namespace Moses -{ -namespace PCFG -{ +#include "pcfg_tree.h" +#include "SyntaxTree.h" + +namespace MosesTraining { +namespace Syntax { +namespace PCFG { // Parses a string in Moses' XML parse tree format and returns a PcfgTree // object. 
-class XmlTreeParser -{ -public: +class XmlTreeParser { + public: XmlTreeParser(); std::auto_ptr Parse(const std::string &); -private: + private: std::auto_ptr ConvertTree(const MosesTraining::SyntaxNode &, const std::vector &); @@ -54,6 +52,7 @@ private: }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h index e09942279..8582e544f 100644 --- a/phrase-extract/pcfg-common/xml_tree_writer.h +++ b/phrase-extract/pcfg-common/xml_tree_writer.h @@ -21,10 +21,6 @@ #ifndef PCFG_XML_TREE_WRITER_H_ #define PCFG_XML_TREE_WRITER_H_ -#include "syntax_tree.h" - -#include "XmlTree.h" - #include #include #include @@ -32,15 +28,17 @@ #include #include -namespace Moses -{ -namespace PCFG -{ +#include "XmlTree.h" + +#include "syntax_tree.h" + +namespace MosesTraining { +namespace Syntax { +namespace PCFG { template -class XmlOutputHandler -{ -public: +class XmlOutputHandler { + public: typedef std::map AttributeMap; void GetLabel(const InputTree &, std::string &) const; @@ -48,19 +46,17 @@ public: }; template -class XmlTreeWriter : public XmlOutputHandler -{ -public: +class XmlTreeWriter : public XmlOutputHandler { + public: typedef XmlOutputHandler Base; void Write(const InputTree &, std::ostream &) const; -private: + private: std::string Escape(const std::string &) const; }; template void XmlTreeWriter::Write(const InputTree &tree, - std::ostream &out) const -{ + std::ostream &out) const { assert(!tree.IsLeaf()); // Opening tag @@ -104,8 +100,7 @@ void XmlTreeWriter::Write(const InputTree &tree, // Escapes XML special characters. 
template -std::string XmlTreeWriter::Escape(const std::string &s) const -{ +std::string XmlTreeWriter::Escape(const std::string &s) const { std::string t; std::size_t len = s.size(); t.reserve(len); @@ -134,6 +129,7 @@ std::string XmlTreeWriter::Escape(const std::string &s) const } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-extract/main.cc b/phrase-extract/pcfg-extract/main.cc index 47b45afc3..84051f2e2 100644 --- a/phrase-extract/pcfg-extract/main.cc +++ b/phrase-extract/pcfg-extract/main.cc @@ -20,6 +20,6 @@ #include "pcfg_extract.h" int main(int argc, char *argv[]) { - Moses::PCFG::PcfgExtract tool; + MosesTraining::Syntax::PCFG::PcfgExtract tool; return tool.Main(argc, argv); } diff --git a/phrase-extract/pcfg-extract/options.h b/phrase-extract/pcfg-extract/options.h index 2633f025a..ffaa3bb17 100644 --- a/phrase-extract/pcfg-extract/options.h +++ b/phrase-extract/pcfg-extract/options.h @@ -23,16 +23,16 @@ #include -namespace Moses -{ -namespace PCFG -{ +namespace MosesTraining { +namespace Syntax { +namespace PCFG { struct Options { std::string corpus_file; }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 71c2e31c3..a5e06aa82 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -23,7 +23,8 @@ #include "rule_collection.h" #include "rule_extractor.h" -#include "pcfg-common/exception.h" +#include "syntax-common/exception.h" + #include "pcfg-common/pcfg.h" #include "pcfg-common/pcfg_tree.h" #include "pcfg-common/syntax_tree.h" @@ -42,7 +43,8 @@ #include #include -namespace Moses { +namespace MosesTraining { +namespace Syntax { namespace PCFG { int PcfgExtract::Main(int argc, char *argv[]) { @@ -128,4 +130,5 @@ void PcfgExtract::ProcessOptions(int argc, 
char *argv[], } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-extract/pcfg_extract.h b/phrase-extract/pcfg-extract/pcfg_extract.h index e8c306876..835564341 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.h +++ b/phrase-extract/pcfg-extract/pcfg_extract.h @@ -23,15 +23,13 @@ #include "pcfg-common/tool.h" -namespace Moses -{ -namespace PCFG -{ +namespace MosesTraining { +namespace Syntax { +namespace PCFG { -class Options; +struct Options; -class PcfgExtract : public Tool -{ +class PcfgExtract : public Tool { public: PcfgExtract() : Tool("pcfg-extract") {} virtual int Main(int, char *[]); @@ -40,6 +38,7 @@ private: }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-extract/rule_collection.cc b/phrase-extract/pcfg-extract/rule_collection.cc index 32b63e0ef..21e84d2fa 100644 --- a/phrase-extract/pcfg-extract/rule_collection.cc +++ b/phrase-extract/pcfg-extract/rule_collection.cc @@ -23,7 +23,8 @@ #include -namespace Moses { +namespace MosesTraining { +namespace Syntax { namespace PCFG { void RuleCollection::Add(std::size_t lhs, const std::vector &rhs) { @@ -55,4 +56,5 @@ void RuleCollection::CreatePcfg(Pcfg &pcfg) { } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-extract/rule_collection.h b/phrase-extract/pcfg-extract/rule_collection.h index 32cb2dc05..66fa98657 100644 --- a/phrase-extract/pcfg-extract/rule_collection.h +++ b/phrase-extract/pcfg-extract/rule_collection.h @@ -21,21 +21,19 @@ #ifndef PCFG_EXTRACT_RULE_COLLECTION_H_ #define PCFG_EXTRACT_RULE_COLLECTION_H_ -#include "pcfg-common/pcfg.h" +#include #include -#include +#include "pcfg-common/pcfg.h" -namespace Moses -{ -namespace PCFG -{ +namespace MosesTraining { +namespace Syntax { +namespace PCFG { // Contains PCFG rules and their counts. 
-class RuleCollection -{ -public: +class RuleCollection { + public: typedef boost::unordered_map, std::size_t> RhsCountMap; typedef boost::unordered_map Map; typedef Map::iterator iterator; @@ -43,28 +41,21 @@ public: RuleCollection() {} - iterator begin() { - return collection_.begin(); - } - const_iterator begin() const { - return collection_.begin(); - } + iterator begin() { return collection_.begin(); } + const_iterator begin() const { return collection_.begin(); } - iterator end() { - return collection_.end(); - } - const_iterator end() const { - return collection_.end(); - } + iterator end() { return collection_.end(); } + const_iterator end() const { return collection_.end(); } void Add(std::size_t, const std::vector &); void CreatePcfg(Pcfg &); -private: + private: Map collection_; }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc index 217574e7d..bb4698fae 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.cc +++ b/phrase-extract/pcfg-extract/rule_extractor.cc @@ -21,7 +21,8 @@ #include "pcfg-common/pcfg_tree.h" -namespace Moses { +namespace MosesTraining { +namespace Syntax { namespace PCFG { RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab) @@ -48,4 +49,5 @@ void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const { } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h index e4b411c01..1dddd796f 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.h +++ b/phrase-extract/pcfg-extract/rule_extractor.h @@ -21,28 +21,27 @@ #ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_ #define PCFG_EXTRACT_RULE_EXTRACTOR_H_ -#include "rule_collection.h" - #include "pcfg-common/typedef.h" -namespace Moses -{ -namespace PCFG -{ +#include
"rule_collection.h" + +namespace MosesTraining { +namespace Syntax { +namespace PCFG { class PcfgTree; // Extracts PCFG rules from syntax trees and adds them to a RuleCollection. -class RuleExtractor -{ -public: +class RuleExtractor { + public: RuleExtractor(Vocabulary &); void Extract(const PcfgTree &, RuleCollection &) const; -private: + private: Vocabulary &non_term_vocab_; }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-score/main.cc b/phrase-extract/pcfg-score/main.cc index da5392add..5ce19f797 100644 --- a/phrase-extract/pcfg-score/main.cc +++ b/phrase-extract/pcfg-score/main.cc @@ -20,6 +20,6 @@ #include "pcfg_score.h" int main(int argc, char *argv[]) { - Moses::PCFG::PcfgScore tool; + MosesTraining::Syntax::PCFG::PcfgScore tool; return tool.Main(argc, argv); } diff --git a/phrase-extract/pcfg-score/options.h b/phrase-extract/pcfg-score/options.h index fd54b4b6b..bbd56d6d0 100644 --- a/phrase-extract/pcfg-score/options.h +++ b/phrase-extract/pcfg-score/options.h @@ -23,16 +23,16 @@ #include -namespace Moses -{ -namespace PCFG -{ +namespace MosesTraining { +namespace Syntax { +namespace PCFG { struct Options { std::string pcfg_file; }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index 345d7fc60..a561c18ed 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -19,18 +19,6 @@ #include "pcfg_score.h" -#include "options.h" -#include "tree_scorer.h" - -#include "pcfg-common/exception.h" -#include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" -#include "pcfg-common/syntax_tree.h" -#include "pcfg-common/typedef.h" -#include "pcfg-common/xml_tree_parser.h" - -#include - #include #include #include @@ -40,8 +28,21 @@ #include #include #include +#include "options.h" 
+#include "tree_scorer.h" -namespace Moses { +#include + +#include "syntax-common/exception.h" + +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/syntax_tree.h" +#include "pcfg-common/typedef.h" +#include "pcfg-common/xml_tree_parser.h" + +namespace MosesTraining { +namespace Syntax { namespace PCFG { int PcfgScore::Main(int argc, char *argv[]) { @@ -149,4 +150,5 @@ void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const { } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-score/pcfg_score.h b/phrase-extract/pcfg-score/pcfg_score.h index f49c9a0be..fb9971c35 100644 --- a/phrase-extract/pcfg-score/pcfg_score.h +++ b/phrase-extract/pcfg-score/pcfg_score.h @@ -23,23 +23,22 @@ #include "pcfg-common/tool.h" -namespace Moses -{ -namespace PCFG -{ +namespace MosesTraining { +namespace Syntax { +namespace PCFG { -class Options; +struct Options; -class PcfgScore : public Tool -{ -public: +class PcfgScore : public Tool { + public: PcfgScore() : Tool("pcfg-score") {} virtual int Main(int, char *[]); -private: + private: void ProcessOptions(int, char *[], Options &) const; }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc index f9ce97ae0..53b6aaccf 100644 --- a/phrase-extract/pcfg-score/tree_scorer.cc +++ b/phrase-extract/pcfg-score/tree_scorer.cc @@ -21,7 +21,8 @@ #include -namespace Moses { +namespace MosesTraining { +namespace Syntax { namespace PCFG { TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab) @@ -65,4 +66,5 @@ bool TreeScorer::Score(PcfgTree &root) const { } } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h index 
8cb59c0c2..3cf4693a6 100644 --- a/phrase-extract/pcfg-score/tree_scorer.h +++ b/phrase-extract/pcfg-score/tree_scorer.h @@ -25,26 +25,25 @@ #include "pcfg-common/pcfg_tree.h" #include "pcfg-common/typedef.h" -namespace Moses -{ -namespace PCFG -{ +namespace MosesTraining { +namespace Syntax { +namespace PCFG { -class TreeScorer -{ -public: +class TreeScorer { + public: TreeScorer(const Pcfg &, const Vocabulary &); // Score tree according to PCFG. Returns false if unsuccessful (due to // missing rule). bool Score(PcfgTree &) const; -private: + private: const Pcfg &pcfg_; const Vocabulary &non_term_vocab_; }; } // namespace PCFG -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining #endif diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index b415c4d0e..a6d50cef5 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -77,7 +77,7 @@ void init(int argc, char* argv[]) if (argc < 2) { cerr << "syntax: relax-parse < in-parse > out-parse [" - << " --LeftBinarize | ---RightBinarize |" + << " --LeftBinarize | --RightBinarize |" << " --SAMT 1-4 ]" << endl; exit(1); } diff --git a/phrase-extract/score-stsg/Exception.h b/phrase-extract/score-stsg/Exception.h deleted file mode 100644 index e8c56c0d3..000000000 --- a/phrase-extract/score-stsg/Exception.h +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once - -#include - -namespace Moses -{ -namespace ScoreStsg -{ - -class Exception -{ -public: - Exception(const char *msg) : m_msg(msg) {} - Exception(const std::string &msg) : m_msg(msg) {} - const std::string &GetMsg() const { - return m_msg; - } -private: - std::string m_msg; -}; - -} // namespace ScoreStsg -} // namespace Moses diff --git a/phrase-extract/score-stsg/Jamfile b/phrase-extract/score-stsg/Jamfile index aa37292fa..6ae17b565 100644 --- a/phrase-extract/score-stsg/Jamfile +++ b/phrase-extract/score-stsg/Jamfile @@ -1 +1 @@ -exe score-stsg : [ glob *.cpp ] ..//deps 
../..//boost_iostreams ../..//boost_program_options ../..//z : .. ; +exe score-stsg : [ glob *.cpp ] ..//syntax-common ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : .. ; diff --git a/phrase-extract/score-stsg/LexicalTable.cpp b/phrase-extract/score-stsg/LexicalTable.cpp index 797a6a903..d5d7ce6ab 100644 --- a/phrase-extract/score-stsg/LexicalTable.cpp +++ b/phrase-extract/score-stsg/LexicalTable.cpp @@ -5,7 +5,9 @@ #include #include -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -50,4 +52,5 @@ void LexicalTable::Load(std::istream &input) } } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/LexicalTable.h b/phrase-extract/score-stsg/LexicalTable.h index b2ccf6984..54bae1dec 100644 --- a/phrase-extract/score-stsg/LexicalTable.h +++ b/phrase-extract/score-stsg/LexicalTable.h @@ -8,7 +8,9 @@ #include "Vocabulary.h" -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -40,4 +42,5 @@ private: }; } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/Main.cpp b/phrase-extract/score-stsg/Main.cpp index 3809bd503..4a8f7a57f 100644 --- a/phrase-extract/score-stsg/Main.cpp +++ b/phrase-extract/score-stsg/Main.cpp @@ -2,6 +2,6 @@ int main(int argc, char *argv[]) { - Moses::ScoreStsg::ScoreStsg tool; + MosesTraining::Syntax::ScoreStsg::ScoreStsg tool; return tool.Main(argc, argv); } diff --git a/phrase-extract/score-stsg/Options.h b/phrase-extract/score-stsg/Options.h index 5b3664052..17b959c84 100644 --- a/phrase-extract/score-stsg/Options.h +++ b/phrase-extract/score-stsg/Options.h @@ -2,7 +2,9 @@ #include -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -38,4 +40,5 @@ public: }; } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // 
namespace MosesTraining diff --git a/phrase-extract/score-stsg/RuleGroup.cpp b/phrase-extract/score-stsg/RuleGroup.cpp index 8c4f7b9fd..bbbe3b2b6 100644 --- a/phrase-extract/score-stsg/RuleGroup.cpp +++ b/phrase-extract/score-stsg/RuleGroup.cpp @@ -1,6 +1,8 @@ #include "RuleGroup.h" -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -39,4 +41,5 @@ void RuleGroup::AddRule(const StringPiece &target, const StringPiece &ntAlign, } } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/RuleGroup.h b/phrase-extract/score-stsg/RuleGroup.h index a2889f2bf..de0c25f17 100644 --- a/phrase-extract/score-stsg/RuleGroup.h +++ b/phrase-extract/score-stsg/RuleGroup.h @@ -6,7 +6,9 @@ #include "util/string_piece.hh" -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -65,4 +67,5 @@ private: }; } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/RuleSymbol.h b/phrase-extract/score-stsg/RuleSymbol.h index 4c9ae0083..efefe6266 100644 --- a/phrase-extract/score-stsg/RuleSymbol.h +++ b/phrase-extract/score-stsg/RuleSymbol.h @@ -2,7 +2,9 @@ #include "util/string_piece.hh" -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -14,4 +16,5 @@ struct RuleSymbol }; } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/RuleTableWriter.cpp b/phrase-extract/score-stsg/RuleTableWriter.cpp index 62f3c2702..0a1d5aa08 100644 --- a/phrase-extract/score-stsg/RuleTableWriter.cpp +++ b/phrase-extract/score-stsg/RuleTableWriter.cpp @@ -12,14 +12,15 @@ #include "util/string_piece.hh" #include "util/tokenize_piece.hh" -#include "Exception.h" #include "InputFileStream.h" #include "LexicalTable.h" #include "OutputFileStream.h" #include "Options.h" 
#include "RuleGroup.h" -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -75,4 +76,5 @@ void RuleTableWriter::WriteRuleHalf(const TokenizedRuleHalf &half) } } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/RuleTableWriter.h b/phrase-extract/score-stsg/RuleTableWriter.h index 68403dfa7..db8924de3 100644 --- a/phrase-extract/score-stsg/RuleTableWriter.h +++ b/phrase-extract/score-stsg/RuleTableWriter.h @@ -8,7 +8,9 @@ #include "Options.h" #include "TokenizedRuleHalf.h" -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -16,7 +18,7 @@ namespace ScoreStsg class RuleTableWriter { public: - RuleTableWriter(const Options &options, OutputFileStream &out) + RuleTableWriter(const Options &options, Moses::OutputFileStream &out) : m_options(options) , m_out(out) {} @@ -34,8 +36,9 @@ private: void WriteRuleHalf(const TokenizedRuleHalf &); const Options &m_options; - OutputFileStream &m_out; + Moses::OutputFileStream &m_out; }; } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/ScoreStsg.cpp b/phrase-extract/score-stsg/ScoreStsg.cpp index e3c2f088e..04e3b5a44 100644 --- a/phrase-extract/score-stsg/ScoreStsg.cpp +++ b/phrase-extract/score-stsg/ScoreStsg.cpp @@ -15,15 +15,19 @@ #include "util/string_piece_hash.hh" #include "util/tokenize_piece.hh" -#include "Exception.h" #include "InputFileStream.h" -#include "LexicalTable.h" #include "OutputFileStream.h" + +#include "syntax-common/exception.h" + +#include "LexicalTable.h" #include "Options.h" #include "RuleGroup.h" #include "RuleTableWriter.h" -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -44,12 +48,12 @@ int ScoreStsg::Main(int argc, char *argv[]) ProcessOptions(argc, argv, m_options); // Open input files. 
- InputFileStream extractStream(m_options.extractFile); - InputFileStream lexStream(m_options.lexFile); + Moses::InputFileStream extractStream(m_options.extractFile); + Moses::InputFileStream lexStream(m_options.lexFile); // Open output files. - OutputFileStream outStream; - OutputFileStream countOfCountsStream; + Moses::OutputFileStream outStream; + Moses::OutputFileStream countOfCountsStream; OpenOutputFileOrDie(m_options.tableFile, outStream); if (m_options.goodTuring || m_options.kneserNey) { OpenOutputFileOrDie(m_options.tableFile+".coc", countOfCountsStream); @@ -161,7 +165,7 @@ void ScoreStsg::ProcessRuleGroupOrDie(const RuleGroup &group, } catch (const Exception &e) { std::ostringstream msg; msg << "failed to process rule group at lines " << start << "-" << end - << ": " << e.GetMsg(); + << ": " << e.msg(); Error(msg.str()); } catch (const std::exception &e) { std::ostringstream msg; @@ -228,7 +232,7 @@ void ScoreStsg::ProcessRuleGroup(const RuleGroup &group, } void ScoreStsg::ParseAlignmentString(const std::string &s, int numTgtWords, - MosesTraining::ALIGNMENT &tgtToSrc) + ALIGNMENT &tgtToSrc) { tgtToSrc.clear(); tgtToSrc.resize(numTgtWords); @@ -262,7 +266,7 @@ void ScoreStsg::ParseAlignmentString(const std::string &s, int numTgtWords, double ScoreStsg::ComputeLexProb(const std::vector &sourceFrontier, const std::vector &targetFrontier, - const MosesTraining::ALIGNMENT &tgtToSrc) + const ALIGNMENT &tgtToSrc) { double lexScore = 1.0; for (std::size_t i = 0; i < targetFrontier.size(); ++i) { @@ -293,7 +297,7 @@ double ScoreStsg::ComputeLexProb(const std::vector &sourceFrontier, } void ScoreStsg::OpenOutputFileOrDie(const std::string &filename, - OutputFileStream &stream) + Moses::OutputFileStream &stream) { bool ret = stream.Open(filename); if (!ret) { @@ -437,4 +441,5 @@ void ScoreStsg::Error(const std::string &msg) const } } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git 
a/phrase-extract/score-stsg/ScoreStsg.h b/phrase-extract/score-stsg/ScoreStsg.h index 2382b38c0..628c0080e 100644 --- a/phrase-extract/score-stsg/ScoreStsg.h +++ b/phrase-extract/score-stsg/ScoreStsg.h @@ -7,6 +7,7 @@ #include #include "ExtractionPhrasePair.h" +#include "OutputFileStream.h" #include "LexicalTable.h" #include "Options.h" @@ -14,11 +15,10 @@ #include "TokenizedRuleHalf.h" #include "Vocabulary.h" -namespace Moses +namespace MosesTraining +{ +namespace Syntax { - -class OutputFileStream; - namespace ScoreStsg { @@ -41,14 +41,14 @@ private: double ComputeLexProb(const std::vector &, const std::vector &, - const MosesTraining::ALIGNMENT &); + const ALIGNMENT &); void Error(const std::string &) const; - void OpenOutputFileOrDie(const std::string &, OutputFileStream &); + void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &); void ParseAlignmentString(const std::string &, int, - MosesTraining::ALIGNMENT &); + ALIGNMENT &); void ProcessOptions(int, char *[], Options &) const; @@ -68,8 +68,9 @@ private: int m_totalDistinct; TokenizedRuleHalf m_sourceHalf; TokenizedRuleHalf m_targetHalf; - MosesTraining::ALIGNMENT m_tgtToSrc; + ALIGNMENT m_tgtToSrc; }; } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/TokenizedRuleHalf.cpp b/phrase-extract/score-stsg/TokenizedRuleHalf.cpp index 5f115ae9e..6ccc2a311 100644 --- a/phrase-extract/score-stsg/TokenizedRuleHalf.cpp +++ b/phrase-extract/score-stsg/TokenizedRuleHalf.cpp @@ -1,6 +1,8 @@ #include "TokenizedRuleHalf.h" -namespace Moses +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -34,4 +36,5 @@ bool TokenizedRuleHalf::IsTree() const } } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/TokenizedRuleHalf.h b/phrase-extract/score-stsg/TokenizedRuleHalf.h index b21ea2d37..2fbb80f38 100644 --- 
a/phrase-extract/score-stsg/TokenizedRuleHalf.h +++ b/phrase-extract/score-stsg/TokenizedRuleHalf.h @@ -3,10 +3,13 @@ #include #include -#include "RuleSymbol.h" -#include "TreeFragmentTokenizer.h" +#include "syntax-common/tree_fragment_tokenizer.h" -namespace Moses +#include "RuleSymbol.h" + +namespace MosesTraining +{ +namespace Syntax { namespace ScoreStsg { @@ -42,4 +45,5 @@ struct TokenizedRuleHalf }; } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/Vocabulary.h b/phrase-extract/score-stsg/Vocabulary.h index 8dc773fe2..db31c73f5 100644 --- a/phrase-extract/score-stsg/Vocabulary.h +++ b/phrase-extract/score-stsg/Vocabulary.h @@ -2,12 +2,14 @@ #include -#include "NumberedSet.h" +#include "syntax-common/numbered_set.h" -namespace Moses { +namespace MosesTraining { +namespace Syntax { namespace ScoreStsg { typedef NumberedSet Vocabulary; } // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/Jamfile b/phrase-extract/syntax-common/Jamfile new file mode 100644 index 000000000..c76ab50a5 --- /dev/null +++ b/phrase-extract/syntax-common/Jamfile @@ -0,0 +1,8 @@ +lib syntax_common : [ glob *.cc : *_test.cc ] ..//deps : .. 
; + +import testing ; + +for local t in [ glob *_test.cc ] { + local name = [ MATCH "(.*)\.cc" : $(t) ] ; + unit-test $(name) : $(t) syntax_common /top//boost_unit_test_framework /top//boost_system ; +} diff --git a/phrase-extract/syntax-common/exception.h b/phrase-extract/syntax-common/exception.h new file mode 100644 index 000000000..18d529fc3 --- /dev/null +++ b/phrase-extract/syntax-common/exception.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace MosesTraining { +namespace Syntax { + +class Exception { + public: + Exception(const char *msg) : msg_(msg) {} + Exception(const std::string &msg) : msg_(msg) {} + + const std::string &msg() const { return msg_; } + + private: + std::string msg_; +}; + +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/NumberedSet.h b/phrase-extract/syntax-common/numbered_set.h similarity index 96% rename from phrase-extract/score-stsg/NumberedSet.h rename to phrase-extract/syntax-common/numbered_set.h index 56f7c05c2..60933fe96 100644 --- a/phrase-extract/score-stsg/NumberedSet.h +++ b/phrase-extract/syntax-common/numbered_set.h @@ -6,10 +6,10 @@ #include -#include "Exception.h" +#include "exception.h" -namespace Moses { -namespace ScoreStsg { +namespace MosesTraining { +namespace Syntax { // Stores a set of elements of type T, each of which is allocated an integral // ID of type I. IDs are contiguous starting at 0. 
Individual elements cannot @@ -106,5 +106,5 @@ void NumberedSet::Clear() { id_to_element_.clear(); } -} // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/string_tree.h b/phrase-extract/syntax-common/string_tree.h new file mode 100644 index 000000000..c1676e72c --- /dev/null +++ b/phrase-extract/syntax-common/string_tree.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +#include "tree.h" + +namespace MosesTraining { +namespace Syntax { + +typedef Tree StringTree; + +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/tree-inl.h b/phrase-extract/syntax-common/tree-inl.h new file mode 100644 index 000000000..2ba55df1a --- /dev/null +++ b/phrase-extract/syntax-common/tree-inl.h @@ -0,0 +1,115 @@ +#pragma once + +#include +#include + +namespace MosesTraining { +namespace Syntax { + +template +Tree::~Tree() { + for (typename std::vector::iterator p = children_.begin(); + p != children_.end(); ++p) { + delete *p; + } +} + +template +void Tree::SetParents() { + for (typename std::vector::iterator p = children_.begin(); + p != children_.end(); ++p) { + (*p)->parent() = this; + (*p)->SetParents(); + } +} + +template +std::size_t Tree::Depth() const { + std::size_t depth = 0; + Tree *ancestor = parent_; + while (ancestor != 0) { + ++depth; + ancestor = ancestor->parent_; + } + return depth; +} + +template +class Tree::PreOrderIterator { + public: + PreOrderIterator(); + PreOrderIterator(Tree &); + + Tree &operator*() { return *node_; } + Tree *operator->() { return node_; } + + PreOrderIterator &operator++(); + PreOrderIterator operator++(int); + + bool operator==(const Tree::PreOrderIterator &); + bool operator!=(const Tree::PreOrderIterator &); + + private: + // Pointer to the current node. + Tree *node_; + + // Stack of indices defining the position of node_ within the child vectors + // of its ancestors. 
+ std::stack index_stack_; +}; + +template +Tree::PreOrderIterator::PreOrderIterator() + : node_(0) { +} + +template +Tree::PreOrderIterator::PreOrderIterator(Tree &t) + : node_(&t) { +} + +template +typename Tree::PreOrderIterator &Tree::PreOrderIterator::operator++() { + // If the current node has children then visit the left-most child next. + if (!node_->children().empty()) { + index_stack_.push(0); + node_ = node_->children()[0]; + return *this; + } + // Otherwise, try node's ancestors until either a node is found with a + // sibling to the right or we reach the root (in which case the traversal + // is complete). + Tree *ancestor = node_->parent_; + while (ancestor) { + std::size_t index = index_stack_.top(); + index_stack_.pop(); + if (index+1 < ancestor->children_.size()) { + index_stack_.push(index+1); + node_ = ancestor->children()[index+1]; + return *this; + } + ancestor = ancestor->parent_; + } + node_ = 0; + return *this; +} + +template +typename Tree::PreOrderIterator Tree::PreOrderIterator::operator++(int) { + PreOrderIterator tmp(*this); + ++*this; + return tmp; +} + +template +bool Tree::PreOrderIterator::operator==(const PreOrderIterator &rhs) { + return node_ == rhs.node_; +} + +template +bool Tree::PreOrderIterator::operator!=(const PreOrderIterator &rhs) { + return node_ != rhs.node_; +} + +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/tree.h b/phrase-extract/syntax-common/tree.h new file mode 100644 index 000000000..52adaa699 --- /dev/null +++ b/phrase-extract/syntax-common/tree.h @@ -0,0 +1,91 @@ +#pragma once + +#include + +namespace MosesTraining { +namespace Syntax { + +// A basic k-ary tree with node values of type T. Each node has a vector of +// pointers to its children and a pointer to its parent (or 0 for the root). +// +// See the unit tests in tree_test.cc for examples of usage. +// +// Note: a Tree owns its children: it will delete them on destruction. 
+// +// Note: it's the user's responsibility to ensure that parent and child pointers +// are correctly set and maintained. A convenient(-ish) way of building a +// properly-connected tree is to add all the nodes as children of their +// respective parents (using the children() accessor) and then call +// SetParents() on the root at the end. +// +template +class Tree { + public: + // Constructors + Tree() + : value_() + , children_() + , parent_(0) {} + + Tree(const T &value) + : value_(value) + , children_() + , parent_(0) {} + + // Destructor (deletes children) + ~Tree(); + + // Access tree's value. + const T &value() const { return value_; } + T &value() { return value_; } + + // Access tree's parent. + const Tree *parent() const { return parent_; } + Tree *&parent() { return parent_; } + + // Access tree's children. + const std::vector &children() const { return children_; } + std::vector &children() { return children_; } + + // Set the parent values for this subtree (excluding this node). + void SetParents(); + + // Leaf predicate. + bool IsLeaf() const { return children_.empty(); } + + // Calculate the depth of this node within the tree (where the root has a + // depth of 0, root's children have a depth 1, etc). + std::size_t Depth() const; + + // Iterators + // + // All iterators are forward iterators. Example use: + // + // Tree &root = GetMeATree(); + // for (Tree::PreOrderIterator p(root); + // p != Tree::PreOrderIterator(); ++p) { + // std::cout << p->value() << " "; + // } + + // Pre-order iterators. + class PreOrderIterator; + // class ConstPreOrderIterator; TODO + + // Post-order iterators. + // class PostOrderIterator; TODO + // class ConstPostOrderIterator; TODO + + // Leaf iterators (left-to-right). 
+ // class LeafIterator; TODO + // class ConstLeafIterator; TODO + + private: + T value_; + std::vector children_; + Tree *parent_; +}; + +} // namespace Syntax +} // namespace MosesTraining + +#include "tree-inl.h" diff --git a/phrase-extract/score-stsg/TreeFragmentTokenizer.cpp b/phrase-extract/syntax-common/tree_fragment_tokenizer.cc similarity index 93% rename from phrase-extract/score-stsg/TreeFragmentTokenizer.cpp rename to phrase-extract/syntax-common/tree_fragment_tokenizer.cc index cafc39432..ab3db3a84 100644 --- a/phrase-extract/score-stsg/TreeFragmentTokenizer.cpp +++ b/phrase-extract/syntax-common/tree_fragment_tokenizer.cc @@ -1,10 +1,10 @@ -#include "TreeFragmentTokenizer.h" +#include "tree_fragment_tokenizer.h" #include -namespace Moses +namespace MosesTraining { -namespace ScoreStsg +namespace Syntax { TreeFragmentToken::TreeFragmentToken(TreeFragmentTokenType t, @@ -86,5 +86,5 @@ bool operator!=(const TreeFragmentTokenizer &lhs, return !(lhs == rhs); } -} // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/score-stsg/TreeFragmentTokenizer.h b/phrase-extract/syntax-common/tree_fragment_tokenizer.h similarity index 85% rename from phrase-extract/score-stsg/TreeFragmentTokenizer.h rename to phrase-extract/syntax-common/tree_fragment_tokenizer.h index 5360eb5b9..ca8741a52 100644 --- a/phrase-extract/score-stsg/TreeFragmentTokenizer.h +++ b/phrase-extract/syntax-common/tree_fragment_tokenizer.h @@ -2,10 +2,8 @@ #include "util/string_piece.hh" -namespace Moses -{ -namespace ScoreStsg -{ +namespace MosesTraining { +namespace Syntax { enum TreeFragmentTokenType { TreeFragmentToken_EOS, @@ -24,10 +22,11 @@ struct TreeFragmentToken { // Tokenizes tree fragment strings in Moses format. 
// -// For example, the string "[NP [NP [NN a]] [NP]]" is tokenized to the sequence: +// For example, the string "[S [NP [NN weasels]] [VP]]" is tokenized to the +// sequence: // // 1 LSB "[" -// 2 WORD "NP" +// 2 WORD "S" // 3 LSB "[" // 4 WORD "NP" // 5 LSB "[" @@ -36,7 +35,7 @@ struct TreeFragmentToken { // 8 RSB "]" // 9 RSB "]" // 10 LSB "[" -// 11 WORD "NP" +// 11 WORD "VP" // 12 RSB "]" // 13 RSB "]" // 14 EOS undefined @@ -66,5 +65,5 @@ class TreeFragmentTokenizer { std::size_t pos_; }; -} // namespace ScoreStsg -} // namespace Moses +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/tree_fragment_tokenizer_test.cc b/phrase-extract/syntax-common/tree_fragment_tokenizer_test.cc new file mode 100644 index 000000000..cd09c6911 --- /dev/null +++ b/phrase-extract/syntax-common/tree_fragment_tokenizer_test.cc @@ -0,0 +1,74 @@ +#include "tree_fragment_tokenizer.h" + +#define BOOST_TEST_MODULE TreeTest +#include + +#include + +namespace MosesTraining { +namespace Syntax { +namespace { + +BOOST_AUTO_TEST_CASE(tokenize_empty) { + const std::string fragment = ""; + std::vector tokens; + for (TreeFragmentTokenizer p(fragment); p != TreeFragmentTokenizer(); ++p) { + tokens.push_back(*p); + } + BOOST_REQUIRE(tokens.empty()); +} + +BOOST_AUTO_TEST_CASE(tokenize_space) { + const std::string fragment = " [ weasel weasel ] [] ] wea[sel"; + std::vector tokens; + for (TreeFragmentTokenizer p(fragment); p != TreeFragmentTokenizer(); ++p) { + tokens.push_back(*p); + } + BOOST_REQUIRE(tokens.size() == 10); + BOOST_REQUIRE(tokens[0].type == TreeFragmentToken_LSB); + BOOST_REQUIRE(tokens[0].value == "["); + BOOST_REQUIRE(tokens[1].type == TreeFragmentToken_WORD); + BOOST_REQUIRE(tokens[1].value == "weasel"); + BOOST_REQUIRE(tokens[2].type == TreeFragmentToken_WORD); + BOOST_REQUIRE(tokens[2].value == "weasel"); + BOOST_REQUIRE(tokens[3].type == TreeFragmentToken_RSB); + BOOST_REQUIRE(tokens[3].value == "]"); + BOOST_REQUIRE(tokens[4].type 
== TreeFragmentToken_LSB); + BOOST_REQUIRE(tokens[4].value == "["); + BOOST_REQUIRE(tokens[5].type == TreeFragmentToken_RSB); + BOOST_REQUIRE(tokens[5].value == "]"); + BOOST_REQUIRE(tokens[6].type == TreeFragmentToken_RSB); + BOOST_REQUIRE(tokens[6].value == "]"); + BOOST_REQUIRE(tokens[7].type == TreeFragmentToken_WORD); + BOOST_REQUIRE(tokens[7].value == "wea"); + BOOST_REQUIRE(tokens[8].type == TreeFragmentToken_LSB); + BOOST_REQUIRE(tokens[8].value == "["); + BOOST_REQUIRE(tokens[9].type == TreeFragmentToken_WORD); + BOOST_REQUIRE(tokens[9].value == "sel"); +} + +BOOST_AUTO_TEST_CASE(tokenize_fragment) { + const std::string fragment = "[S [NP [NN weasels]] [VP]]"; + std::vector tokens; + for (TreeFragmentTokenizer p(fragment); p != TreeFragmentTokenizer(); ++p) { + tokens.push_back(*p); + } + BOOST_REQUIRE(tokens.size() == 13); + BOOST_REQUIRE(tokens[0].type == TreeFragmentToken_LSB); + BOOST_REQUIRE(tokens[1].type == TreeFragmentToken_WORD); + BOOST_REQUIRE(tokens[2].type == TreeFragmentToken_LSB); + BOOST_REQUIRE(tokens[3].type == TreeFragmentToken_WORD); + BOOST_REQUIRE(tokens[4].type == TreeFragmentToken_LSB); + BOOST_REQUIRE(tokens[5].type == TreeFragmentToken_WORD); + BOOST_REQUIRE(tokens[6].type == TreeFragmentToken_WORD); + BOOST_REQUIRE(tokens[7].type == TreeFragmentToken_RSB); + BOOST_REQUIRE(tokens[8].type == TreeFragmentToken_RSB); + BOOST_REQUIRE(tokens[9].type == TreeFragmentToken_LSB); + BOOST_REQUIRE(tokens[10].type == TreeFragmentToken_WORD); + BOOST_REQUIRE(tokens[11].type == TreeFragmentToken_RSB); + BOOST_REQUIRE(tokens[12].type == TreeFragmentToken_RSB); +} + +} // namespace +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/tree_test.cc b/phrase-extract/syntax-common/tree_test.cc new file mode 100644 index 000000000..0a54ad3f1 --- /dev/null +++ b/phrase-extract/syntax-common/tree_test.cc @@ -0,0 +1,66 @@ +#include "tree.h" + +#define BOOST_TEST_MODULE TreeTest +#include + +#include + +namespace 
MosesTraining { +namespace Syntax { +namespace { + +// Test Tree<>::PreOrderIterator with a trivial, single-node tree. +BOOST_AUTO_TEST_CASE(pre_order_1) { + boost::scoped_ptr > root(new Tree(123)); + Tree::PreOrderIterator p(*root); + BOOST_REQUIRE(p != Tree::PreOrderIterator()); + BOOST_REQUIRE(p->value() == 123); + ++p; + BOOST_REQUIRE(p == Tree::PreOrderIterator()); +} + +// Test Tree<>::PreOrderIterator on this tree: (1 (2 3) (4) (5 6 (7 8))) +BOOST_AUTO_TEST_CASE(pre_order_2) { + boost::scoped_ptr > root(new Tree(1)); + root->children().push_back(new Tree(2)); + root->children()[0]->children().push_back(new Tree(3)); + root->children().push_back(new Tree(4)); + root->children().push_back(new Tree(5)); + root->children()[2]->children().push_back(new Tree(6)); + root->children()[2]->children().push_back(new Tree(7)); + root->children()[2]->children()[1]->children().push_back(new Tree(8)); + root->SetParents(); + + Tree::PreOrderIterator p(*root); + Tree::PreOrderIterator end; + + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 1); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 2); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 3); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 4); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 5); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 6); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 7); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 8); + ++p; + BOOST_REQUIRE(p == end); +} + +} // namespace +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc new file mode 100644 index 000000000..c4363a3e2 --- /dev/null +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -0,0 +1,59 @@ +#include "xml_tree_parser.h" + +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + 
+#include +#include + +namespace MosesTraining { +namespace Syntax { + +StringTree *XmlTreeParser::Parse(const std::string &line) { + line_ = line; + tree_.Clear(); + try { + if (!ProcessAndStripXMLTags(line_, tree_, label_set_, top_label_set_, + false)) { + throw Exception(""); + } + } catch (const XmlException &e) { + throw Exception(e.getMsg()); + } + tree_.ConnectNodes(); + SyntaxNode *root = tree_.GetTop(); + assert(root); + words_ = tokenize(line_.c_str()); + return ConvertTree(*root, words_); +} + +// Converts a SyntaxNode tree to a StringTree. +StringTree *XmlTreeParser::ConvertTree(const SyntaxNode &tree, + const std::vector &words) { + StringTree *root = new StringTree(tree.GetLabel()); + const std::vector &children = tree.GetChildren(); + if (children.empty()) { + if (tree.GetStart() != tree.GetEnd()) { + std::ostringstream msg; + msg << "leaf node covers multiple words (" << tree.GetStart() + << "-" << tree.GetEnd() << "): this is currently unsupported"; + throw Exception(msg.str()); + } + StringTree *leaf = new StringTree(words[tree.GetStart()]); + leaf->parent() = root; + root->children().push_back(leaf); + } else { + for (std::vector::const_iterator p = children.begin(); + p != children.end(); ++p) { + assert(*p); + StringTree *child = ConvertTree(**p, words); + child->parent() = root; + root->children().push_back(child); + } + } + return root; +} + +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h new file mode 100644 index 000000000..a5563f63a --- /dev/null +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include +#include +#include + +#include "SyntaxTree.h" + +#include "exception.h" +#include "string_tree.h" + +namespace MosesTraining { +namespace Syntax { + +// Parses a string in Moses' XML parse tree format and returns a StringTree +// object. 
This is a wrapper around the ProcessAndStripXMLTags function. +class XmlTreeParser { + public: + StringTree *Parse(const std::string &); + + private: + static StringTree *ConvertTree(const MosesTraining::SyntaxNode &, + const std::vector &); + + std::set label_set_; + std::map top_label_set_; + std::string line_; + MosesTraining::SyntaxTree tree_; + std::vector words_; +}; + +} // namespace Syntax +} // namespace MosesTraining diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl index e653440dc..3e77edd53 100755 --- a/scripts/other/beautify.perl +++ b/scripts/other/beautify.perl @@ -40,6 +40,8 @@ sub Beautify($) next if ($name eq "srilm"); next if ($name eq "irstlm"); next if ($name eq "UG"); + next if ($name eq "pcfg-common"); + next if ($name eq "syntax-common"); $name = $path ."/" .$name; if (-d $name) {