From 6bea23357c1d5a9a50382330d14f4c734f94ac98 Mon Sep 17 00:00:00 2001
From: Phil Williams <philip.williams@mac.com>
Date: Wed, 3 Jun 2015 09:28:38 +0100
Subject: [PATCH 1/6] Ongoing moses/phrase-extract refactoring

---
 phrase-extract/pcfg-common/pcfg_tree.h        |  79 ----------
 phrase-extract/pcfg-common/syntax_tree.h      |  93 ------------
 phrase-extract/pcfg-common/typedef.h          |   1 -
 phrase-extract/pcfg-common/xml_tree_parser.cc |  89 ------------
 phrase-extract/pcfg-common/xml_tree_parser.h  |  59 --------
 phrase-extract/pcfg-common/xml_tree_writer.h  | 135 ------------------
 phrase-extract/pcfg-extract/Jamfile           |   2 +-
 phrase-extract/pcfg-extract/pcfg_extract.cc   |  34 ++---
 phrase-extract/pcfg-extract/rule_extractor.cc |  16 +--
 phrase-extract/pcfg-extract/rule_extractor.h  |   6 +-
 phrase-extract/pcfg-score/pcfg_score.cc       |  19 +--
 phrase-extract/pcfg-score/tree_scorer.cc      |  66 +++++++--
 phrase-extract/pcfg-score/tree_scorer.h       |  10 +-
 .../syntax-common/xml_tree_parser.cc          |   5 +-
 .../syntax-common/xml_tree_parser.h           |  36 ++++-
 .../syntax-common/xml_tree_writer.cc          |  82 +++++++++++
 .../syntax-common/xml_tree_writer.h           |  27 ++++
 17 files changed, 245 insertions(+), 514 deletions(-)
 delete mode 100644 phrase-extract/pcfg-common/pcfg_tree.h
 delete mode 100644 phrase-extract/pcfg-common/syntax_tree.h
 delete mode 100644 phrase-extract/pcfg-common/xml_tree_parser.cc
 delete mode 100644 phrase-extract/pcfg-common/xml_tree_parser.h
 delete mode 100644 phrase-extract/pcfg-common/xml_tree_writer.h
 create mode 100644 phrase-extract/syntax-common/xml_tree_writer.cc
 create mode 100644 phrase-extract/syntax-common/xml_tree_writer.h
diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h
deleted file mode 100644
index ce28eb8dd..000000000
--- a/phrase-extract/pcfg-common/pcfg_tree.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_PCFG_TREE_H_
-#define PCFG_PCFG_TREE_H_
-
-#include <string>
-
-#include "syntax_tree.h"
-#include "xml_tree_writer.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-template<typename DerivedType>
-class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
- public:
-  typedef std::string LabelType;
-  typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
-
-  PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {}
-
-  double score() const { return score_; }
-  void set_score(double s) { score_ = s; }
-
- private:
-  double score_;
-};
-
-class PcfgTree : public PcfgTreeBase<PcfgTree> {
- public:
-  typedef PcfgTreeBase<PcfgTree> BaseType;
-  PcfgTree(const BaseType::LabelType &label) : BaseType(label) {}
-};
-
-// Specialise XmlOutputHandler for PcfgTree.
-template<>
-class XmlOutputHandler<PcfgTree> {
- public:
-  typedef std::map<std::string, std::string> AttributeMap;
-
-  void GetLabel(const PcfgTree &tree, std::string &label) const {
-    label = tree.label();
-  }
-
-  void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
-    attribute_map.clear();
-    double score = tree.score();
-    if (score != 0.0) {
-      std::ostringstream out;
-      out << tree.score();
-      attribute_map["pcfg"] = out.str();
-    }
-  }
-};
-
-}  // namespace PCFG
-}  // namespace Syntax
-}  // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h
deleted file mode 100644
index c0c6eaef9..000000000
--- a/phrase-extract/pcfg-common/syntax_tree.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_SYNTAX_TREE_H_
-#define PCFG_SYNTAX_TREE_H_
-
-#include <cassert>
-#include <vector>
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-// Base class for SyntaxTree, AgreementTree, and friends.
-template<typename T, typename DerivedType>
-class SyntaxTreeBase {
- public:
-  // Constructors
-  SyntaxTreeBase(const T &label)
-    : label_(label)
-    , children_()
-    , parent_(0) {}
-
-  SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
-    : label_(label)
-    , children_(children)
-    , parent_(0) {}
-
-  // Destructor
-  virtual ~SyntaxTreeBase();
-
-  const T &label() const { return label_; }
-  const DerivedType *parent() const { return parent_; }
-  DerivedType *parent() { return parent_; }
-  const std::vector<DerivedType *> &children() const { return children_; }
-  std::vector<DerivedType *> &children() { return children_; }
-
-  void set_label(const T &label) { label_ = label; }
-  void set_parent(DerivedType *parent) { parent_ = parent; }
-  void set_children(const std::vector<DerivedType *> &c) { children_ = c; }
-
-  bool IsLeaf() const { return children_.empty(); }
-
-  bool IsPreterminal() const {
-    return children_.size() == 1 && children_[0]->IsLeaf();
-  }
-
-  void AddChild(DerivedType *child) { children_.push_back(child); }
-
- private:
-  T label_;
-  std::vector<DerivedType *> children_;
-  DerivedType *parent_;
-};
-
-template<typename T>
-class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
- public:
-  typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
-  SyntaxTree(const T &label) : BaseType(label) {}
-  SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
-    : BaseType(label, children) {}
-};
-
-template<typename T, typename DerivedType>
-SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
-  for (std::size_t i = 0; i < children_.size(); ++i) {
-    delete children_[i];
-  }
-}
-
-}  // namespace PCFG
-}  // namespace Syntax
-}  // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h
index e738163df..1280b89cf 100644
--- a/phrase-extract/pcfg-common/typedef.h
+++ b/phrase-extract/pcfg-common/typedef.h
@@ -24,7 +24,6 @@
 #include <string>
 
 #include "syntax-common/numbered_set.h"
-#include "syntax_tree.h"
 
 namespace MosesTraining {
 namespace Syntax {
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc
deleted file mode 100644
index f15a04811..000000000
--- a/phrase-extract/pcfg-common/xml_tree_parser.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-***********************************************************************/
-
-#include "xml_tree_parser.h"
-
-#include <cassert>
-#include <vector>
-
-#include "tables-core.h"
-#include "XmlException.h"
-#include "XmlTree.h"
-#include "util/tokenize.hh"
-
-#include "syntax-common/exception.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-XmlTreeParser::XmlTreeParser() {
-}
-
-std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
-  m_line = line;
-  m_tree.Clear();
-  try {
-    if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
-      throw Exception("");
-    }
-  } catch (const XmlException &e) {
-    throw Exception(e.getMsg());
-  }
-  m_tree.ConnectNodes();
-  SyntaxNode *root = m_tree.GetTop();
-  if (!root) {
-    // There is no XML tree.
-    return std::auto_ptr<PcfgTree>();
-  }
-  m_words = util::tokenize(m_line);
-  return ConvertTree(*root, m_words);
-}
-
-// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
-std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
-    const SyntaxNode &tree,
-    const std::vector<std::string> &words) {
-  std::auto_ptr<PcfgTree> root(new PcfgTree(tree.GetLabel()));
-  const std::vector<SyntaxNode*> &children = tree.GetChildren();
-  if (children.empty()) {
-    if (tree.GetStart() != tree.GetEnd()) {
-      std::ostringstream msg;
-      msg << "leaf node covers multiple words (" << tree.GetStart()
-          << "-" << tree.GetEnd() << "): this is currently unsupported";
-      throw Exception(msg.str());
-    }
-    std::auto_ptr<PcfgTree> leaf(new PcfgTree(words[tree.GetStart()]));
-    leaf->set_parent(root.get());
-    root->AddChild(leaf.release());
-  } else {
-    for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
-         p != children.end(); ++p) {
-      assert(*p);
-      std::auto_ptr<PcfgTree> child = ConvertTree(**p, words);
-      child->set_parent(root.get());
-      root->AddChild(child.release());
-    }
-  }
-  return root;
-}
-
-}  // namespace PCFG
-}  // namespace Syntax
-}  // namespace MosesTraining
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h
deleted file mode 100644
index 8605c0691..000000000
--- a/phrase-extract/pcfg-common/xml_tree_parser.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_XML_TREE_PARSER_H_
-#define PCFG_XML_TREE_PARSER_H_
-
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "pcfg_tree.h"
-#include "SyntaxNode.h"
-#include "SyntaxNodeCollection.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-// Parses a string in Moses' XML parse tree format and returns a PcfgTree
-// object.
-class XmlTreeParser {
- public:
-  XmlTreeParser();
-  std::auto_ptr<PcfgTree> Parse(const std::string &);
- private:
-  std::auto_ptr<PcfgTree> ConvertTree(const MosesTraining::SyntaxNode &,
-                                      const std::vector<std::string> &);
-
-  std::set<std::string> m_labelSet;
-  std::map<std::string, int> m_topLabelSet;
-  std::string m_line;
-  MosesTraining::SyntaxNodeCollection m_tree;
-  std::vector<std::string> m_words;
-};
-
-}  // namespace PCFG
-}  // namespace Syntax
-}  // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h
deleted file mode 100644
index 8582e544f..000000000
--- a/phrase-extract/pcfg-common/xml_tree_writer.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2012 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-***********************************************************************/
-
-#pragma once
-#ifndef PCFG_XML_TREE_WRITER_H_
-#define PCFG_XML_TREE_WRITER_H_
-
-#include <cassert>
-#include <map>
-#include <memory>
-#include <ostream>
-#include <vector>
-#include <string>
-
-#include "XmlTree.h"
-
-#include "syntax_tree.h"
-
-namespace MosesTraining {
-namespace Syntax {
-namespace PCFG {
-
-template<typename InputTree>
-class XmlOutputHandler {
- public:
-  typedef std::map<std::string, std::string> AttributeMap;
-
-  void GetLabel(const InputTree &, std::string &) const;
-  void GetAttributes(const InputTree &, AttributeMap &) const;
-};
-
-template<typename InputTree>
-class XmlTreeWriter : public XmlOutputHandler<InputTree> {
- public:
-  typedef XmlOutputHandler<InputTree> Base;
-  void Write(const InputTree &, std::ostream &) const;
- private:
-  std::string Escape(const std::string &) const;
-};
-
-template<typename InputTree>
-void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
-                                     std::ostream &out) const {
-  assert(!tree.IsLeaf());
-
-  // Opening tag
-
-  std::string label;
-  Base::GetLabel(tree, label);
-  out << "<tree label=\"" << Escape(label) << "\"";
-
-  typename Base::AttributeMap attribute_map;
-  Base::GetAttributes(tree, attribute_map);
-
-  for (typename Base::AttributeMap::const_iterator p = attribute_map.begin();
-       p != attribute_map.end(); ++p) {
-    out << " " << p->first << "=\"" << p->second << "\"";
-  }
-
-  out << ">";
-
-  // Children
-
-  const std::vector<InputTree *> &children = tree.children();
-  for (typename std::vector<InputTree *>::const_iterator p = children.begin();
-       p != children.end(); ++p) {
-    InputTree &child = **p;
-    if (child.IsLeaf()) {
-      Base::GetLabel(child, label);
-      out << " " << Escape(label);
-    } else {
-      out << " ";
-      Write(**p, out);
-    }
-  }
-
-  // Closing tag
-  out << " </tree>";
-
-  if (tree.parent() == 0) {
-    out << std::endl;
-  }
-}
-
-// Escapes XML special characters.
-template<typename InputTree>
-std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
-  std::string t;
-  std::size_t len = s.size();
-  t.reserve(len);
-  for (std::size_t i = 0; i < len; ++i) {
-    if (s[i] == '<') {
-      t += "&lt;";
-    } else if (s[i] == '>') {
-      t += "&gt;";
-    } else if (s[i] == '[') {
-      t += "&#91;";
-    } else if (s[i] == ']') {
-      t += "&#93;";
-    } else if (s[i] == '|') {
-      t += "&#124;";
-    } else if (s[i] == '&') {
-      t += "&amp;";
-    } else if (s[i] == '\'') {
-      t += "&apos;";
-    } else if (s[i] == '"') {
-      t += "&quot;";
-    } else {
-      t += s[i];
-    }
-  }
-  return t;
-}
-
-}  // namespace PCFG
-}  // namespace Syntax
-}  // namespace MosesTraining
-
-#endif
diff --git a/phrase-extract/pcfg-extract/Jamfile b/phrase-extract/pcfg-extract/Jamfile
index 61f056599..2442b967a 100644
--- a/phrase-extract/pcfg-extract/Jamfile
+++ b/phrase-extract/pcfg-extract/Jamfile
@@ -1 +1 @@
-exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : <include>.. ;
+exe pcfg-extract : [ glob *.cc ] ..//syntax-common ..//pcfg-common ../..//boost_program_options : <include>.. ;
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc
index 29d63b994..8e7a40e07 100644
--- a/phrase-extract/pcfg-extract/pcfg_extract.cc
+++ b/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -19,20 +19,6 @@
 
 #include "pcfg_extract.h"
 
-#include "options.h"
-#include "rule_collection.h"
-#include "rule_extractor.h"
-
-#include "syntax-common/exception.h"
-
-#include "pcfg-common/pcfg.h"
-#include "pcfg-common/pcfg_tree.h"
-#include "pcfg-common/syntax_tree.h"
-#include "pcfg-common/typedef.h"
-#include "pcfg-common/xml_tree_parser.h"
-
-#include <boost/program_options.hpp>
-
 #include <cassert>
 #include <cstdlib>
 #include <fstream>
@@ -43,6 +29,20 @@
 #include <string>
 #include <vector>
 
+#include <boost/program_options.hpp>
+
+#include "syntax-common/exception.h"
+#include "syntax-common/xml_tree_parser.h"
+
+#include "SyntaxTree.h"
+
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/typedef.h"
+
+#include "options.h"
+#include "rule_collection.h"
+#include "rule_extractor.h"
+
 namespace MosesTraining
 {
 namespace Syntax
@@ -60,10 +60,12 @@ int PcfgExtract::Main(int argc, char *argv[])
   Vocabulary non_term_vocab;
   RuleExtractor rule_extractor(non_term_vocab);
   RuleCollection rule_collection;
-  XmlTreeParser parser;
+  std::set<std::string> label_set;
+  std::map<std::string, int> top_label_set;
+  XmlTreeParser parser(label_set, top_label_set);
   std::string line;
   std::size_t line_num = 0;
-  std::auto_ptr<PcfgTree> tree;
+  std::auto_ptr<MosesTraining::SyntaxTree> tree;
   while (std::getline(std::cin, line)) {
     ++line_num;
     try {
diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc
index bd2c48c8a..39da54ef2 100644
--- a/phrase-extract/pcfg-extract/rule_extractor.cc
+++ b/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -19,8 +19,6 @@
 
 #include "rule_extractor.h"
 
-#include "pcfg-common/pcfg_tree.h"
-
 namespace MosesTraining
 {
 namespace Syntax
@@ -33,21 +31,21 @@ RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
 {
 }
 
-void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const
+void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const
 {
-  if (tree.IsPreterminal() || tree.IsLeaf()) {
+  if (tree.IsLeaf() || tree.children()[0]->IsLeaf()) {
     return;
   }
 
-  std::size_t lhs = non_term_vocab_.Insert(tree.label());
+  std::size_t lhs = non_term_vocab_.Insert(tree.value().GetLabel());
   std::vector<std::size_t> rhs;
 
-  const std::vector<PcfgTree *> &children = tree.children();
+  const std::vector<SyntaxTree *> &children = tree.children();
   rhs.reserve(children.size());
-  for (std::vector<PcfgTree *>::const_iterator p(children.begin());
+  for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
        p != children.end(); ++p) {
-    const PcfgTree &child = **p;
-    rhs.push_back(non_term_vocab_.Insert(child.label()));
+    const SyntaxTree &child = **p;
+    rhs.push_back(non_term_vocab_.Insert(child.value().GetLabel()));
     Extract(child, rc);
   }
   rc.Add(lhs, rhs);
diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h
index f35460909..d32d76992 100644
--- a/phrase-extract/pcfg-extract/rule_extractor.h
+++ b/phrase-extract/pcfg-extract/rule_extractor.h
@@ -21,6 +21,8 @@
 #ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
 #define PCFG_EXTRACT_RULE_EXTRACTOR_H_
 
+#include "SyntaxTree.h"
+
 #include "pcfg-common/typedef.h"
 
 #include "rule_collection.h"
@@ -32,14 +34,12 @@ namespace Syntax
 namespace PCFG
 {
 
-class PcfgTree;
-
 // Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
 class RuleExtractor
 {
 public:
   RuleExtractor(Vocabulary &);
-  void Extract(const PcfgTree &, RuleCollection &) const;
+  void Extract(const MosesTraining::SyntaxTree &, RuleCollection &) const;
 private:
   Vocabulary &non_term_vocab_;
 };
diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc
index 314e0fb38..d656d2882 100644
--- a/phrase-extract/pcfg-score/pcfg_score.cc
+++ b/phrase-extract/pcfg-score/pcfg_score.cc
@@ -33,13 +33,14 @@
 
 #include <boost/program_options.hpp>
 
+#include "SyntaxTree.h"
+
 #include "syntax-common/exception.h"
+#include "syntax-common/xml_tree_parser.h"
+#include "syntax-common/xml_tree_writer.h"
 
 #include "pcfg-common/pcfg.h"
-#include "pcfg-common/pcfg_tree.h"
-#include "pcfg-common/syntax_tree.h"
 #include "pcfg-common/typedef.h"
-#include "pcfg-common/xml_tree_parser.h"
 
 namespace MosesTraining
 {
@@ -65,15 +66,17 @@ int PcfgScore::Main(int argc, char *argv[])
 
   // Score corpus according to PCFG.
   TreeScorer scorer(pcfg, non_term_vocab);
-  XmlTreeParser parser;
-  XmlTreeWriter<PcfgTree> writer;
+  std::set<std::string> label_set;
+  std::map<std::string, int> top_label_set;
+  XmlTreeParser parser(label_set, top_label_set);
+  XmlTreeWriter writer(std::cout);
   std::string line;
   std::size_t line_num = 0;
-  std::auto_ptr<PcfgTree> tree;
+  std::auto_ptr<SyntaxTree> tree;
   while (std::getline(std::cin, line)) {
     ++line_num;
     try {
-      tree = parser.Parse(line);
+      tree = parser.Parse(line, true);
     } catch (Exception &e) {
       std::ostringstream msg;
       msg << "line " << line_num << ": " << e.msg();
@@ -93,7 +96,7 @@ int PcfgScore::Main(int argc, char *argv[])
       std::cout << line << std::endl;
       continue;
     }
-    writer.Write(*tree, std::cout);
+    writer.Write(*tree);
   }
 
   return 0;
diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc
index 74d6e79ef..61ae16e4c 100644
--- a/phrase-extract/pcfg-score/tree_scorer.cc
+++ b/phrase-extract/pcfg-score/tree_scorer.cc
@@ -20,6 +20,7 @@
 #include "tree_scorer.h"
 
 #include <cassert>
+#include <sstream>
 
 namespace MosesTraining
 {
@@ -34,30 +35,41 @@ TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
 {
 }
 
-bool TreeScorer::Score(PcfgTree &root) const
+bool TreeScorer::Score(SyntaxTree &root)
 {
-  if (root.IsPreterminal() || root.IsLeaf()) {
+  scores_.clear();
+  ZeroScores(root);
+  if (!CalcScores(root)) {
+    return false;
+  }
+  SetAttributes(root);
+  return true;
+}
+
+bool TreeScorer::CalcScores(SyntaxTree &root)
+{
+  if (root.IsLeaf() || root.children()[0]->IsLeaf()) {
     return true;
   }
 
-  const std::vector<PcfgTree *> &children = root.children();
+  const std::vector<SyntaxTree *> &children = root.children();
 
   double log_prob = 0.0;
 
   std::vector<std::size_t> key;
   key.reserve(children.size()+1);
-  key.push_back(non_term_vocab_.Lookup(root.label()));
+  key.push_back(non_term_vocab_.Lookup(root.value().GetLabel()));
 
-  for (std::vector<PcfgTree *>::const_iterator p(children.begin());
+  for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
        p != children.end(); ++p) {
-    PcfgTree *child = *p;
+    SyntaxTree *child = *p;
     assert(!child->IsLeaf());
-    key.push_back(non_term_vocab_.Lookup(child->label()));
-    if (!Score(*child)) {
+    key.push_back(non_term_vocab_.Lookup(child->value().GetLabel()));
+    if (!CalcScores(*child)) {
       return false;
     }
-    if (!child->IsPreterminal()) {
-      log_prob += child->score();
+    if (!child->children()[0]->IsLeaf()) {
+      log_prob += scores_[child];
     }
   }
   double rule_score;
@@ -66,10 +78,42 @@ bool TreeScorer::Score(PcfgTree &root) const
     return false;
   }
   log_prob += rule_score;
-  root.set_score(log_prob);
+  scores_[&root] = log_prob;
   return true;
 }
 
+// Stores each non-zero score held in scores_ in the corresponding node's
+// "pcfg" attribute, for root and its non-preterminal descendants.
+void TreeScorer::SetAttributes(SyntaxTree &root)
+{
+  // Terminals need no attribute, and preterminals have the implicit score
+  // 0.0, so the recursion stops at either.
+  if (root.IsLeaf() || root.children()[0]->IsLeaf()) {
+    return;
+  }
+  const double score = scores_[&root];
+  if (score != 0.0) {
+    std::ostringstream text;
+    text << score;
+    root.value().attributes["pcfg"] = text.str();
+  }
+  // Recurse into the children.
+  const std::vector<SyntaxTree *> &children = root.children();
+  for (std::size_t i = 0; i < children.size(); ++i) {
+    SetAttributes(*children[i]);
+  }
+}
+
+// Records a score of zero for root and for every node beneath it.
+void TreeScorer::ZeroScores(SyntaxTree &root)
+{
+  scores_[&root] = 0.0;
+  const std::vector<SyntaxTree *> &kids = root.children();
+  for (std::size_t i = 0; i < kids.size(); ++i) {
+    ZeroScores(*kids[i]);
+  }
+}
+
 }  // namespace PCFG
 }  // namespace Syntax
 }  // namespace MosesTraining
diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h
index 8b1afcc3a..cf9fdd1a3 100644
--- a/phrase-extract/pcfg-score/tree_scorer.h
+++ b/phrase-extract/pcfg-score/tree_scorer.h
@@ -21,8 +21,9 @@
 #ifndef PCFG_SCORE_TREE_SCORER_H_
 #define PCFG_SCORE_TREE_SCORER_H_
 
+#include "SyntaxTree.h"
+
 #include "pcfg-common/pcfg.h"
-#include "pcfg-common/pcfg_tree.h"
 #include "pcfg-common/typedef.h"
 
 namespace MosesTraining
@@ -39,11 +40,16 @@ public:
 
   // Score tree according to PCFG.  Returns false if unsuccessful (due to
   // missing rule).
-  bool Score(PcfgTree &) const;
+  bool Score(SyntaxTree &);
 
 private:
   const Pcfg &pcfg_;
   const Vocabulary &non_term_vocab_;
+  std::map<SyntaxTree *, double> scores_;
+
+  bool CalcScores(SyntaxTree &);
+  void SetAttributes(SyntaxTree &);
+  void ZeroScores(SyntaxTree &);
 };
 
 }  // namespace PCFG
diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc
index bf3c6d87e..6eeb110e9 100644
--- a/phrase-extract/syntax-common/xml_tree_parser.cc
+++ b/phrase-extract/syntax-common/xml_tree_parser.cc
@@ -20,13 +20,14 @@ XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
 {
 }
 
-std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line)
+std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line,
+                                               bool unescape)
 {
   line_ = line;
   node_collection_.Clear();
   try {
     if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_,
-                                top_label_set_, false)) {
+                                top_label_set_, unescape)) {
       throw Exception("");
     }
   } catch (const XmlException &e) {
diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h
index e0b75c830..0f671c65a 100644
--- a/phrase-extract/syntax-common/xml_tree_parser.h
+++ b/phrase-extract/syntax-common/xml_tree_parser.h
@@ -15,18 +15,42 @@
 namespace MosesTraining {
 namespace Syntax {
 
-// Parses a string in Moses' XML parse tree format and returns a SyntaxTree
-// object.  This is a wrapper around the ProcessAndStripXMLTags function.
+/** Parses string representations of parse trees in Moses' XML format and
+ *  converts them to SyntaxTree objects.
+ *
+ *  This is a thin wrapper around the ProcessAndStripXMLTags function.  After
+ *  calling Parse(), the output of the ProcessAndStripXMLTags function (the
+ *  sentence, node collection, label set, and top label set) are available via
+ *  accessors.
+ */
 class XmlTreeParser {
  public:
   XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
 
-  std::auto_ptr<SyntaxTree> Parse(const std::string &);
+  //! Parse a single sentence and return a SyntaxTree (with words attached).
+  std::auto_ptr<SyntaxTree> Parse(const std::string &, bool=false);
 
-  const std::vector<std::string>& GetWords() {
-    return words_;
-  }
+  // TODO
+  //! Get the sentence string (see ProcessAndStripXMLTags)
+  //const std::string &sentence() const;
 
+  // FIXME
+  //! Get the sentence as a vector of tokens
+  const std::vector<std::string>& GetWords() { return words_; }
+
+  // TODO
+  //! Get the node collection (see ProcessAndStripXMLTags)
+  const SyntaxNodeCollection &node_collection() const;
+
+  // TODO
+  //! Get the label set (see ProcessAndStripXMLTags)
+  const std::set<std::string> &label_set() const;
+
+  // TODO
+  //! Get the top label set (see ProcessAndStripXMLTags)
+  const std::map<std::string, int> &top_label_set() const;
+
+  // FIXME
   const SyntaxNodeCollection &GetNodeCollection() const {
     return node_collection_;
   }
diff --git a/phrase-extract/syntax-common/xml_tree_writer.cc b/phrase-extract/syntax-common/xml_tree_writer.cc
new file mode 100644
index 000000000..3c16cb2eb
--- /dev/null
+++ b/phrase-extract/syntax-common/xml_tree_writer.cc
@@ -0,0 +1,82 @@
+#include "xml_tree_writer.h"
+
+#include <cassert>
+#include <ostream>
+#include <vector>
+#include <string>
+
+#include "SyntaxTree.h"
+#include "XmlTree.h"
+
+
+namespace MosesTraining {
+namespace Syntax {
+
+// Writes tree, and recursively its subtrees, to out_ as <tree> elements.
+void XmlTreeWriter::Write(const SyntaxTree &tree) const {
+  assert(!tree.IsLeaf());
+
+  // Opening tag: the label first, then every other attribute as stored.
+  const SyntaxNode::AttributeMap &attrs = tree.value().attributes;
+  out_ << "<tree label=\"" << Escape(tree.value().GetLabel()) << "\"";
+  for (SyntaxNode::AttributeMap::const_iterator a = attrs.begin();
+       a != attrs.end(); ++a) {
+    if (a->first == "label") {
+      continue;
+    }
+    out_ << " " << a->first << "=\"" << a->second << "\"";
+  }
+  out_ << ">";
+
+  // Children: leaves are written as escaped words, subtrees recursively.
+  const std::vector<SyntaxTree *> &kids = tree.children();
+  for (std::vector<SyntaxTree *>::const_iterator k = kids.begin();
+       k != kids.end(); ++k) {
+    out_ << " ";
+    if ((*k)->IsLeaf()) {
+      out_ << Escape((*k)->value().GetLabel());
+    } else {
+      Write(**k);
+    }
+  }
+  // Closing tag; the root additionally ends the line (std::endl flushes).
+  out_ << " </tree>";
+  if (tree.parent() == 0) {
+    out_ << std::endl;
+  }
+}
+
+// Returns a copy of s with the characters < > [ ] | & ' " replaced by the
+// corresponding entity references.  If escaping was disabled when the writer
+// was constructed, s is returned unchanged.
+std::string XmlTreeWriter::Escape(const std::string &s) const {
+  if (!escape_) {
+    return s;
+  }
+  std::string escaped;
+  escaped.reserve(s.size());
+  for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
+    // Entity for the current character, or null if it is copied verbatim.
+    const char *entity = 0;
+    switch (*c) {
+      case '<':  entity = "&lt;";    break;
+      case '>':  entity = "&gt;";    break;
+      case '[':  entity = "&#91;";   break;
+      case ']':  entity = "&#93;";   break;
+      case '|':  entity = "&#124;";  break;
+      case '&':  entity = "&amp;";   break;
+      case '\'': entity = "&apos;";  break;
+      case '"':  entity = "&quot;";  break;
+      default:   break;
+    }
+    if (entity) {
+      escaped += entity;
+    } else {
+      escaped += *c;
+    }
+  }
+  return escaped;
+}
+
+}  // namespace Syntax
+}  // namespace MosesTraining
diff --git a/phrase-extract/syntax-common/xml_tree_writer.h b/phrase-extract/syntax-common/xml_tree_writer.h
new file mode 100644
index 000000000..b39d01fab
--- /dev/null
+++ b/phrase-extract/syntax-common/xml_tree_writer.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <ostream>
+#include <string>
+
+#include "SyntaxTree.h"
+
+namespace MosesTraining {
+namespace Syntax {
+//! Writes SyntaxTree objects to an output stream as <tree> XML elements.
+class XmlTreeWriter {
+ public:
+  XmlTreeWriter(std::ostream &out, bool escape=true)
+      : out_(out)
+      , escape_(escape) {}
+  //! Write a non-leaf tree (and all of its descendants) to the stream.
+  void Write(const SyntaxTree &) const;
+
+ private:
+  std::string Escape(const std::string &) const;
+  // When escape_ is false, Escape() returns its argument unchanged.
+  std::ostream &out_;
+  bool escape_;
+};
+
+}  // namespace Syntax
+}  // namespace MosesTraining

From 2e21f051f217a6b835433cbc456bdcc841187ec0 Mon Sep 17 00:00:00 2001
From: Phil Williams <philip.williams@mac.com>
Date: Wed, 3 Jun 2015 10:05:36 +0100
Subject: [PATCH 2/6] Ongoing moses/phrase-extract refactoring

---
 phrase-extract/extract-ghkm/ExtractGHKM.cpp   | 43 +++++++--------
 .../filter-rule-table/FilterRuleTable.cpp     |  4 +-
 phrase-extract/pcfg-extract/pcfg_extract.cc   |  4 +-
 phrase-extract/pcfg-score/pcfg_score.cc       |  4 +-
 .../syntax-common/xml_tree_parser.cc          | 15 ++----
 .../syntax-common/xml_tree_parser.h           | 53 ++++++++-----------
 6 files changed, 49 insertions(+), 74 deletions(-)

diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index 2293371ac..c48a37367 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -119,14 +119,6 @@ int ExtractGHKM::Main(int argc, char *argv[])
     OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
   }
 
-  // Target label sets for producing glue grammar.
-  std::set<std::string> targetLabelSet;
-  std::map<std::string, int> targetTopLabelSet;
-
-  // Source label sets for producing glue grammar.
-  std::set<std::string> sourceLabelSet;
-  std::map<std::string, int> sourceTopLabelSet;
-
   // Word count statistics for producing unknown word labels.
   std::map<std::string, int> targetWordCount;
   std::map<std::string, std::string> targetWordLabel;
@@ -139,8 +131,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
   std::string sourceLine;
   std::string alignmentLine;
   Alignment alignment;
-  Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
-  Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
+  Syntax::XmlTreeParser targetXmlTreeParser;
+  Syntax::XmlTreeParser sourceXmlTreeParser;
   ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
   StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
   size_t lineNum = options.sentenceOffset;
@@ -194,7 +186,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
         }
         Error(oss.str());
       }
-      sourceTokens = sourceXmlTreeParser.GetWords();
+      sourceTokens = sourceXmlTreeParser.words();
     }
 
     // Read word alignments.
@@ -240,7 +232,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
 
     // Initialize phrase orientation scoring object
     PhraseOrientation phraseOrientation(sourceTokens.size(),
-        targetXmlTreeParser.GetWords().size(), alignment);
+        targetXmlTreeParser.words().size(), alignment);
 
     // Write the rules, subject to scope pruning.
     const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
@@ -272,7 +264,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
         // SCFG output.
         ScfgRule *r = 0;
         if (options.sourceLabels) {
-          r = new ScfgRule(**q, &sourceXmlTreeParser.GetNodeCollection());
+          r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection());
         } else {
           r = new ScfgRule(**q);
         }
@@ -315,14 +307,14 @@ int ExtractGHKM::Main(int argc, char *argv[])
 
   std::map<std::string,size_t> sourceLabels;
   if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
-
-    sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side)
-    sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side)
-    sourceLabelSet.insert("TOPLABEL");  // as used in the glue grammar
-    sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar
+    std::set<std::string> extendedLabelSet = sourceXmlTreeParser.label_set();
+    extendedLabelSet.insert("XLHS"); // non-matching label (left-hand side)
+    extendedLabelSet.insert("XRHS"); // non-matching label (right-hand side)
+    extendedLabelSet.insert("TOPLABEL");  // as used in the glue grammar
+    extendedLabelSet.insert("SOMELABEL"); // as used in the glue grammar
     size_t index = 0;
-    for (std::set<std::string>::const_iterator iter=sourceLabelSet.begin();
-         iter!=sourceLabelSet.end(); ++iter, ++index) {
+    for (std::set<std::string>::const_iterator iter=extendedLabelSet.begin();
+         iter!=extendedLabelSet.end(); ++iter, ++index) {
       sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
     }
     WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
@@ -332,14 +324,18 @@ int ExtractGHKM::Main(int argc, char *argv[])
   std::map<std::string, int> strippedTargetTopLabelSet;
   if (options.stripBitParLabels &&
       (!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) {
-    StripBitParLabels(targetLabelSet, targetTopLabelSet, strippedTargetLabelSet, strippedTargetTopLabelSet);
+    StripBitParLabels(targetXmlTreeParser.label_set(),
+                      targetXmlTreeParser.top_label_set(),
+                      strippedTargetLabelSet, strippedTargetTopLabelSet);
   }
 
   if (!options.glueGrammarFile.empty()) {
     if (options.stripBitParLabels) {
       WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream);
     } else {
-      WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream);
+      WriteGlueGrammar(targetXmlTreeParser.label_set(),
+                       targetXmlTreeParser.top_label_set(),
+                       sourceLabels, options, glueGrammarStream);
     }
   }
 
@@ -355,7 +351,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
     if (options.stripBitParLabels) {
       WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream);
     } else {
-      WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream);
+      WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(),
+                                  unknownWordSoftMatchesStream);
     }
   }
 
diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp
index 0c6f132f8..32d2019cf 100644
--- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp
+++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp
@@ -126,9 +126,7 @@ void FilterRuleTable::ReadTestSet(
 void FilterRuleTable::ReadTestSet(
   std::istream &input, std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
 {
-  std::set<std::string> labelSet;
-  std::map<std::string, int> topLabelSet;
-  XmlTreeParser parser(labelSet, topLabelSet);
+  XmlTreeParser parser;
   int lineNum = 0;
   std::string line;
   while (std::getline(input, line)) {
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc
index 8e7a40e07..87419edb7 100644
--- a/phrase-extract/pcfg-extract/pcfg_extract.cc
+++ b/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -60,9 +60,7 @@ int PcfgExtract::Main(int argc, char *argv[])
   Vocabulary non_term_vocab;
   RuleExtractor rule_extractor(non_term_vocab);
   RuleCollection rule_collection;
-  std::set<std::string> label_set;
-  std::map<std::string, int> top_label_set;
-  XmlTreeParser parser(label_set, top_label_set);
+  XmlTreeParser parser;
   std::string line;
   std::size_t line_num = 0;
   std::auto_ptr<MosesTraining::SyntaxTree> tree;
diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc
index d656d2882..e11f73f70 100644
--- a/phrase-extract/pcfg-score/pcfg_score.cc
+++ b/phrase-extract/pcfg-score/pcfg_score.cc
@@ -66,9 +66,7 @@ int PcfgScore::Main(int argc, char *argv[])
 
   // Score corpus according to PCFG.
   TreeScorer scorer(pcfg, non_term_vocab);
-  std::set<std::string> label_set;
-  std::map<std::string, int> top_label_set;
-  XmlTreeParser parser(label_set, top_label_set);
+  XmlTreeParser parser;
   XmlTreeWriter writer(std::cout);
   std::string line;
   std::size_t line_num = 0;
diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc
index 6eeb110e9..34f566a03 100644
--- a/phrase-extract/syntax-common/xml_tree_parser.cc
+++ b/phrase-extract/syntax-common/xml_tree_parser.cc
@@ -10,23 +10,18 @@
 #include "XmlException.h"
 #include "XmlTree.h"
 
+#include "exception.h"
+
 namespace MosesTraining {
 namespace Syntax {
 
-XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
-                             std::map<std::string, int> &topLabelSet)
-  : label_set_(labelSet)
-  , top_label_set_(topLabelSet)
-{
-}
-
 std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line,
                                                bool unescape)
 {
-  line_ = line;
+  sentence_ = line;
   node_collection_.Clear();
   try {
-    if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_,
+    if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_,
                                 top_label_set_, unescape)) {
       throw Exception("");
     }
@@ -34,7 +29,7 @@ std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line,
     throw Exception(e.getMsg());
   }
   std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree();
-  words_ = util::tokenize(line_);
+  words_ = util::tokenize(sentence_);
   AttachWords(words_, *root);
   return root;
 }
diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h
index 0f671c65a..48ea056b8 100644
--- a/phrase-extract/syntax-common/xml_tree_parser.h
+++ b/phrase-extract/syntax-common/xml_tree_parser.h
@@ -6,12 +6,9 @@
 #include <string>
 #include <vector>
 
-#include "SyntaxNode.h"
 #include "SyntaxNodeCollection.h"
 #include "SyntaxTree.h"
 
-#include "exception.h"
-
 namespace MosesTraining {
 namespace Syntax {
 
@@ -25,44 +22,36 @@ namespace Syntax {
  */
 class XmlTreeParser {
  public:
-  XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
-
   //! Parse a single sentence and return a SyntaxTree (with words attached).
-  std::auto_ptr<SyntaxTree> Parse(const std::string &, bool=false);
+  std::auto_ptr<SyntaxTree> Parse(const std::string &, bool unescape=false);
 
-  // TODO
-  //! Get the sentence string (see ProcessAndStripXMLTags)
-  //const std::string &sentence() const;
+  //! Get the last parsed sentence (as returned by ProcessAndStripXMLTags).
+  const std::string &sentence() const { return sentence_; }
 
-  // FIXME
-  //! Get the sentence as a vector of tokens
-  const std::vector<std::string>& GetWords() { return words_; }
+  //! Get the words of the last parsed sentence (sentence(), tokenized).
+  const std::vector<std::string> &words() const { return words_; }
 
-  // TODO
-  //! Get the node collection (see ProcessAndStripXMLTags)
-  const SyntaxNodeCollection &node_collection() const;
-
-  // TODO
-  //! Get the label set (see ProcessAndStripXMLTags)
-  const std::set<std::string> &label_set() const;
-
-  // TODO
-  //! Get the top label set (see ProcessAndStripXMLTags)
-  const std::map<std::string, int> &top_label_set() const;
-
-  // FIXME
-  const SyntaxNodeCollection &GetNodeCollection() const {
+  //! Get the node collection (cleared and refilled by each call to Parse).
+  const SyntaxNodeCollection &node_collection() const {
     return node_collection_;
   }
 
- private:
-  std::set<std::string> &label_set_;
-  std::map<std::string, int> &top_label_set_;
-  std::string line_;
-  SyntaxNodeCollection node_collection_;
-  std::vector<std::string> words_;
+  //! Get the label set (added to on each Parse; Parse never clears it).
+  const std::set<std::string> &label_set() const { return label_set_; }
 
+  //! Get the top label set, a label-to-count map never cleared by Parse.
+  const std::map<std::string, int> &top_label_set() const {
+    return top_label_set_;
+  }
+
+ private:
   void AttachWords(const std::vector<std::string> &, SyntaxTree &);
+
+  std::string sentence_;
+  SyntaxNodeCollection node_collection_;
+  std::set<std::string> label_set_;
+  std::map<std::string, int> top_label_set_;
+  std::vector<std::string> words_;
 };
 
 }  // namespace Syntax

From 5e09d3dc71ab8391c651418c01aa5c324e53683b Mon Sep 17 00:00:00 2001
From: Phil Williams <philip.williams@mac.com>
Date: Wed, 3 Jun 2015 10:33:46 +0100
Subject: [PATCH 3/6] Ongoing moses/phrase-extract refactoring

---
 phrase-extract/SyntaxNode.h             | 25 +-------------
 phrase-extract/SyntaxNodeCollection.cpp | 43 -------------------------
 phrase-extract/SyntaxNodeCollection.h   | 10 +-----
 phrase-extract/XmlTree.cpp              |  5 ---
 phrase-extract/extract-rules-main.cpp   | 19 ++++++++---
 5 files changed, 17 insertions(+), 85 deletions(-)

diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h
index 5f57e1790..883f9724f 100644
--- a/phrase-extract/SyntaxNode.h
+++ b/phrase-extract/SyntaxNode.h
@@ -32,9 +32,6 @@ class SyntaxNode
 protected:
   int m_start, m_end;
   std::string m_label;
-  std::vector< SyntaxNode* > m_children;
-  SyntaxNode* m_parent;
-  float m_pcfgScore;
 public:
   typedef std::map<std::string, std::string> AttributeMap;
 
@@ -43,9 +40,7 @@ public:
   SyntaxNode( int startPos, int endPos, std::string label )
     :m_start(startPos)
     ,m_end(endPos)
-    ,m_label(label)
-    ,m_parent(0)
-    ,m_pcfgScore(0.0f) {
+    ,m_label(label) {
   }
   int GetStart() const {
     return m_start;
@@ -56,24 +51,6 @@ public:
   std::string GetLabel() const {
     return m_label;
   }
-  float GetPcfgScore() const {
-    return m_pcfgScore;
-  }
-  void SetPcfgScore(float score) {
-    m_pcfgScore = score;
-  }
-  SyntaxNode *GetParent() {
-    return m_parent;
-  }
-  void SetParent(SyntaxNode *parent) {
-    m_parent = parent;
-  }
-  void AddChild(SyntaxNode* child) {
-    m_children.push_back(child);
-  }
-  const std::vector< SyntaxNode* > &GetChildren() const {
-    return m_children;
-  }
 };
 
 }  // namespace MosesTraining
diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp
index 60a2f6c2f..e1c9c44e1 100644
--- a/phrase-extract/SyntaxNodeCollection.cpp
+++ b/phrase-extract/SyntaxNodeCollection.cpp
@@ -33,7 +33,6 @@ SyntaxNodeCollection::~SyntaxNodeCollection()
 
 void SyntaxNodeCollection::Clear()
 {
-  m_top = 0;
   // loop through all m_nodes, delete them
   for(size_t i=0; i<m_nodes.size(); i++) {
     delete m_nodes[i];
@@ -110,48 +109,6 @@ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos,
   return endIndex->second;
 }
 
-void SyntaxNodeCollection::ConnectNodes()
-{
-  typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
-
-  SyntaxNode *prev = 0;
-  // Iterate over all start indices from lowest to highest.
-  for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
-    const SyntaxTreeIndex2 &inner = p->second;
-    // Iterate over all end indices from highest to lowest.
-    for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
-      const std::vector<SyntaxNode*> &nodes = q->second;
-      // Iterate over all nodes that cover the same span in order of tree
-      // depth, top-most first.
-      for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
-           r != nodes.rend(); ++r) {
-        SyntaxNode *node = *r;
-        if (!prev) {
-          // node is the root.
-          m_top = node;
-          node->SetParent(0);
-        } else if (prev->GetStart() == node->GetStart()) {
-          // prev is the parent of node.
-          assert(prev->GetEnd() >= node->GetEnd());
-          node->SetParent(prev);
-          prev->AddChild(node);
-        } else {
-          // prev is a descendant of node's parent.  The lowest common
-          // ancestor of prev and node will be node's parent.
-          SyntaxNode *ancestor = prev->GetParent();
-          while (ancestor->GetEnd() < node->GetEnd()) {
-            ancestor = ancestor->GetParent();
-          }
-          assert(ancestor);
-          node->SetParent(ancestor);
-          ancestor->AddChild(node);
-        }
-        prev = node;
-      }
-    }
-  }
-}
-
 std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
 {
   std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h
index a0d19841c..c8ca67d3d 100644
--- a/phrase-extract/SyntaxNodeCollection.h
+++ b/phrase-extract/SyntaxNodeCollection.h
@@ -38,7 +38,6 @@ class SyntaxNodeCollection
 {
 protected:
   std::vector< SyntaxNode* > m_nodes;
-  SyntaxNode* m_top;
 
   typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2;
   typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
@@ -49,18 +48,12 @@ protected:
   std::vector< SyntaxNode* > m_emptyNode;
 
 public:
-  SyntaxNodeCollection()
-    : m_top(0)  // m_top doesn't get set unless ConnectNodes is called.
-    , m_size(0) {}
+  SyntaxNodeCollection() : m_size(0) {}
 
   ~SyntaxNodeCollection();
 
   SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
 
-  SyntaxNode *GetTop() {
-    return m_top;
-  }
-
   ParentNodes Parse();
   bool HasNode( int startPos, int endPos ) const;
   const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
@@ -70,7 +63,6 @@ public:
   size_t GetNumWords() const {
     return m_size;
   }
-  void ConnectNodes();
   void Clear();
 
   std::auto_ptr<SyntaxTree> ExtractTree();
diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp
index d3c5da900..ffbbd453a 100644
--- a/phrase-extract/XmlTree.cpp
+++ b/phrase-extract/XmlTree.cpp
@@ -398,10 +398,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
         string label = ParseXmlTagAttribute(tagContent,"label");
         labelCollection.insert( label );
 
-        string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
-        float pcfgScore = pcfgString == "" ? 0.0f
-                          : std::atof(pcfgString.c_str());
-
         // report what we have processed so far
         if (0) {
           cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
@@ -409,7 +405,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
           cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
         }
         SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
-        node->SetPcfgScore(pcfgScore);
         ParseXmlTagAttributes(tagContent, node->attributes);
       }
     }
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index 825f12d89..8f1ff758b 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -110,6 +110,8 @@ void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence );
 void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
 void writeUnknownWordLabel(const string &);
 
+double getPcfgScore(const SyntaxNode &);
+
 
 int main(int argc, char* argv[])
 {
@@ -564,8 +566,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
       }
 
       if (m_options.pcfgScore) {
-        double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
-        logPCFGScore -= score;
+        logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]);
       }
 
       currPos = hole.GetEnd(1);
@@ -689,7 +690,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
 
   // target
   if (m_options.pcfgScore) {
-    double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+    double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]);
     rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
                   + " [" + targetLabel + "]";
     rule.pcfgScore = std::exp(logPCFGScore);
@@ -973,7 +974,7 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
   rule.target += "[" + targetLabel + "]";
 
   if (m_options.pcfgScore) {
-    double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+    double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[0]);
     rule.pcfgScore = std::exp(logPCFGScore);
   }
 
@@ -1194,3 +1195,13 @@ void writeUnknownWordLabel(const string & fileName)
 
   outFile.close();
 }
+
+// Return the score stored in the node's "pcfg" attribute, or 0.0 if the
+// node has no such attribute.
+double getPcfgScore(const SyntaxNode &node)
+{
+  const SyntaxNode::AttributeMap &attrs = node.attributes;
+  SyntaxNode::AttributeMap::const_iterator pos = attrs.find("pcfg");
+  if (pos == attrs.end()) return 0.0;
+  return std::atof(pos->second.c_str());
+}

From ed321791a75c6177b218a0098d184c308bc9c561 Mon Sep 17 00:00:00 2001
From: Phil Williams <philip.williams@mac.com>
Date: Wed, 3 Jun 2015 11:10:45 +0100
Subject: [PATCH 4/6] Ongoing moses/phrase-extract refactoring

---
 phrase-extract/SyntaxNode.h                   | 36 +++++----------
 phrase-extract/SyntaxNodeCollection.cpp       |  8 ++--
 phrase-extract/XmlTree.cpp                    |  2 +-
 .../extract-ghkm/AlignmentGraph.cpp           |  3 +-
 phrase-extract/extract-ghkm/ExtractGHKM.cpp   |  6 +--
 phrase-extract/extract-ghkm/ScfgRule.cpp      |  2 +-
 phrase-extract/extract-rules-main.cpp         | 16 +++----
 .../filter-rule-table/TreeTsgFilter.cpp       |  2 +-
 phrase-extract/pcfg-extract/rule_extractor.cc |  4 +-
 phrase-extract/pcfg-score/tree_scorer.cc      |  4 +-
 phrase-extract/relax-parse-main.cpp           | 44 +++++++++----------
 .../syntax-common/xml_tree_parser.cc          |  6 +--
 .../syntax-common/xml_tree_writer.cc          |  4 +-
 13 files changed, 62 insertions(+), 75 deletions(-)

diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h
index 883f9724f..f38e94713 100644
--- a/phrase-extract/SyntaxNode.h
+++ b/phrase-extract/SyntaxNode.h
@@ -20,37 +20,23 @@
 #pragma once
 
 #include <map>
-#include <sstream>
 #include <string>
-#include <vector>
 
-namespace MosesTraining
-{
+namespace MosesTraining {
 
-class SyntaxNode
-{
-protected:
-  int m_start, m_end;
-  std::string m_label;
-public:
+struct SyntaxNode {
   typedef std::map<std::string, std::string> AttributeMap;
 
-  AttributeMap attributes;
+  SyntaxNode(const std::string &label_, int start_, int end_)
+    : label(label_)
+    , start(start_)
+    , end(end_) {
+  }
 
-  SyntaxNode( int startPos, int endPos, std::string label )
-    :m_start(startPos)
-    ,m_end(endPos)
-    ,m_label(label) {
-  }
-  int GetStart() const {
-    return m_start;
-  }
-  int GetEnd() const {
-    return m_end;
-  }
-  std::string GetLabel() const {
-    return m_label;
-  }
+  std::string label;
+  int start;
+  int end;
+  AttributeMap attributes;
 };
 
 }  // namespace MosesTraining
diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp
index e1c9c44e1..7421cc0ed 100644
--- a/phrase-extract/SyntaxNodeCollection.cpp
+++ b/phrase-extract/SyntaxNodeCollection.cpp
@@ -44,7 +44,7 @@ void SyntaxNodeCollection::Clear()
 SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
                                           const std::string &label)
 {
-  SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
+  SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
   m_nodes.push_back( newNode );
   m_index[ startPos ][ endPos ].push_back( newNode );
   m_size = std::max(endPos+1, m_size);
@@ -141,16 +141,16 @@ std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
           // node is the root.
           root = tree;
           tree->parent() = 0;
-        } else if (prevNode->GetStart() == node->GetStart()) {
+        } else if (prevNode->start == node->start) {
           // prevNode is the parent of node.
-          assert(prevNode->GetEnd() >= node->GetEnd());
+          assert(prevNode->end >= node->end);
           tree->parent() = prevTree;
           prevTree->children().push_back(tree);
         } else {
           // prevNode is a descendant of node's parent.  The lowest common
           // ancestor of prevNode and node will be node's parent.
           SyntaxTree *ancestor = prevTree->parent();
-          while (ancestor->value().GetEnd() < tree->value().GetEnd()) {
+          while (ancestor->value().end < tree->value().end) {
             ancestor = ancestor->parent();
           }
           assert(ancestor);
diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp
index ffbbd453a..d8b77b6e6 100644
--- a/phrase-extract/XmlTree.cpp
+++ b/phrase-extract/XmlTree.cpp
@@ -419,7 +419,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
   const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
   for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
     SyntaxNode *n = *node;
-    const string &label = n->GetLabel();
+    const string &label = n->label;
     if (topLabelCollection.find( label ) == topLabelCollection.end())
       topLabelCollection[ label ] = 0;
     topLabelCollection[ label ]++;
diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
index 1a3c23de5..7c179295f 100644
--- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -21,6 +21,7 @@
 
 #include <algorithm>
 #include <cassert>
+#include <cstdlib>
 #include <memory>
 #include <stack>
 
@@ -213,7 +214,7 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root)
 {
   NodeType nodeType = (root->IsLeaf()) ? TARGET : TREE;
 
-  std::auto_ptr<Node> n(new Node(root->value().GetLabel(), nodeType));
+  std::auto_ptr<Node> n(new Node(root->value().label, nodeType));
 
   if (nodeType == TREE) {
     float score = 0.0f;
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index c48a37367..c96cda146 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -813,7 +813,7 @@ void ExtractGHKM::CollectWordLabelCounts(
   for (SyntaxTree::ConstLeafIterator p(root);
        p != SyntaxTree::ConstLeafIterator(); ++p) {
     const SyntaxTree &leaf = *p;
-    const std::string &word = leaf.value().GetLabel();
+    const std::string &word = leaf.value().label;
     const SyntaxTree *ancestor = leaf.parent();
     // If unary rule elimination is enabled and this word is at the end of a
     // chain of unary rewrites, e.g.
@@ -825,7 +825,7 @@ void ExtractGHKM::CollectWordLabelCounts(
            ancestor->parent()->children().size() == 1) {
       ancestor = ancestor->parent();
     }
-    const std::string &label = ancestor->value().GetLabel();
+    const std::string &label = ancestor->value().label;
     ++wordCount[word];
     wordLabel[word] = label;
   }
@@ -837,7 +837,7 @@ std::vector<std::string> ExtractGHKM::ReadTokens(const SyntaxTree &root) const
   for (SyntaxTree::ConstLeafIterator p(root);
        p != SyntaxTree::ConstLeafIterator(); ++p) {
     const SyntaxTree &leaf = *p;
-    const std::string &word = leaf.value().GetLabel();
+    const std::string &word = leaf.value().label;
     tokens.push_back(word);
   }
   return tokens;
diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp
index a6fc19dd9..1a49c862e 100644
--- a/phrase-extract/extract-ghkm/ScfgRule.cpp
+++ b/phrase-extract/extract-ghkm/ScfgRule.cpp
@@ -144,7 +144,7 @@ void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection,
       sourceNodeCollection->GetNodes(span.first,span.second);
     if (!sourceLabels.empty()) {
       // store the topmost matching label from the source syntax tree
-      m_sourceLabels.push_back(sourceLabels.back()->GetLabel());
+      m_sourceLabels.push_back(sourceLabels.back()->label);
     }
   } else {
     // no matching source-side syntactic constituent: store nonMatchingLabel
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index 8f1ff758b..e6fff965d 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -507,7 +507,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
 
       int labelI = labelIndex[ 2+holeCount+holeTotal ];
       string label = m_options.sourceSyntax ?
-                     m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
+                     m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->label : "X";
       hole.SetLabel(label, 0);
 
       currPos = hole.GetEnd(0);
@@ -550,7 +550,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
       int labelI = labelIndex[ 2+holeCount ];
       string targetLabel;
       if (m_options.targetSyntax) {
-        targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
+        targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
       } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
         targetLabel = "S";
       } else {
@@ -675,7 +675,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
   // phrase labels
   string targetLabel;
   if (m_options.targetSyntax) {
-    targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
+    targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
   } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
     targetLabel = "S";
   } else {
@@ -683,7 +683,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
   }
 
   string sourceLabel = m_options.sourceSyntax ?
-                       m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
+                       m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->label : "X";
 
   // create non-terms on the source side
   preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
@@ -947,13 +947,13 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
   // phrase labels
   string targetLabel,sourceLabel;
   if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
-    sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
+    sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
   } else {
     sourceLabel = m_options.sourceSyntax ?
-                  m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
+                  m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X";
 
     if (m_options.targetSyntax) {
-      targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
+      targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
     } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
       targetLabel = "S";
     } else {
@@ -1166,7 +1166,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence )
     const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti);
     if (labels.size() > 0) {
       wordCount[ word ]++;
-      wordLabel[ word ] = labels[0]->GetLabel();
+      wordLabel[ word ] = labels[0]->label;
     }
   }
 }
diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp
index 17a8dcb22..b9c58228d 100644
--- a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp
+++ b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp
@@ -27,7 +27,7 @@ TreeTsgFilter::TreeTsgFilter(
 
 TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s)
 {
-  IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel()));
+  IdTree *t = new IdTree(m_testVocab.Insert(s.value().label));
   const std::vector<SyntaxTree*> &sChildren = s.children();
   std::vector<IdTree*> &tChildren = t->children();
   tChildren.reserve(sChildren.size());
diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc
index 39da54ef2..f20f2d978 100644
--- a/phrase-extract/pcfg-extract/rule_extractor.cc
+++ b/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -37,7 +37,7 @@ void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const
     return;
   }
 
-  std::size_t lhs = non_term_vocab_.Insert(tree.value().GetLabel());
+  std::size_t lhs = non_term_vocab_.Insert(tree.value().label);
   std::vector<std::size_t> rhs;
 
   const std::vector<SyntaxTree *> &children = tree.children();
@@ -45,7 +45,7 @@ void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const
   for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
        p != children.end(); ++p) {
     const SyntaxTree &child = **p;
-    rhs.push_back(non_term_vocab_.Insert(child.value().GetLabel()));
+    rhs.push_back(non_term_vocab_.Insert(child.value().label));
     Extract(child, rc);
   }
   rc.Add(lhs, rhs);
diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc
index 61ae16e4c..3c6b6b0c8 100644
--- a/phrase-extract/pcfg-score/tree_scorer.cc
+++ b/phrase-extract/pcfg-score/tree_scorer.cc
@@ -58,13 +58,13 @@ bool TreeScorer::CalcScores(SyntaxTree &root)
 
   std::vector<std::size_t> key;
   key.reserve(children.size()+1);
-  key.push_back(non_term_vocab_.Lookup(root.value().GetLabel()));
+  key.push_back(non_term_vocab_.Lookup(root.value().label));
 
   for (std::vector<SyntaxTree *>::const_iterator p(children.begin());
        p != children.end(); ++p) {
     SyntaxTree *child = *p;
     assert(!child->IsLeaf());
-    key.push_back(non_term_vocab_.Lookup(child->value().GetLabel()));
+    key.push_back(non_term_vocab_.Lookup(child->value().label));
     if (!CalcScores(*child)) {
       return false;
     }
diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp
index 5bca886bf..4b5c2d573 100644
--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@@ -118,9 +118,9 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words )
   // output tree nodes
   vector< SyntaxNode* > nodes = tree.GetAllNodes();
   for( size_t i=0; i<nodes.size(); i++ ) {
-    cout << " <tree span=\"" << nodes[i]->GetStart()
-         << "-" << nodes[i]->GetEnd()
-         << "\" label=\"" << nodes[i]->GetLabel()
+    cout << " <tree span=\"" << nodes[i]->start
+         << "-" << nodes[i]->end
+         << "\" label=\"" << nodes[i]->label
          << "\"/>";
   }
   cout << endl;
@@ -133,7 +133,7 @@ void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
     if (point.size() > 3) {
       const vector< SyntaxNode* >& topNodes
       = tree.GetNodes( point[0], point[point.size()-1]-1);
-      string topLabel = topNodes[0]->GetLabel();
+      string topLabel = topNodes[0]->label;
 
       for(size_t i=2; i<point.size()-1; i++) {
         // cerr << "LeftBin  " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[0] << "-" << point[i]-1 << " ^" << topLabel << endl;
@@ -151,7 +151,7 @@ void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
       int endPoint = point[point.size()-1]-1;
       const vector< SyntaxNode* >& topNodes
       = tree.GetNodes( point[0], endPoint);
-      string topLabel = topNodes[0]->GetLabel();
+      string topLabel = topNodes[0]->label;
 
       for(size_t i=1; i<point.size()-2; i++) {
         // cerr << "RightBin " << point[0] << "-" << (point[point.size()-1]-1) << ": " << point[i] << "-" << endPoint << " ^" << topLabel << endl;
@@ -178,29 +178,29 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
       // cerr << endl;
 
       for(size_t i = 0; i+2 < point.size(); i++) {
-        // cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i  ],point[i+1]-1)[0]->GetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl;
+        // cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i  ],point[i+1]-1)[0]->label << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->label << endl;
 
         newTree.AddNode( point[i],point[i+2]-1,
-                         tree.GetNodes(point[i  ],point[i+1]-1)[0]->GetLabel()
+                         tree.GetNodes(point[i  ],point[i+1]-1)[0]->label
                          + "+" +
-                         tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() );
+                         tree.GetNodes(point[i+1],point[i+2]-1)[0]->label);
       }
     }
     if (point.size() >= 4) {
       int ps = point.size();
-      string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel();
+      string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->label;
 
-      // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl;
+      // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->label << endl;
       newTree.AddNode( point[1],point[ps-1]-1,
                        topLabel
                        + "\\" +
-                       tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() );
+                       tree.GetNodes(point[0],point[1]-1)[0]->label );
 
-      // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl;
+      // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label << endl;
       newTree.AddNode( point[0],point[ps-2]-1,
                        topLabel
                        + "/" +
-                       tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() );
+                       tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label );
     }
   }
 
@@ -219,12 +219,12 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
 
       for(int mid=start+1; mid<=end && !done; mid++) {
         if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
-          // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid,  end  )[0]->GetLabel() << endl;
+          // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->label << "++" << tree.GetNodes(mid,  end  )[0]->label << endl;
 
           newTree.AddNode( start, end,
-                           tree.GetNodes(start,mid-1)[0]->GetLabel()
+                           tree.GetNodes(start,mid-1)[0]->label
                            + "++" +
-                           tree.GetNodes(mid,  end  )[0]->GetLabel() );
+                           tree.GetNodes(mid,  end  )[0]->label );
           done = true;
         }
       }
@@ -234,9 +234,9 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
       for(int postEnd=end+1; postEnd<numWords && !done; postEnd++) {
         if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
           newTree.AddNode( start, end,
-                           tree.GetNodes(start,postEnd)[0]->GetLabel()
+                           tree.GetNodes(start,postEnd)[0]->label
                            + "//" +
-                           tree.GetNodes(end+1,postEnd)[0]->GetLabel() );
+                           tree.GetNodes(end+1,postEnd)[0]->label );
           done = true;
         }
       }
@@ -245,11 +245,11 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
       // if matching a constituent A left-minus constituent B: use A\\B
       for(int preStart=start-1; preStart>=0; preStart--) {
         if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
-          // cerr << "\tadding " << tree.GetNodes(preStart,end    )[0]->GetLabel() << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->GetLabel() << endl;
+          // cerr << "\tadding " << tree.GetNodes(preStart,end    )[0]->label << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->label << endl;
           newTree.AddNode( start, end,
-                           tree.GetNodes(preStart,end    )[0]->GetLabel()
+                           tree.GetNodes(preStart,end    )[0]->label
                            + "\\\\" +
-                           tree.GetNodes(preStart,start-1)[0]->GetLabel() );
+                           tree.GetNodes(preStart,start-1)[0]->label );
           done = true;
         }
       }
@@ -268,6 +268,6 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
   // adding all new nodes
   vector< SyntaxNode* > nodes = newTree.GetAllNodes();
   for( size_t i=0; i<nodes.size(); i++ ) {
-    tree.AddNode( nodes[i]->GetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel());
+    tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label);
   }
 }
diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc
index 34f566a03..8bd511522 100644
--- a/phrase-extract/syntax-common/xml_tree_parser.cc
+++ b/phrase-extract/syntax-common/xml_tree_parser.cc
@@ -47,15 +47,15 @@ void XmlTreeParser::AttachWords(const std::vector<std::string> &words,
   for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end();
        ++p) {
     SyntaxTree *leaf = *p;
-    const int start = leaf->value().GetStart();
-    const int end = leaf->value().GetEnd();
+    const int start = leaf->value().start;
+    const int end = leaf->value().end;
     if (start != end) {
       std::ostringstream msg;
       msg << "leaf node covers multiple words (" << start << "-" << end
           << "): this is currently unsupported";
       throw Exception(msg.str());
     }
-    SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++));
+    SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q++, start, end));
     leaf->children().push_back(newLeaf);
     newLeaf->parent() = leaf;
   }
diff --git a/phrase-extract/syntax-common/xml_tree_writer.cc b/phrase-extract/syntax-common/xml_tree_writer.cc
index 3c16cb2eb..d17937fa8 100644
--- a/phrase-extract/syntax-common/xml_tree_writer.cc
+++ b/phrase-extract/syntax-common/xml_tree_writer.cc
@@ -16,7 +16,7 @@ void XmlTreeWriter::Write(const SyntaxTree &tree) const {
   assert(!tree.IsLeaf());
 
   // Opening tag
-  out_ << "<tree label=\"" << Escape(tree.value().GetLabel()) << "\"";
+  out_ << "<tree label=\"" << Escape(tree.value().label) << "\"";
   for (SyntaxNode::AttributeMap::const_iterator
        p = tree.value().attributes.begin();
        p != tree.value().attributes.end(); ++p) {
@@ -31,7 +31,7 @@ void XmlTreeWriter::Write(const SyntaxTree &tree) const {
        p != tree.children().end(); ++p) {
     SyntaxTree &child = **p;
     if (child.IsLeaf()) {
-      out_ << " " << Escape(child.value().GetLabel());
+      out_ << " " << Escape(child.value().label);
     } else {
       out_ << " ";
       Write(child);

From 9097fd8965e039f9c5c889d76a614dd4eda19651 Mon Sep 17 00:00:00 2001
From: Phil Williams <philip.williams@mac.com>
Date: Wed, 3 Jun 2015 14:09:49 +0100
Subject: [PATCH 5/6] Ongoing moses/phrase-extract refactoring

---
 phrase-extract/SyntaxNodeCollection.cpp | 20 ++++++-----
 phrase-extract/SyntaxNodeCollection.h   | 44 ++++++++++++++++---------
 2 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp
index 7421cc0ed..356c49bf4 100644
--- a/phrase-extract/SyntaxNodeCollection.cpp
+++ b/phrase-extract/SyntaxNodeCollection.cpp
@@ -47,7 +47,7 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
   SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
   m_nodes.push_back( newNode );
   m_index[ startPos ][ endPos ].push_back( newNode );
-  m_size = std::max(endPos+1, m_size);
+  m_numWords = std::max(endPos+1, m_numWords);
   return newNode;
 }
 
@@ -56,8 +56,8 @@ ParentNodes SyntaxNodeCollection::Parse()
   ParentNodes parents;
 
   // looping through all spans of size >= 2
-  for( int length=2; length<=m_size; length++ ) {
-    for( int startPos = 0; startPos <= m_size-length; startPos++ ) {
+  for( int length=2; length<=m_numWords; length++ ) {
+    for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) {
       if (HasNode( startPos, startPos+length-1 )) {
         // processing one (parent) span
 
@@ -96,13 +96,14 @@ bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
   return GetNodes( startPos, endPos).size() > 0;
 }
 
-const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const
+const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
+    int startPos, int endPos ) const
 {
-  SyntaxTreeIndexIterator startIndex = m_index.find( startPos );
+  NodeIndex::const_iterator startIndex = m_index.find( startPos );
   if (startIndex == m_index.end() )
     return m_emptyNode;
 
-  SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos );
+  InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos );
   if (endIndex == startIndex->second.end())
     return m_emptyNode;
 
@@ -120,14 +121,15 @@ std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
   }
 
   // Connect the SyntaxTrees.
-  typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
+  typedef NodeIndex::const_iterator OuterIterator;
+  typedef InnerNodeIndex::const_reverse_iterator InnerIterator;
 
   SyntaxTree *root = 0;
   SyntaxNode *prevNode = 0;
   SyntaxTree *prevTree = 0;
   // Iterate over all start indices from lowest to highest.
-  for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
-    const SyntaxTreeIndex2 &inner = p->second;
+  for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) {
+    const InnerNodeIndex &inner = p->second;
     // Iterate over all end indices from highest to lowest.
     for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
       const std::vector<SyntaxNode*> &nodes = q->second;
diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h
index c8ca67d3d..060192980 100644
--- a/phrase-extract/SyntaxNodeCollection.h
+++ b/phrase-extract/SyntaxNodeCollection.h
@@ -34,38 +34,50 @@ namespace MosesTraining
 typedef std::vector< int > SplitPoints;
 typedef std::vector< SplitPoints > ParentNodes;
 
+/** A collection of SyntaxNodes organized by start and end position.
+ *
+ */
 class SyntaxNodeCollection
 {
-protected:
-  std::vector< SyntaxNode* > m_nodes;
-
-  typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2;
-  typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
-  typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
-  typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
-  SyntaxTreeIndex m_index;
-  int m_size;
-  std::vector< SyntaxNode* > m_emptyNode;
-
 public:
-  SyntaxNodeCollection() : m_size(0) {}
+  SyntaxNodeCollection() : m_numWords(0) {}
 
   ~SyntaxNodeCollection();
 
+  //! Construct and insert a new SyntaxNode.
   SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
 
+  // TODO Rename (and move?)
   ParentNodes Parse();
+
+  //! Return true iff there are one or more SyntaxNodes with the given span.
   bool HasNode( int startPos, int endPos ) const;
+
+  //! Lookup the SyntaxNodes for a given span.
   const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
-  const std::vector< SyntaxNode* >& GetAllNodes() {
-    return m_nodes;
-  };
+
+  //! Get a vector of pointers to all SyntaxNodes (unordered).
+  const std::vector< SyntaxNode* >& GetAllNodes() const { return m_nodes; }
+
   size_t GetNumWords() const {
-    return m_size;
+    return m_numWords;
   }
   void Clear();
 
   std::auto_ptr<SyntaxTree> ExtractTree();
+
+private:
+  typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex;
+  typedef std::map< int, InnerNodeIndex > NodeIndex;
+
+  // Not copyable.
+  SyntaxNodeCollection(const SyntaxNodeCollection &);
+  SyntaxNodeCollection &operator=(const SyntaxNodeCollection &);
+
+  std::vector< SyntaxNode* > m_nodes;
+  NodeIndex m_index;
+  int m_numWords;
+  std::vector< SyntaxNode* > m_emptyNode;
 };
 
 }  // namespace MosesTraining

From 8653bd81590d1f9f658d9560458dc72d9556e197 Mon Sep 17 00:00:00 2001
From: Phil Williams <philip.williams@mac.com>
Date: Wed, 3 Jun 2015 14:20:00 +0100
Subject: [PATCH 6/6] Ongoing moses/phrase-extract refactoring

---
 phrase-extract/SyntaxNodeCollection.cpp | 40 ----------------------
 phrase-extract/SyntaxNodeCollection.h   |  6 ----
 phrase-extract/relax-parse-main.cpp     | 44 ++++++++++++++++++++++++-
 phrase-extract/relax-parse.h            | 10 ++++--
 4 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp
index 356c49bf4..0a344fcd7 100644
--- a/phrase-extract/SyntaxNodeCollection.cpp
+++ b/phrase-extract/SyntaxNodeCollection.cpp
@@ -51,46 +51,6 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
   return newNode;
 }
 
-ParentNodes SyntaxNodeCollection::Parse()
-{
-  ParentNodes parents;
-
-  // looping through all spans of size >= 2
-  for( int length=2; length<=m_numWords; length++ ) {
-    for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) {
-      if (HasNode( startPos, startPos+length-1 )) {
-        // processing one (parent) span
-
-        //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
-        SplitPoints splitPoints;
-        splitPoints.push_back( startPos );
-        //std::cerr << " " << startPos;
-
-        int first = 1;
-        int covered = 0;
-        int found_somehing = 1; // break loop if nothing found
-        while( covered < length && found_somehing ) {
-          // find largest covering subspan (child)
-          // starting at last covered position
-          found_somehing = 0;
-          for( int midPos=length-first; midPos>covered; midPos-- ) {
-            if( HasNode( startPos+covered, startPos+midPos-1 ) ) {
-              covered = midPos;
-              splitPoints.push_back( startPos+covered );
-              // std::cerr << " " << ( startPos+covered );
-              first = 0;
-              found_somehing = 1;
-            }
-          }
-        }
-        // std::cerr << std::endl;
-        parents.push_back( splitPoints );
-      }
-    }
-  }
-  return parents;
-}
-
 bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
 {
   return GetNodes( startPos, endPos).size() > 0;
diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h
index 060192980..8de151c55 100644
--- a/phrase-extract/SyntaxNodeCollection.h
+++ b/phrase-extract/SyntaxNodeCollection.h
@@ -31,9 +31,6 @@
 namespace MosesTraining
 {
 
-typedef std::vector< int > SplitPoints;
-typedef std::vector< SplitPoints > ParentNodes;
-
 /** A collection of SyntaxNodes organized by start and end position.
  *
  */
@@ -47,9 +44,6 @@ public:
   //! Construct and insert a new SyntaxNode.
   SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
 
-  // TODO Rename (and move?)
-  ParentNodes Parse();
-
   //! Return true iff there are one or more SyntaxNodes with the given span.
   bool HasNode( int startPos, int endPos ) const;
 
diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp
index 4b5c2d573..f7a2a271b 100644
--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@@ -50,7 +50,7 @@ int main(int argc, char* argv[])
     // output tree
     // cerr << "BEFORE:" << endl << tree;
 
-    ParentNodes parents = tree.Parse();
+    ParentNodes parents = determineSplitPoints(tree);
 
     // execute selected grammar relaxation schemes
     if (leftBinarizeFlag)
@@ -271,3 +271,45 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
     tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label);
   }
 }
+
+// For each span of >= 2 words that has a node, record the span start followed
+// by the position just past each child found by a greedy left-to-right search.
+ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl)
+{
+  ParentNodes parents;
+
+  // Use int throughout: the counters below are int, so keeping this as
+  // size_t would mix signed and unsigned in the comparisons/subtraction.
+  const int numWords = static_cast<int>(nodeColl.GetNumWords());
+
+  // looping through all spans of size >= 2
+  for( int length=2; length<=numWords; length++ ) {
+    for( int startPos = 0; startPos <= numWords-length; startPos++ ) {
+      if (nodeColl.HasNode( startPos, startPos+length-1 )) {
+        // processing one (parent) span
+        SplitPoints splitPoints;
+        splitPoints.push_back( startPos );
+
+        // first==1 keeps the initial candidate child shorter than the parent
+        int first = 1;
+        int covered = 0;
+        bool foundSomething = true; // break loop if nothing found
+        while( covered < length && foundSomething ) {
+          // find largest covering subspan (child)
+          // starting at last covered position
+          foundSomething = false;
+          for( int midPos=length-first; midPos>covered; midPos-- ) {
+            if( nodeColl.HasNode( startPos+covered, startPos+midPos-1 ) ) {
+              covered = midPos;
+              splitPoints.push_back( startPos+covered );
+              first = 0;
+              foundSomething = true;
+            }
+          }
+        }
+        parents.push_back( splitPoints );
+      }
+    }
+  }
+  return parents;
+}
diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h
index a00aa6deb..7c412646a 100644
--- a/phrase-extract/relax-parse.h
+++ b/phrase-extract/relax-parse.h
@@ -37,10 +37,14 @@ bool leftBinarizeFlag = false;
 bool rightBinarizeFlag = false;
 char SAMTLevel = 0;
 
+typedef std::vector< int > SplitPoints;
+typedef std::vector< SplitPoints > ParentNodes;
+
 // functions
 void init(int argc, char* argv[]);
+ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &);
 void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector<std::string> &words );
-void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
-void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
-void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
+void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
+void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
+void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );