moses/phrase-extract refactoring

Final commit in this round of refactoring (which started with commit 2f735998...). The main changes are:
- a general storage mechanism for attribute/value pairs in XML-style tree / lattice input, e.g. the "pcfg-score" and "semantic-role" attributes in: <tree label="PRP" pcfg-score="1.0" semantic-role="AGENT"> I </tree>
- consolidation of the various near-duplicate Tree / XmlTreeParser classes that have accumulated over the years (my fault)
- miscellaneous de-crufting
2024-10-05 15:58:03 +03:00 · 2015-06-09 16:50:27 +01:00 · 2015-06-09 16:50:27 +01:00 · fa51da28c5
commit fa51da28c5
parent b76194a16b
4 changed files with 21 additions and 19 deletions
--- a/phrase-extract/SyntaxNode.h
+++ b/phrase-extract/SyntaxNode.h
@ -25,6 +25,9 @@
 namespace MosesTraining
 {

+/*! A node in a syntactic structure (tree, lattice, etc.).  SyntaxNodes have a
+ *  label and a span plus an arbitrary set of name/value attributes.
+ */
 struct SyntaxNode {
  typedef std::map<std::string, std::string> AttributeMap;

--- a/phrase-extract/SyntaxNodeCollection.h
+++ b/phrase-extract/SyntaxNodeCollection.h
@ -55,11 +55,13 @@ public:
    return m_nodes;
  };

-  size_t GetNumWords() const {
-    return m_numWords;
-  }
+  //! Get the number of words (defined as 1 + the max end pos of any node).
+  std::size_t GetNumWords() const { return m_numWords; }
+
+  //! Clear the container (this deletes the SyntaxNodes).
  void Clear();

+  //! Extract a SyntaxTree (assuming the collection's nodes constitute a tree).
  std::auto_ptr<SyntaxTree> ExtractTree();

 private:
--- a/phrase-extract/XmlTree.cpp
+++ b/phrase-extract/XmlTree.cpp
@ -80,7 +80,6 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName)
  return tag.substr(contentsStart,contentsEnd-contentsStart);
 }

-// TODO Special handling of "label" attribute
 // s should be a sequence of name=attribute pairs separated by whitespace.
 // e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
 void ParseXmlTagAttributes(const std::string &s,
@ -107,8 +106,9 @@ void ParseXmlTagAttributes(const std::string &s,
        throw XmlException("invalid tag content");
      }
    }
-    // TODO unescape \"
-    attributes[name] = s.substr(begin+1, pos-begin-1);
+    if (name != "label" && name != "span") {
+      attributes[name] = s.substr(begin+1, pos-begin-1);
+    }
    begin = pos+1;
  }
 }
@ -245,20 +245,17 @@ vector<string> TokenizeXml(const string& str)
 }

 /**
- * Process a sentence with xml annotation
- * Xml tags may specifiy additional/replacing translation options
- * and reordering constraints
+ * Process a sentence with XML-style annotation of syntactic nodes.
 *
- * \param line in: sentence, out: sentence without the xml
- * \param res vector with translation options specified by xml
- * \param reorderingConstraint reordering constraint zones specified by xml
- * \param walls reordering constraint walls specified by xml
+ * \param line[in,out]            in: sentence, out: sentence without the XML
+ * \param nodeCollection[out]     the collection of SyntaxNode objects for this
+ *                                sentence
+ * \param labelCollection[out]    label values are inserted into this set
+ * \param topLabelCollection[out] top labels (key) and their counts (value)
+ *                                are inserted into this map
+ * \param unescapeSpecialChars    flag indicating whether XML special characters
+ *                                should be unescaped
 */
-/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
-	is so we can link things up afterwards. We can't create TranslationOptions as we
-	parse because we don't have the completed source parsed until after this function
-	removes all the markup from it (CreateFromString in Sentence::Read).
-*/
 bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
                            set< string > &labelCollection,
                            map< string, int > &topLabelCollection,
--- a/phrase-extract/syntax-common/xml_tree_parser.h
+++ b/phrase-extract/syntax-common/xml_tree_parser.h
@ -16,7 +16,7 @@ namespace Syntax {
 *  converts them to SyntaxTree objects.
 *
 *  This is a thin wrapper around the ProcessAndStripXMLTags function.  After
- *  calling Parse(), the output of the ProcessAndStripXMLTags function (the
+ *  calling Parse(), the output from the ProcessAndStripXMLTags call (the
 *  sentence, node collection, label set, and top label set) are available via
 *  accessors.
 */