moses/phrase-extract refactoring

Final commit in this round of refactoring (which started with commit
2f735998...).  The main changes are:

  - a general storage mechanism for attribute/value pairs in XML-style
    tree / lattice input.  E.g. the "pcfg-score" and "semantic-role"
    attributes in:

     <tree label="PRP" pcfg-score="1.0" semantic-role="AGENT"> I </tree>

  - consolidation of the various near-duplicate Tree / XmlTreeParser classes
    that have accumulated over the years (my fault)

  - miscellaneous de-crufting
This commit is contained in:
Phil Williams 2015-06-09 16:50:27 +01:00
parent b76194a16b
commit fa51da28c5
4 changed files with 21 additions and 19 deletions

View File

@ -25,6 +25,9 @@
namespace MosesTraining
{
/*! A node in a syntactic structure (tree, lattice, etc.). SyntaxNodes have a
* label and a span plus an arbitrary set of name/value attributes.
*/
struct SyntaxNode {
typedef std::map<std::string, std::string> AttributeMap;

View File

@ -55,11 +55,13 @@ public:
return m_nodes;
};
size_t GetNumWords() const {
return m_numWords;
}
//! Get the number of words (defined as 1 + the max end pos of any node).
std::size_t GetNumWords() const { return m_numWords; }
//! Clear the container (this deletes the SyntaxNodes).
void Clear();
//! Extract a SyntaxTree (assuming the collection's nodes constitute a tree).
std::auto_ptr<SyntaxTree> ExtractTree();
private:

View File

@ -80,7 +80,6 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName)
return tag.substr(contentsStart,contentsEnd-contentsStart);
}
// TODO Special handling of "label" attribute
// s should be a sequence of name=attribute pairs separated by whitespace.
// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
void ParseXmlTagAttributes(const std::string &s,
@ -107,8 +106,9 @@ void ParseXmlTagAttributes(const std::string &s,
throw XmlException("invalid tag content");
}
}
// TODO unescape \"
attributes[name] = s.substr(begin+1, pos-begin-1);
if (name != "label" && name != "span") {
attributes[name] = s.substr(begin+1, pos-begin-1);
}
begin = pos+1;
}
}
@ -245,20 +245,17 @@ vector<string> TokenizeXml(const string& str)
}
/**
* Process a sentence with xml annotation
* Xml tags may specify additional/replacing translation options
* and reordering constraints
* Process a sentence with XML-style annotation of syntactic nodes.
*
* \param line in: sentence, out: sentence without the xml
* \param res vector with translation options specified by xml
* \param reorderingConstraint reordering constraint zones specified by xml
* \param walls reordering constraint walls specified by xml
* \param[in,out] line in: sentence, out: sentence without the XML
* \param[out] nodeCollection the collection of SyntaxNode objects for this
* sentence
* \param[out] labelCollection label values are inserted into this set
* \param[out] topLabelCollection top labels (key) and their counts (value)
* are inserted into this map
* \param unescapeSpecialChars flag indicating whether XML special characters
* should be unescaped
*/
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
is so we can link things up afterwards. We can't create TranslationOptions as we
parse because we don't have the completed source parsed until after this function
removes all the markup from it (CreateFromString in Sentence::Read).
*/
bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
set< string > &labelCollection,
map< string, int > &topLabelCollection,

View File

@ -16,7 +16,7 @@ namespace Syntax {
* converts them to SyntaxTree objects.
*
* This is a thin wrapper around the ProcessAndStripXMLTags function. After
* calling Parse(), the output of the ProcessAndStripXMLTags function (the
* calling Parse(), the outputs from the ProcessAndStripXMLTags call (the
* sentence, node collection, label set, and top label set) are available via
* accessors.
*/