mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-05 15:58:03 +03:00
moses/phrase-extract refactoring
Final commit in this round of refactoring (which started with commit 2f735998...). The main changes are: - a general storage mechanism for attribute/value pairs in XML-style tree / lattice input. E.g. the "pcfg-score" and "semantic-role" attributes in: <tree label="PRP" pcfg-score="1.0" semantic-role="AGENT"> I </tree> - consolidation of the various near-duplicate Tree / XmlTreeParser classes that have accumulated over the years (my fault) - miscellaneous de-crufting
This commit is contained in:
parent
b76194a16b
commit
fa51da28c5
@ -25,6 +25,9 @@
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
/*! A node in a syntactic structure (tree, lattice, etc.). SyntaxNodes have a
|
||||
* label and a span plus an arbitrary set of name/value attributes.
|
||||
*/
|
||||
struct SyntaxNode {
|
||||
typedef std::map<std::string, std::string> AttributeMap;
|
||||
|
||||
|
@ -55,11 +55,13 @@ public:
|
||||
return m_nodes;
|
||||
};
|
||||
|
||||
size_t GetNumWords() const {
|
||||
return m_numWords;
|
||||
}
|
||||
//! Get the number of words (defined as 1 + the max end pos of any node).
|
||||
std::size_t GetNumWords() const { return m_numWords; }
|
||||
|
||||
//! Clear the container (this deletes the SyntaxNodes).
|
||||
void Clear();
|
||||
|
||||
//! Extract a SyntaxTree (assuming the collection's nodes constitute a tree).
|
||||
std::auto_ptr<SyntaxTree> ExtractTree();
|
||||
|
||||
private:
|
||||
|
@ -80,7 +80,6 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName)
|
||||
return tag.substr(contentsStart,contentsEnd-contentsStart);
|
||||
}
|
||||
|
||||
// TODO Special handling of "label" attribute
|
||||
// s should be a sequence of name="value" pairs separated by whitespace.
|
||||
// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
|
||||
void ParseXmlTagAttributes(const std::string &s,
|
||||
@ -107,8 +106,9 @@ void ParseXmlTagAttributes(const std::string &s,
|
||||
throw XmlException("invalid tag content");
|
||||
}
|
||||
}
|
||||
// TODO unescape \"
|
||||
attributes[name] = s.substr(begin+1, pos-begin-1);
|
||||
if (name != "label" && name != "span") {
|
||||
attributes[name] = s.substr(begin+1, pos-begin-1);
|
||||
}
|
||||
begin = pos+1;
|
||||
}
|
||||
}
|
||||
@ -245,20 +245,17 @@ vector<string> TokenizeXml(const string& str)
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a sentence with xml annotation
|
||||
* Xml tags may specify additional/replacing translation options
|
||||
* and reordering constraints
|
||||
* Process a sentence with XML-style annotation of syntactic nodes.
|
||||
*
|
||||
* \param line in: sentence, out: sentence without the xml
|
||||
* \param res vector with translation options specified by xml
|
||||
* \param reorderingConstraint reordering constraint zones specified by xml
|
||||
* \param walls reordering constraint walls specified by xml
|
||||
* \param[in,out] line in: sentence, out: sentence without the XML
|
||||
* \param[out] nodeCollection the collection of SyntaxNode objects for this
|
||||
* sentence
|
||||
* \param[out] labelCollection label values are inserted into this set
|
||||
* \param[out] topLabelCollection top labels (key) and their counts (value)
|
||||
* are inserted into this map
|
||||
* \param unescapeSpecialChars flag indicating whether XML special characters
|
||||
* should be unescaped
|
||||
*/
|
||||
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
|
||||
is so we can link things up afterwards. We can't create TranslationOptions as we
|
||||
parse because we don't have the completed source parsed until after this function
|
||||
removes all the markup from it (CreateFromString in Sentence::Read).
|
||||
*/
|
||||
bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
|
||||
set< string > &labelCollection,
|
||||
map< string, int > &topLabelCollection,
|
||||
|
@ -16,7 +16,7 @@ namespace Syntax {
|
||||
* converts them to SyntaxTree objects.
|
||||
*
|
||||
* This is a thin wrapper around the ProcessAndStripXMLTags function. After
|
||||
* calling Parse(), the output of the ProcessAndStripXMLTags function (the
|
||||
* calling Parse(), the output from the ProcessAndStripXMLTags call (the
|
||||
* sentence, node collection, label set, and top label set) are available via
|
||||
* accessors.
|
||||
*/
|
||||
|
Loading…
Reference in New Issue
Block a user