Rename MosesTraining::SyntaxTree to MosesTraining::SyntaxNodeCollection

This is the first step in a small-scale refactoring effort that will touch a
lot of the syntax-related code in moses/phrase-extract.  The end goals are:

  - a storage mechanism for general attribute/value pairs in XML-style
    tree / lattice input.  E.g. the "pcfg-score" and "semantic-role"
    attributes in:

     <tree label="PRP" pcfg-score="1.0" semantic-role="AGENT"> I </tree>

  - consolidation of the various near-duplicate Tree / XmlTreeParser classes
    that have accumulated over the years (my fault)

  - general de-crufting

This commit is contained in:
Phil Williams 2015-05-29 18:46:02 +01:00
parent 5d8af9c289
commit 2f735998ca
14 changed files with 51 additions and 94 deletions

View File

@ -36,8 +36,8 @@ namespace MosesTraining
class SentenceAlignmentWithSyntax : public SentenceAlignment
{
public:
SyntaxTree targetTree;
SyntaxTree sourceTree;
SyntaxNodeCollection targetTree;
SyntaxNodeCollection sourceTree;
std::set<std::string> & m_targetLabelCollection;
std::set<std::string> & m_sourceLabelCollection;
std::map<std::string, int> & m_targetTopLabelCollection;

View File

@ -1,6 +1,3 @@
// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
@ -29,12 +26,12 @@
namespace MosesTraining
{
SyntaxTree::~SyntaxTree()
SyntaxNodeCollection::~SyntaxNodeCollection()
{
Clear();
}
void SyntaxTree::Clear()
void SyntaxNodeCollection::Clear()
{
m_top = 0;
// loop through all m_nodes, delete them
@ -45,7 +42,7 @@ void SyntaxTree::Clear()
m_index.clear();
}
SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label )
SyntaxNode *SyntaxNodeCollection::AddNode( int startPos, int endPos, std::string label )
{
SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
m_nodes.push_back( newNode );
@ -54,7 +51,7 @@ SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label )
return newNode;
}
ParentNodes SyntaxTree::Parse()
ParentNodes SyntaxNodeCollection::Parse()
{
ParentNodes parents;
@ -94,12 +91,12 @@ ParentNodes SyntaxTree::Parse()
return parents;
}
bool SyntaxTree::HasNode( int startPos, int endPos ) const
bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
{
return GetNodes( startPos, endPos).size() > 0;
}
const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos ) const
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const
{
SyntaxTreeIndexIterator startIndex = m_index.find( startPos );
if (startIndex == m_index.end() )
@ -112,15 +109,7 @@ const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos
return endIndex->second;
}
// for printing out tree
std::string SyntaxTree::ToString() const
{
std::stringstream out;
out << *this;
return out.str();
}
void SyntaxTree::ConnectNodes()
void SyntaxNodeCollection::ConnectNodes()
{
typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
@ -162,27 +151,4 @@ void SyntaxTree::ConnectNodes()
}
}
std::ostream& operator<<(std::ostream& os, const SyntaxTree& t)
{
size_t size = t.m_index.size();
for(size_t length=1; length<=size; length++) {
for(size_t space=0; space<length; space++) {
os << " ";
}
for(size_t start=0; start<=size-length; start++) {
if (t.HasNode( start, start+(length-1) )) {
std::string label = t.GetNodes( start, start+(length-1) )[0]->GetLabel() + "#######";
os << label.substr(0,7) << " ";
} else {
os << "------- ";
}
}
os << std::endl;
}
return os;
}
}

View File

@ -1,6 +1,3 @@
// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2009 University of Edinburgh
@ -20,12 +17,12 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <string>
#include <vector>
#include <map>
#include <sstream>
#include <string>
#include <vector>
namespace MosesTraining
{
@ -79,7 +76,7 @@ public:
typedef std::vector< int > SplitPoints;
typedef std::vector< SplitPoints > ParentNodes;
class SyntaxTree
class SyntaxNodeCollection
{
protected:
std::vector< SyntaxNode* > m_nodes;
@ -93,14 +90,12 @@ protected:
int m_size;
std::vector< SyntaxNode* > m_emptyNode;
friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);
public:
SyntaxTree()
SyntaxNodeCollection()
: m_top(0) // m_top doesn't get set unless ConnectNodes is called.
, m_size(0) {}
~SyntaxTree();
~SyntaxNodeCollection();
SyntaxNode *AddNode( int startPos, int endPos, std::string label );
@ -119,10 +114,6 @@ public:
}
void ConnectNodes();
void Clear();
std::string ToString() const;
};
std::ostream& operator<<(std::ostream&, const SyntaxTree&);
}
} // namespace MosesTraining

View File

@ -1,6 +1,3 @@
// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
@ -228,7 +225,10 @@ vector<string> TokenizeXml(const string& str)
parse because we don't have the completed source parsed until after this function
removes all the markup from it (CreateFromString in Sentence::Read).
*/
bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection, bool unescapeSpecialChars )
bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
set< string > &labelCollection,
map< string, int > &topLabelCollection,
bool unescapeSpecialChars )
{
//parse XML markup in translation line
@ -374,7 +374,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
}
SyntaxNode *node = tree.AddNode( startPos, endPos-1, label );
SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
node->SetPcfgScore(pcfgScore);
}
}
@ -386,7 +386,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
}
// collect top labels
const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 );
const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
SyntaxNode *n = *node;
const string &label = n->GetLabel();

View File

@ -35,7 +35,7 @@ std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r"
std::string TrimXml(const std::string& str);
bool isXmlTag(const std::string& tag);
std::vector<std::string> TokenizeXml(const std::string& str);
bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true);
bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true);
std::string unescape(const std::string &str);

View File

@ -172,7 +172,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
// Parse source tree and construct a SyntaxNodeCollection object.
MosesTraining::SyntaxTree sourceSyntaxTree;
MosesTraining::SyntaxNodeCollection sourceSyntaxTree;
MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL;
if (options.sourceLabels) {
@ -196,7 +196,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
// Read source tokens.
std::vector<std::string> sourceTokens(ReadTokens(sourceLine));
// Construct a source ParseTree object from the SyntaxTree object.
// Construct a source ParseTree object from the SyntaxNodeCollection object.
std::auto_ptr<ParseTree> sourceParseTree;
if (options.sourceLabels) {

View File

@ -31,7 +31,7 @@ namespace GHKM
{
ScfgRule::ScfgRule(const Subgraph &fragment,
const MosesTraining::SyntaxTree *sourceSyntaxTree)
const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree)
: m_graphFragment(fragment)
, m_sourceLHS("X", NonTerminal)
, m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
@ -133,9 +133,9 @@ ScfgRule::ScfgRule(const Subgraph &fragment,
}
}
void ScfgRule::PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree,
const Node *node,
const std::string &nonMatchingLabel)
void ScfgRule::PushSourceLabel(
const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree,
const Node *node, const std::string &nonMatchingLabel)
{
ContiguousSpan span = Closure(node->GetSpan());
if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span?

View File

@ -41,7 +41,7 @@ class ScfgRule : public Rule
{
public:
ScfgRule(const Subgraph &fragment,
const MosesTraining::SyntaxTree *sourceSyntaxTree = 0);
const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree = 0);
const Subgraph &GetGraphFragment() const {
return m_graphFragment;
@ -78,9 +78,9 @@ public:
}
private:
void PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree,
const Node *node,
const std::string &nonMatchingLabel);
void PushSourceLabel(
const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree,
const Node *node, const std::string &nonMatchingLabel);
const Subgraph& m_graphFragment;
Symbol m_sourceLHS;

View File

@ -58,7 +58,7 @@ private:
std::set<std::string> &m_labelSet;
std::map<std::string, int> &m_topLabelSet;
std::string m_line;
MosesTraining::SyntaxTree m_tree;
MosesTraining::SyntaxNodeCollection m_tree;
std::vector<std::string> m_words;
};

View File

@ -47,7 +47,7 @@ class XmlTreeParser {
std::set<std::string> m_labelSet;
std::map<std::string, int> m_topLabelSet;
std::string m_line;
MosesTraining::SyntaxTree m_tree;
MosesTraining::SyntaxNodeCollection m_tree;
std::vector<std::string> m_words;
};

View File

@ -43,7 +43,7 @@ int main(int argc, char* argv[])
// process into syntax tree representation
set< string > labelCollection; // set of labels, not used
map< string, int > topLabelCollection; // count of top labels, not used
SyntaxTree tree;
SyntaxNodeCollection tree;
ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
const vector< string > inWords = util::tokenize( inBufferString );
@ -105,7 +105,7 @@ void init(int argc, char* argv[])
}
}
void store( SyntaxTree &tree, const vector< string > &words )
void store( SyntaxNodeCollection &tree, const vector< string > &words )
{
// output words
for( size_t i=0; i<words.size(); i++ ) {
@ -126,7 +126,7 @@ void store( SyntaxTree &tree, const vector< string > &words )
cout << endl;
}
void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
{
for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {
const SplitPoints &point = *p;
@ -143,7 +143,7 @@ void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
}
}
void RightBinarize( SyntaxTree &tree, ParentNodes &parents )
void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
{
for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {
const SplitPoints &point = *p;
@ -161,11 +161,11 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents )
}
}
void SAMT( SyntaxTree &tree, ParentNodes &parents )
void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
{
int numWords = tree.GetNumWords();
SyntaxTree newTree; // to store new nodes
SyntaxNodeCollection newTree; // to store new nodes
// look through parents to combine children
for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {

View File

@ -39,8 +39,8 @@ char SAMTLevel = 0;
// functions
void init(int argc, char* argv[]);
void store( MosesTraining::SyntaxTree &tree, const std::vector<std::string> &words );
void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector<std::string> &words );
void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );

View File

@ -13,17 +13,17 @@ namespace Syntax {
StringTree *XmlTreeParser::Parse(const std::string &line) {
line_ = line;
tree_.Clear();
node_collection_.Clear();
try {
if (!ProcessAndStripXMLTags(line_, tree_, label_set_, top_label_set_,
false)) {
if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_,
top_label_set_, false)) {
throw Exception("");
}
} catch (const XmlException &e) {
throw Exception(e.getMsg());
}
tree_.ConnectNodes();
SyntaxNode *root = tree_.GetTop();
node_collection_.ConnectNodes();
SyntaxNode *root = node_collection_.GetTop();
assert(root);
words_ = util::tokenize(line_);
return ConvertTree(*root, words_);

View File

@ -26,7 +26,7 @@ class XmlTreeParser {
std::set<std::string> label_set_;
std::map<std::string, int> top_label_set_;
std::string line_;
MosesTraining::SyntaxTree tree_;
MosesTraining::SyntaxNodeCollection node_collection_;
std::vector<std::string> words_;
};