From 8653bd81590d1f9f658d9560458dc72d9556e197 Mon Sep 17 00:00:00 2001
From: Phil Williams
Date: Wed, 3 Jun 2015 14:20:00 +0100
Subject: [PATCH] Ongoing moses/phrase-extract refactoring

---
 phrase-extract/SyntaxNodeCollection.cpp | 40 ---------------
 phrase-extract/SyntaxNodeCollection.h   |  6 ---
 phrase-extract/relax-parse-main.cpp     | 65 ++++++++++++++++++++++++-
 phrase-extract/relax-parse.h            | 10 ++--
 4 files changed, 71 insertions(+), 50 deletions(-)

diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp
index 356c49bf4..0a344fcd7 100644
--- a/phrase-extract/SyntaxNodeCollection.cpp
+++ b/phrase-extract/SyntaxNodeCollection.cpp
@@ -51,46 +51,6 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
   return newNode;
 }
 
-ParentNodes SyntaxNodeCollection::Parse()
-{
-  ParentNodes parents;
-
-  // looping through all spans of size >= 2
-  for( int length=2; length<=m_numWords; length++ ) {
-    for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) {
-      if (HasNode( startPos, startPos+length-1 )) {
-        // processing one (parent) span
-
-        //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
-        SplitPoints splitPoints;
-        splitPoints.push_back( startPos );
-        //std::cerr << " " << startPos;
-
-        int first = 1;
-        int covered = 0;
-        int found_somehing = 1; // break loop if nothing found
-        while( covered < length && found_somehing ) {
-          // find largest covering subspan (child)
-          // starting at last covered position
-          found_somehing = 0;
-          for( int midPos=length-first; midPos>covered; midPos-- ) {
-            if( HasNode( startPos+covered, startPos+midPos-1 ) ) {
-              covered = midPos;
-              splitPoints.push_back( startPos+covered );
-              // std::cerr << " " << ( startPos+covered );
-              first = 0;
-              found_somehing = 1;
-            }
-          }
-        }
-        // std::cerr << std::endl;
-        parents.push_back( splitPoints );
-      }
-    }
-  }
-  return parents;
-}
-
 bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
 {
   return GetNodes( startPos, endPos).size() > 0;
diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h
index 060192980..8de151c55 100644
--- a/phrase-extract/SyntaxNodeCollection.h
+++ b/phrase-extract/SyntaxNodeCollection.h
@@ -31,9 +31,6 @@
 
 namespace MosesTraining
 {
-typedef std::vector< int > SplitPoints;
-typedef std::vector< SplitPoints > ParentNodes;
-
 /** A collection of SyntaxNodes organized by start and end position.
  *
  */
@@ -47,9 +44,6 @@ public:
   //! Construct and insert a new SyntaxNode.
   SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
 
-  // TODO Rename (and move?)
-  ParentNodes Parse();
-
   //! Return true iff there are one or more SyntaxNodes with the given span.
   bool HasNode( int startPos, int endPos ) const;
 
diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp
index 4b5c2d573..f7a2a271b 100644
--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@@ -50,7 +50,7 @@ int main(int argc, char* argv[])
     // output tree
     // cerr << "BEFORE:" << endl << tree;
 
-    ParentNodes parents = tree.Parse();
+    ParentNodes parents = determineSplitPoints(tree);
 
     // execute selected grammar relaxation schemes
     if (leftBinarizeFlag)
@@ -271,3 +271,66 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
     tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label);
   }
 }
+
+/**
+ * Determine how each syntactic span of two or more words splits into
+ * child spans.
+ *
+ * For every span that has at least one node, the result holds one
+ * SplitPoints vector: the span's start position followed by the position
+ * just past each child found.  Children are chosen greedily, taking the
+ * longest subspan with a node that begins where the previous child ended;
+ * the first child must be shorter than the parent itself.  The search
+ * stops once no further child exists, so the final split point need not
+ * reach the end of the parent span.
+ *
+ * \param nodeColl collection queried through GetNumWords() and HasNode()
+ * \return one SplitPoints vector per parent span, ordered by increasing
+ *         span length and, within a length, by increasing start position
+ */
+ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl)
+{
+  ParentNodes parents;
+
+  // Keep the word count as an int so it compares cleanly with the int loop
+  // counters below instead of forcing mixed signed/unsigned comparisons.
+  const int numWords = static_cast<int>(nodeColl.GetNumWords());
+
+  // looping through all spans of size >= 2
+  for( int length=2; length<=numWords; length++ ) {
+    for( int startPos = 0; startPos <= numWords-length; startPos++ ) {
+      if (nodeColl.HasNode( startPos, startPos+length-1 )) {
+        // processing one (parent) span
+
+        //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
+        SplitPoints splitPoints;
+        splitPoints.push_back( startPos );
+        //std::cerr << " " << startPos;
+
+        // 1 until the first child is found, so that the parent span is
+        // never picked as its own child
+        int first = 1;
+        int covered = 0; // words of the parent span covered so far
+        bool foundSomething = true; // break loop if nothing found
+        while( covered < length && foundSomething ) {
+          // find largest covering subspan (child)
+          // starting at last covered position
+          foundSomething = false;
+          for( int midPos=length-first; midPos>covered; midPos-- ) {
+            if( nodeColl.HasNode( startPos+covered, startPos+midPos-1 ) ) {
+              covered = midPos;
+              splitPoints.push_back( startPos+covered );
+              // std::cerr << " " << ( startPos+covered );
+              first = 0;
+              foundSomething = true;
+              break; // longest child from this position found
+            }
+          }
+        }
+        // std::cerr << std::endl;
+        parents.push_back( splitPoints );
+      }
+    }
+  }
+  return parents;
+}
diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h
index a00aa6deb..7c412646a 100644
--- a/phrase-extract/relax-parse.h
+++ b/phrase-extract/relax-parse.h
@@ -37,10 +37,14 @@ bool leftBinarizeFlag = false;
 bool rightBinarizeFlag = false;
 char SAMTLevel = 0;
 
+typedef std::vector< int > SplitPoints;
+typedef std::vector< SplitPoints > ParentNodes;
+
 // functions
 void init(int argc, char* argv[]);
+ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &);
 void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector<std::string> &words );
-void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
-void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
-void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
+void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
+void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
+void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
 