Ongoing moses/phrase-extract refactoring

2024-12-25 12:52:29 +03:00 · 2015-06-03 14:20:00 +01:00 · 2015-06-03 14:20:00 +01:00 · 8653bd8159
commit 8653bd8159
parent 9097fd8965
4 changed files with 50 additions and 50 deletions
--- a/phrase-extract/SyntaxNodeCollection.cpp
+++ b/phrase-extract/SyntaxNodeCollection.cpp
@@ -51,46 +51,6 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
  return newNode;
 }

-ParentNodes SyntaxNodeCollection::Parse()
-{
-  ParentNodes parents;
-
-  // looping through all spans of size >= 2
-  for( int length=2; length<=m_numWords; length++ ) {
-    for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) {
-      if (HasNode( startPos, startPos+length-1 )) {
-        // processing one (parent) span
-
-        //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
-        SplitPoints splitPoints;
-        splitPoints.push_back( startPos );
-        //std::cerr << " " << startPos;
-
-        int first = 1;
-        int covered = 0;
-        int found_somehing = 1; // break loop if nothing found
-        while( covered < length && found_somehing ) {
-          // find largest covering subspan (child)
-          // starting at last covered position
-          found_somehing = 0;
-          for( int midPos=length-first; midPos>covered; midPos-- ) {
-            if( HasNode( startPos+covered, startPos+midPos-1 ) ) {
-              covered = midPos;
-              splitPoints.push_back( startPos+covered );
-              // std::cerr << " " << ( startPos+covered );
-              first = 0;
-              found_somehing = 1;
-            }
-          }
-        }
-        // std::cerr << std::endl;
-        parents.push_back( splitPoints );
-      }
-    }
-  }
-  return parents;
-}
-
 bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
 {
  return GetNodes( startPos, endPos).size() > 0;
--- a/phrase-extract/SyntaxNodeCollection.h
+++ b/phrase-extract/SyntaxNodeCollection.h
@@ -31,9 +31,6 @@
 namespace MosesTraining
 {

-typedef std::vector< int > SplitPoints;
-typedef std::vector< SplitPoints > ParentNodes;
-
 /** A collection of SyntaxNodes organized by start and end position.
 *
 */
@@ -47,9 +44,6 @@ public:
  //! Construct and insert a new SyntaxNode.
  SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );

-  // TODO Rename (and move?)
-  ParentNodes Parse();
-
  //! Return true iff there are one or more SyntaxNodes with the given span.
  bool HasNode( int startPos, int endPos ) const;

--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@@ -50,7 +50,7 @@ int main(int argc, char* argv[])
    // output tree
    // cerr << "BEFORE:" << endl << tree;

-    ParentNodes parents = tree.Parse();
+    ParentNodes parents = determineSplitPoints(tree);

    // execute selected grammar relaxation schemes
    if (leftBinarizeFlag)
@@ -271,3 +271,45 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
    tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label);
  }
 }
+
+ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl)
+{ // For every span of >= 2 words that has a node, record the boundaries of its child spans.
+  ParentNodes parents; // one SplitPoints list per parent span, in the order the spans are visited
+  // nodeColl is only queried here (GetNumWords, HasNode); the result is returned by value.
+  const std::size_t numWords = nodeColl.GetNumWords();
+  // NOTE(review): numWords is unsigned but compared with int counters; numWords-length cannot wrap since length<=numWords.
+  // looping through all spans of size >= 2
+  for( int length=2; length<=numWords; length++ ) {
+    for( int startPos = 0; startPos <= numWords-length; startPos++ ) {
+      if (nodeColl.HasNode( startPos, startPos+length-1 )) {
+        // processing one (parent) span
+        // splitPoints begins with the parent's start; the end of each child found below is appended after it
+        //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":";
+        SplitPoints splitPoints;
+        splitPoints.push_back( startPos );
+        //std::cerr << " " << startPos;
+        // first stays 1 until a child is found, so the first search starts one word short of the parent itself
+        int first = 1;
+        int covered = 0; // words of the parent span already covered by children (offset from startPos)
+        int found_somehing = 1; // break loop if nothing found
+        while( covered < length && found_somehing ) {
+          // find largest covering subspan (child)
+          // starting at last covered position
+          found_somehing = 0;
+          for( int midPos=length-first; midPos>covered; midPos-- ) { // longest candidate first
+            if( nodeColl.HasNode( startPos+covered, startPos+midPos-1 ) ) {
+              covered = midPos; // also ends this for loop: after midPos-- the test midPos>covered fails
+              splitPoints.push_back( startPos+covered );
+              // std::cerr << " " << ( startPos+covered );
+              first = 0;
+              found_somehing = 1;
+            }
+          }
+        }
+        // std::cerr << std::endl;
+        parents.push_back( splitPoints ); // stored even when the children did not cover the whole span
+      }
+    }
+  }
+  return parents;
+}
--- a/phrase-extract/relax-parse.h
+++ b/phrase-extract/relax-parse.h
@@ -37,10 +37,14 @@ bool leftBinarizeFlag = false;
 bool rightBinarizeFlag = false;
 char SAMTLevel = 0;

+typedef std::vector< int > SplitPoints;          // as filled by determineSplitPoints: a span's start, then the end boundary of each child found
+typedef std::vector< SplitPoints > ParentNodes;  // as filled by determineSplitPoints: one SplitPoints list per span that has a node
+
 // functions
 void init(int argc, char* argv[]);
+ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &);
 void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector<std::string> &words );
-void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
-void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
-void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents );
+void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
+void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );
+void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents );