submitted working scripts for hierarchical training (msd)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/hierarchical-reo@2796 1f5c12ca-751b-0410-a591-d2e778427230
2024-12-29 06:52:34 +03:00 · 2010-01-29 22:38:18 +00:00 · 2010-01-29 22:38:18 +00:00 · bf70dd4767
commit bf70dd4767
parent ad3b0760b2
3 changed files with 2070 additions and 108 deletions
--- a/scripts/training/phrase-extract/extract.cpp
+++ b/scripts/training/phrase-extract/extract.cpp
@ -10,9 +10,7 @@
 #include <time.h>
 #include <cstring>

-#include <map>
-#include <set>
-#include <vector>
+#include "hierarchical.h"

 using namespace std;

@ -28,20 +26,6 @@ using namespace std;
              }
 #define LINE_MAX_LENGTH 60000

-// HPhraseVertex represents a point in the alignment matrix
-typedef pair <int, int> HPhraseVertex;
-
-// Phrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix:
-// bottom-left and top-right
-typedef pair<HPhraseVertex, HPhraseVertex> HPhrase;
-
-// HPhraseVector is a vector of HPhrases
-typedef vector < HPhrase > HPhraseVector;
-
-// SentenceVertices represents, from all extracted phrases, all vertices that have the same positioning
-// The key of the map is the English index and the value is a set of the foreign ones
-typedef map <int, set<int> > HSenteceVertices;
-
 class SentenceAlignment {
 	public:
 		vector<string> english;
@ -59,20 +43,11 @@ void addPhrase( SentenceAlignment &, int, int, int, int );
 vector<string> tokenize( char [] );
 bool isAligned ( SentenceAlignment &, int, int );

-// Reordering
+// Hierarchical reordering
 void HRextract( SentenceAlignment & );
 void HRaddPhrase( SentenceAlignment &, int, int, int, int, string &, string & );
 void PBextract( SentenceAlignment & );

-enum REO_MODEL_NAME {REO_HIER, REO_PHRASE, REO_WORD};
-enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO, REO_LR};
-
-bool allModelsOutputFlag = false;
-REO_MODEL_NAME modelName;
-REO_MODEL_TYPE modelType;
-
-map < REO_MODEL_NAME, REO_MODEL_TYPE > selectedModels;
-
 ofstream extractFile;
 ofstream extractFileInv;
 ofstream extractFileOrientation;
@ -80,6 +55,8 @@ int maxPhraseLength;
 int phraseCount = 0;
 char* fileNameExtract;
 bool orientationFlag = false;
+bool HRorientationFlag = false;
+bool PBorientationFlag = false;
 bool onlyOutputSpanInfo = false;
 bool noFileLimit = false;
 bool zipFiles = false;
@ -91,59 +68,25 @@ int main(int argc, char* argv[])
 			<< "phrase extraction from an aligned parallel corpus\n";
 	time_t starttime = time(NULL);

-	if (argc < 6) {
-		cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning | --hierarchical-reo]\n";
-		exit(1);
-	}
-	char* &fileNameE = argv[1];
-	char* &fileNameF = argv[2];
-	char* &fileNameA = argv[3];
-	fileNameExtract = argv[4];
-	maxPhraseLength = atoi(argv[5]);
-
-	for(int i=6;i<argc;i++) {
-		if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
-			onlyOutputSpanInfo = true;
-		}
-		else if (strcmp(argv[i],"--NoFileLimit") == 0) {
-			noFileLimit = true;
-		}
-		else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
-			orientationFlag = true;
-			selectedModels.insert(make_pair(REO_WORD, REO_MSD));
-		}
-		else if(strcmp(argv[i],"--model") == 0){
-			char* modelParams = argv[++i];
-			char* modelName = strtok(modelParams, "-");
-			char* modelType = strtok(modelParams, "-");
-
-			REO_MODEL_NAME intModelName;
-			REO_MODEL_TYPE intModelType;
-
-			if(strcmp(modelName, "word") == 0)
-				intModelName = REO_WORD;
-			else if(strcmp(modelName, "phrase") == 0)
-				intModelName = REO_PHRASE;
-			else if(strcmp(modelName, "phrase") == 0)
-				intModelName = REO_HIER;
-			else{
-				cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
-				exit(1);
-			}
-
-			if(strcmp(modelType, "msd") == 0)
-				intModelType = REO_MSD;
-			else if(strcmp(modelType, "mslr") == 0)
-				intModelType = REO_MSLR;
-			else if(strcmp(modelType, "mono") == 0)
-				intModelType = REO_MONO;
-			else if(strcmp(modelType, "leftright") == 0)
-				intModelType = REO_LR;
-			else{
-				cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
-				exit(1);
-			}
-		}
+  if (argc < 6) {
+    cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning | --hierarchical-reo]\n";
+    exit(1);
+  }
+  char* &fileNameE = argv[1];
+  char* &fileNameF = argv[2];
+  char* &fileNameA = argv[3];
+  fileNameExtract = argv[4];
+  maxPhraseLength = atoi(argv[5]);
+  
+  for(int i=6;i<argc;i++) {
+    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
+      onlyOutputSpanInfo = true;
+    }
+    else if (strcmp(argv[i],"--NoFileLimit") == 0) {
+      noFileLimit = true;
+    }
+    else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
+      orientationFlag = true;
    }
    else if (strcmp(argv[i],"--ZipFiles") == 0) {
      zipFiles = true;
@ -151,6 +94,12 @@ int main(int argc, char* argv[])
    else if (strcmp(argv[i],"--ProperConditioning") == 0) {
      properConditioning = true;
    }
+    else if (strcmp(argv[i],"--hierarchical-reo") == 0) {
+    	HRorientationFlag = true;
+    }
+    else if (strcmp(argv[i],"--phrase-based-reo") == 0) {
+    	PBorientationFlag = true;
+    }
    else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
@ -192,7 +141,6 @@ int main(int argc, char* argv[])
    		HRextract(sentence);
    	else if(PBorientationFlag)
    		PBextract(sentence);
-    	else
    		extract(sentence);
      if (properConditioning) extractBase(sentence);
    }
@ -286,7 +234,7 @@ void extract( SentenceAlignment &sentence ) {
 		(endF<countF && 
 		 endF<startF+maxPhraseLength && // within length limit
 		 (endF==maxF || sentence.alignedCountF[endF]==0)); // unaligned
-		endF++)
+		endF++) 
 	      addPhrase(sentence,startE,endE,startF,endF);
      }
    }
@ -369,11 +317,8 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
    // orientation to previous E
    bool connectedLeftTop  = isAligned( sentence, startF-1, startE-1 );
    bool connectedRightTop = isAligned( sentence, endF+1,   startE-1 );
-    if      ( connectedLeftTop && !connectedRightTop)
-    {
-    	if()
+    if      ( connectedLeftTop && !connectedRightTop) 
      extractFileOrientation << "mono";
-    }
    else if (!connectedLeftTop &&  connectedRightTop) 
      extractFileOrientation << "swap";
    else 
--- a/scripts/training/train-factored-phrase-model-hierreo.perl
+++ b/scripts/training/train-factored-phrase-model-hierreo.perl
--- a/scripts/training/train-factored-phrase-model.perl
+++ b/scripts/training/train-factored-phrase-model.perl
@ -1372,8 +1372,8 @@ sub get_reordering {
    }

    my $smooth = $___REORDERING_SMOOTH;
-    my @REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth,$smooth);
-    my @REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth,$smooth);
+    my @REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth,0);
+    my @REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth,0);

    my (%SMOOTH_PREVIOUS,%SMOOTH_FOLLOWING);
    if ($smooth =~ /(.+)u$/) {
@ -1501,19 +1501,19 @@ sub store_reordering_f {
 					   $f_current, 
 					   $mono_previous_f/$total_previous_f,
 					   $swap_previous_f/$total_previous_f,
-					   $left_previous_f+$right_previous_f/$total_previous_f);
+					   ($left_previous_f+$right_previous_f)/$total_previous_f);
 	}
 	elsif ($model->{"orient"} eq "monotonicity") {
 	    printf { $model->{"filehandle"} } ("%s ||| %g %g\n",
 					   $f_current, 
 					   $mono_previous_f/$total_previous_f,
-					   $swap_previous_f+$left_previous_f+$right_previous_f/$total_previous_f);
+					   ($swap_previous_f+$left_previous_f+$right_previous_f)/$total_previous_f);
 	}
 	elsif ($model->{"orient"} eq "leftright") {
 	    printf { $model->{"filehandle"} } ("%s ||| %g %g\n",
 					   $f_current, 
-					   $mono_previous_f+$left_previous_f/$total_previous_f,
-					   $swap_previous_f+$right_previous_f/$total_previous_f);
+					   ($mono_previous_f+$left_previous_f)/$total_previous_f,
+					   ($swap_previous_f+$right_previous_f)/$total_previous_f);
 	}
    }
 }
@ -1525,14 +1525,14 @@ sub store_reordering_fe {
    foreach my $model (@REORDERING_MODELS) {
 	next if ($model->{"lang"} ne "fe");
 	if ($model->{"orient"} eq "mslr") {
-	    printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g %g",
+	    printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g %g ",
 					   $f_current, $e_current, 
 					   $mono_previous_fe/$total_previous_fe,
 					   $swap_previous_fe/$total_previous_fe,
 					   $left_previous_fe/$total_previous_fe,
 					   $right_previous_fe/$total_previous_fe);
 	    if ($model->{"dir"} eq "bidirectional") {
-		printf { $model->{"filehandle"} } (" ||| %g %g %g %g",
+		printf { $model->{"filehandle"} } ("%g %g %g %g",
 					       $mono_following_fe/$total_following_fe,
 					       $swap_following_fe/$total_following_fe,
 					       $left_following_fe/$total_following_fe,
@ -1541,40 +1541,40 @@ sub store_reordering_fe {
 	    printf { $model->{"filehandle"} } ("\n");
 	}
 	elsif ($model->{"orient"} eq "msd") {
-	    printf { $model->{"filehandle"} } ("%s |||  %s ||| %g %g %g",
+	    printf { $model->{"filehandle"} } ("%s |||  %s ||| %g %g %g ",
 					   $f_current, $e_current,  
 					   $mono_previous_fe/$total_previous_fe,
 					   $swap_previous_fe/$total_previous_fe,
-					   $left_previous_fe+$right_previous_fe/$total_previous_fe);
+					   ($left_previous_fe+$right_previous_fe)/$total_previous_fe);
 	    if ($model->{"dir"} eq "bidirectional") {
-		printf { $model->{"filehandle"} } (" ||| %g %g %g",
+		printf { $model->{"filehandle"} } ("%g %g %g",
 					       $mono_following_fe/$total_following_fe,
 					       $swap_following_fe/$total_following_fe,
-					       $left_following_fe+$right_following_fe/$total_following_fe);
+					       ($left_following_fe+$right_following_fe)/$total_following_fe);
 	    }
 	    printf { $model->{"filehandle"} } ("\n");
 	}
 	elsif ($model->{"orient"} eq "monotonicity") {
-	    printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g",
+	    printf { $model->{"filehandle"} } ("%s %s ||| %g %g ",
 					   $f_current,  $e_current, 
 					   $mono_previous_fe/$total_previous_fe,
-					   $swap_previous_fe+$left_previous_fe+$right_previous_fe/$total_previous_fe);
+					   ($swap_previous_fe+$left_previous_fe+$right_previous_fe)/$total_previous_fe);
 	    if ($model->{"dir"} eq "bidirectional") {
-		printf { $model->{"filehandle"} } (" ||| %g %g",
+		printf { $model->{"filehandle"} } ("%g %g",
 					       $mono_following_fe/$total_following_fe,
-					       $swap_following_fe+$left_following_fe+$right_following_fe/$total_following_fe);
+					       ($swap_following_fe+$left_following_fe+$right_following_fe)/$total_following_fe);
 	    }
 	    printf { $model->{"filehandle"} } ("\n");	
 	}
 	elsif ($model->{"orient"} eq "leftright") {
-	    printf { $model->{"filehandle"} } ("%s |||  %s ||| %g %g",
+	    printf { $model->{"filehandle"} } ("%s |||  %s ||| %g %g ",
 					   $f_current, $e_current, 
-					   $mono_previous_fe+$left_previous_fe/$total_previous_fe,
-					   $swap_previous_fe+$right_previous_fe/$total_previous_fe);
+					   ($mono_previous_fe+$left_previous_fe)/$total_previous_fe,
+					   ($swap_previous_fe+$right_previous_fe)/$total_previous_fe);
 	    if ($model->{"dir"} eq "bidirectional") {
-		printf { $model->{"filehandle"} } (" ||| %g %g",
-					       $mono_following_fe+$left_following_fe/$total_following_fe,
-					       $swap_following_fe+$right_following_fe/$total_following_fe);
+		printf { $model->{"filehandle"} } ("%g %g",
+					       ($mono_following_fe+$left_following_fe)/$total_following_fe,
+					       ($swap_following_fe+$right_following_fe)/$total_following_fe);
 	    }
 	    printf { $model->{"filehandle"} } ("\n");
 	}