Roll back parallel training and zipping. Zipping slows it down. A LOT. Redo more carefully.

2024-08-16 23:10:31 +03:00 · 2012-05-23 13:04:02 +01:00 · 2012-05-23 13:04:02 +01:00 · 89c8d5643d
commit 89c8d5643d
parent 59a2ab1aaa
4 changed files with 78 additions and 116 deletions
--- a/scripts/training/phrase-extract/Jamfile
+++ b/scripts/training/phrase-extract/Jamfile
@ -10,13 +10,13 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
 alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
 alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;

-exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
+exe extract : tables-core.o SentenceAlignment.o extract.cpp InputFileStream ../../..//boost_iostreams ;

 exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../../../moses/src//ThreadPool ../../..//boost_iostreams ;

 exe extract-lex : extract-lex.cpp InputFileStream ;

-exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
+exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp InputFileStream ../../..//boost_iostreams ;

 exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;

--- a/scripts/training/phrase-extract/extract.cpp
+++ b/scripts/training/phrase-extract/extract.cpp
@ -22,7 +22,6 @@
 #include "SentenceAlignment.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
-#include "OutputFileStream.h"

 using namespace std;

@ -83,16 +82,15 @@ bool hierModel = false;
 REO_MODEL_TYPE hierType = REO_MSD;


-Moses::OutputFileStream extractFile;
-Moses::OutputFileStream extractFileInv;
-Moses::OutputFileStream extractFileOrientation;
-Moses::OutputFileStream extractFileSentenceId;
+ofstream extractFile;
+ofstream extractFileInv;
+ofstream extractFileOrientation;
+ofstream extractFileSentenceId;
 int maxPhraseLength;
 bool orientationFlag = false;
 bool translationFlag = true;
 bool sentenceIdFlag = false; //create extract file with sentence id
 bool onlyOutputSpanInfo = false;
-bool gzOutput = false;

 int main(int argc, char* argv[])
 {
@ -118,8 +116,6 @@ int main(int argc, char* argv[])
      translationFlag = false;
    } else if (strcmp(argv[i], "--SentenceId") == 0) {
      sentenceIdFlag = true;  
-    } else if (strcmp(argv[i], "--GZOutput") == 0) {
-      gzOutput = true;  
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
@ -197,18 +193,18 @@ int main(int argc, char* argv[])

  // open output files
  if (translationFlag) {
-    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
-    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
-    extractFileInv.Open(fileNameExtractInv.c_str());
+    string fileNameExtractInv = fileNameExtract + ".inv";
+    extractFile.open(fileNameExtract.c_str());
+    extractFileInv.open(fileNameExtractInv.c_str());
  }
  if (orientationFlag) {
-    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
-    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
+    string fileNameExtractOrientation = fileNameExtract + ".o";
+    extractFileOrientation.open(fileNameExtractOrientation.c_str());
  }

  if (sentenceIdFlag) {
-    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
-    extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
+    string fileNameExtractSentenceId = fileNameExtract + ".sid";
+    extractFileSentenceId.open(fileNameExtractSentenceId.c_str());
  }

  int i=0;
@ -243,12 +239,12 @@ int main(int argc, char* argv[])
  //az: only close if we actually opened it
  if (!onlyOutputSpanInfo) {
    if (translationFlag) {
-      extractFile.Close();
-      extractFileInv.Close();
+      extractFile.close();
+      extractFileInv.close();
    }
-    if (orientationFlag) extractFileOrientation.Close();
+    if (orientationFlag) extractFileOrientation.close();
    if (sentenceIdFlag) {
-      extractFileSentenceId.Close();
+      extractFileSentenceId.close();
    }
  }
 }
--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@ -32,7 +32,6 @@
 #include "PhraseAlignment.h"
 #include "score.h"
 #include "InputFileStream.h"
-#include "OutputFileStream.h"

 using namespace std;

@ -189,10 +188,9 @@ int main(int argc, char* argv[])
 		phraseTableFile = &cout;
 	}
 	else {
-		Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
-    bool success = outputFile->Open(fileNamePhraseTable);
-    		
-    if (!success) {
+		ofstream *outputFile = new ofstream();
+		outputFile->open(fileNamePhraseTable);
+		if (outputFile->fail()) {
 			cerr << "ERROR: could not open file phrase table file "
 					 << fileNamePhraseTable << endl;
 			exit(1);
@ -247,6 +245,7 @@ int main(int argc, char* argv[])
 	
 	phraseTableFile->flush();
 	if (phraseTableFile != &cout) {
+		(dynamic_cast<ofstream*>(phraseTableFile))->close();
 		delete phraseTableFile;
 	}

@ -259,9 +258,9 @@ int main(int argc, char* argv[])
 void writeCountOfCounts( const char* fileNameCountOfCounts )
 {
  // open file
-	Moses::OutputFileStream countOfCountsFile;
-	bool success = countOfCountsFile.Open(fileNameCountOfCounts);
-	if (!success) {
+	ofstream countOfCountsFile;
+	countOfCountsFile.open(fileNameCountOfCounts);
+	if (countOfCountsFile.fail()) {
 		cerr << "ERROR: could not open count-of-counts file "
 				 << fileNameCountOfCounts << endl;
    return;
@ -274,7 +273,7 @@ void writeCountOfCounts( const char* fileNameCountOfCounts )
  for(int i=1; i<=COC_MAX; i++) {
    countOfCountsFile << countOfCounts[ i ] << endl;
  }
-	countOfCountsFile.Close();
+	countOfCountsFile.close();
 }

 void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
--- a/scripts/training/train-model.perl.missing_bin_dir
+++ b/scripts/training/train-model.perl.missing_bin_dir
@ -19,7 +19,7 @@ if ($SCRIPTS_ROOTDIR eq '') {
 $SCRIPTS_ROOTDIR =~ s/\/training$//;
 $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});

-my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_CORPUS,
+my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_CORPUS,
   $_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH,
   $_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_VERBOSE, $_ALIGNMENT,
   $_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS,
@ -40,7 +40,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
 my $debug = 0; # debug this script, do not delete any files in debug mode

 # the following line is set installation time by 'make release'.  BEWARE!
-my $BINDIR="/Users/hieuhoang/workspace/bin/";
+my $BINDIR="/Users/hieuhoang/workspace/bin/training-tools";

 $_HELP = 1
    unless &GetOptions('root-dir=s' => \$_ROOT_DIR,
@ -58,7 +58,6 @@ $_HELP = 1
 		       'temp-dir=s' => \$_TEMP_DIR,
           'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
           'sort-batch-size=s' => \$_SORT_BATCH_SIZE,
-           'sort-compress=s' => \$_SORT_COMPRESS,
 		       'extract-file=s' => \$_EXTRACT_FILE,
 		       'alignment=s' => \$_ALIGNMENT,
 		       'alignment-file=s' => \$_ALIGNMENT_FILE,
@ -176,8 +175,6 @@ foreach my $step (@step_conf) {
  }
 }

-# don't fork
-my $___NOFORK = !defined $_PARALLEL;


 # supporting binaries from other packages
@ -210,24 +207,14 @@ if(!defined $_MGIZA ){
 my $MKCLS = "$BINDIR/mkcls";

 # supporting scripts/binaries from this package
-
-# parallel extract
-my $SPLIT_EXEC = `gsplit --help 2>/dev/null`; 
-if($SPLIT_EXEC) {
-  $SPLIT_EXEC = 'gsplit';
+my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
+my $RULE_EXTRACT;
+if (defined($_GHKM)) {
+  $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-ghkm/tools/extract-ghkm";
 }
 else {
-  $SPLIT_EXEC = 'split';
+  $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules";
 }
-
-my $SORT_EXEC = `gsort --help 2>/dev/null`; 
-if($SORT_EXEC) {
-  $SORT_EXEC = 'gsort';
-}
-else {
-  $SORT_EXEC = 'sort';
-}
-
 my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score";
 my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore";
 my $EPPEX = "$SCRIPTS_ROOTDIR/training/eppex/eppex";
@ -327,9 +314,6 @@ $__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
 my $__SORT_BATCH_SIZE = "";
 $__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE;

-my $__SORT_COMPRESS = "";
-$__SORT_COMPRESS = "--compress-program=$_SORT_COMPRESS" if $_SORT_COMPRESS;
-
 my $___CONTINUE = 0; 
 $___CONTINUE = $_CONTINUE if $_CONTINUE;

@ -342,33 +326,6 @@ $___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH;
 $___LEXICAL_WEIGHTING = 0 if $_NO_LEXICAL_WEIGHTING;
 $___LEXICAL_FILE = $_LEXICAL_FILE if $_LEXICAL_FILE;

-my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
-if ($___NOFORK != 0)
-{
-  $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl 1 $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $PHRASE_EXTRACT";
-}
-else
-{
-  $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl 3 $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $PHRASE_EXTRACT";
-}
-
-my $RULE_EXTRACT;
-if (defined($_GHKM)) {
-  $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-ghkm/tools/extract-ghkm";
-}
-else {
-  $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules";
-}
-
-if ($___NOFORK != 0)
-{
-  $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl 1 $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $RULE_EXTRACT";
-}
-else
-{
-  $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl 3 $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $RULE_EXTRACT";
-}
-
 my $___PHRASE_SCORER = "phrase-extract";
 $___PHRASE_SCORER = "memscore" if defined $_MEMSCORE;
 my $___MEMSCORE_OPTIONS = "-s ml -s lexweights \$LEX_E2F -r ml -r lexweights \$LEX_F2E -s const 2.718";
@ -399,6 +356,9 @@ $___PARTS = $_PARTS if $_PARTS;
 my $___DIRECTION = 0;
 $___DIRECTION = $_DIRECTION if $_DIRECTION;

+# don't fork
+my $___NOFORK = !defined $_PARALLEL;
+
 my $___ONLY_PRINT_GIZA = 0;
 $___ONLY_PRINT_GIZA = 1 if $_ONLY_PRINT_GIZA;

@ -1395,23 +1355,27 @@ sub extract_phrase {

      $cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $max_length";
 		}
-		
-     if ($reordering_flag) {
+      if ($reordering_flag) {
        $cmd .= " orientation";
        $cmd .= get_extract_reordering_flags();
        $cmd .= " --NoTTable" if !$ttable_flag;
        $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
      }
    }
-
-		$cmd .= " --GZOutput ";
-
    map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
    print STDERR "$cmd\n";
    safesystem("$cmd") or die "ERROR: Phrase extraction failed (missing input files?)";
    foreach my $f (@tempfiles) {
      unlink $f;
    }
+    if (! $___DONT_ZIP) { 
+      safesystem("gzip $extract_file.o") if -e "$extract_file.o";
+      safesystem("gzip $extract_file.sid") if -e "$extract_file.sid";
+      if ($ttable_flag) {
+        safesystem("gzip $extract_file.inv") or die("ERROR");
+        safesystem("gzip $extract_file") or die("ERROR");
+      }
+    }
 }

 ### (6) PHRASE SCORING
@ -1495,20 +1459,20 @@ sub score_phrase_phrase_extract {
              }
 	      my $extract = "$extract_filename.sorted";

-	      #if (!($___CONTINUE && -e "$extract_filename.sorted")) {
-	      #  # sorting
-	      #  print STDERR "(6.".($substep++).")  sorting $direction @ ".`date`;
-	      #  if (-e "$extract_filename.gz") {
-		    #    safesystem("gunzip < $extract_filename.gz | LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS -T $___TEMP_DIR > $extract_filename.sorted") or die("ERROR");
-	      #  }
-	      #  else {
-		    #    safesystem("LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS -T $___TEMP_DIR $extract_filename > $extract_filename.sorted") or die("ERROR");
-	      #  }
-        #}
+	      if (!($___CONTINUE && -e "$extract_filename.sorted")) {
+	          # sorting
+	          print STDERR "(6.".($substep++).")  sorting $direction @ ".`date`;
+	          if (-e "$extract_filename.gz") {
+		      safesystem("gunzip < $extract_filename.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_filename.sorted") or die("ERROR");
+	          }
+	          else {
+		      safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $extract_filename > $extract_filename.sorted") or die("ERROR");
+	          }
+              }

 	      print STDERR "(6.".($substep++).")  creating table half $ttable_file.half.$direction @ ".`date`;

-        my $cmd = "$PHRASE_SCORE $extract.gz $lexical_file.$direction $ttable_file.half.$direction.gz $inverse";
+        my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse";
        $cmd .= " --Hierarchical" if $_HIERARCHICAL;
        $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
        $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
@ -1524,10 +1488,8 @@ sub score_phrase_phrase_extract {
        # sorting inverse phrase-table-half to sync up with regular one
        if ($direction eq "e2f" && ! ($___CONTINUE && -e "$ttable_file.half.e2f.sorted")) {
          print STDERR "(6." . ($substep++) . ") sorting inverse e2f table@ ".`date`;
-          $cmd = "zcat $ttable_file.half.e2f.gz | LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS  -T $___TEMP_DIR | gzip -c > $ttable_file.half.e2f.sorted.gz";
-          print "Executing: $cmd \n";  
-          safesystem($cmd) or die("ERROR");
-          if (! $debug) { safesystem("rm -f $ttable_file.half.e2f.gz") or die("ERROR"); }
+          safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $ttable_file.half.e2f > $ttable_file.half.e2f.sorted") or die("ERROR");
+          if (! $debug) { safesystem("rm -f $ttable_file.half.e2f") or die("ERROR"); }
        }

        exit();
@ -1554,7 +1516,7 @@ sub score_phrase_phrase_extract {
    # merging the two halves
    print STDERR "(6.6) consolidating the two halves @ ".`date`;
    return if $___CONTINUE && -e "$ttable_file.gz";
-    my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.sorted.gz $ttable_file.gz";
+    my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file";
    $cmd .= " --Hierarchical" if $_HIERARCHICAL;
    $cmd .= " --LogProb" if $LOG_PROB;
    $cmd .= " --NegLogProb" if $NEG_LOG_PROB;
@ -1565,6 +1527,9 @@ sub score_phrase_phrase_extract {
    $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
    safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
    if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
+    if (! $___DONT_ZIP) {
+        safesystem("gzip $ttable_file") || die("ERROR: could not gzip $ttable_file");
+    }
 }

 sub score_phrase_memscore {
@ -1578,7 +1543,7 @@ sub score_phrase_memscore {

    # The output is sorted to avoid breaking scripts that rely on the
    # sorting behaviour of the previous scoring algorithm.
-    my $cmd = "$MEMSCORE $options | LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS -T $___TEMP_DIR | gzip >$ttable_file.gz";
+    my $cmd = "$MEMSCORE $options | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip >$ttable_file.gz";
    if (-e "$extract_file.gz") {
        $cmd = "$ZCAT $extract_file.gz | ".$cmd;
    } else {
@ -1633,11 +1598,11 @@ sub get_reordering_factored {

 sub get_reordering {
    my ($extract_file,$reo_model_path) = @_;
-    if (-e "$extract_file.o.sorted.gz") {
-      # do nothing
+    if (-e "$extract_file.o.gz") {
+	safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_file.o.sorted") or die("ERROR");
    }
    else {
-        die("ERROR: $extract_file.o.sorted.gz does not exist");
+        safesystem("LC_ALL=C sort -T $___TEMP_DIR $extract_file.o > $extract_file.o.sorted") or die("ERROR");
    }

    my $smooth = $___REORDERING_SMOOTH;
@ -1645,20 +1610,22 @@ sub get_reordering {
    print STDERR "(7.2) building tables @ ".`date`;

    #create cmd string for lexical reordering scoring
-    my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path";
+    my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted $smooth $reo_model_path";
    $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/);
    for my $mtype (keys %REORDERING_MODEL_TYPES) {
-			$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
-			foreach my $model (@REORDERING_MODELS) {
-	    	if ($model->{"type"} eq $mtype) {
-					$cmd .= " ".$model->{"filename"};
-	    	}
-		}
-		$cmd .= "\"";
-  }
+	$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
+	foreach my $model (@REORDERING_MODELS) {
+	    if ($model->{"type"} eq $mtype) {
+		$cmd .= " ".$model->{"filename"};
+	    }
+	}
+	$cmd .= "\"";
+    }
    
-  #Call the lexical reordering scorer
-  safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
+    #Call the lexical reordering scorer
+    safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
+
+    if (! $debug) { safesystem("rm $extract_file.o.sorted") or die("ERROR");}
 }