Roll back parallel training and zipping. Zipping slows it down. A LOT. Redo more carefully.

This commit is contained in:
Hieu Hoang 2012-05-23 13:04:02 +01:00
parent 59a2ab1aaa
commit 89c8d5643d
4 changed files with 78 additions and 116 deletions

View File

@ -10,13 +10,13 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
exe extract : tables-core.o SentenceAlignment.o extract.cpp InputFileStream ../../..//boost_iostreams ;
exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../../../moses/src//ThreadPool ../../..//boost_iostreams ;
exe extract-lex : extract-lex.cpp InputFileStream ;
exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp InputFileStream ../../..//boost_iostreams ;
exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;

View File

@ -22,7 +22,6 @@
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
using namespace std;
@ -83,16 +82,15 @@ bool hierModel = false;
REO_MODEL_TYPE hierType = REO_MSD;
Moses::OutputFileStream extractFile;
Moses::OutputFileStream extractFileInv;
Moses::OutputFileStream extractFileOrientation;
Moses::OutputFileStream extractFileSentenceId;
ofstream extractFile;
ofstream extractFileInv;
ofstream extractFileOrientation;
ofstream extractFileSentenceId;
int maxPhraseLength;
bool orientationFlag = false;
bool translationFlag = true;
bool sentenceIdFlag = false; //create extract file with sentence id
bool onlyOutputSpanInfo = false;
bool gzOutput = false;
int main(int argc, char* argv[])
{
@ -118,8 +116,6 @@ int main(int argc, char* argv[])
translationFlag = false;
} else if (strcmp(argv[i], "--SentenceId") == 0) {
sentenceIdFlag = true;
} else if (strcmp(argv[i], "--GZOutput") == 0) {
gzOutput = true;
} else if(strcmp(argv[i],"--model") == 0) {
if (i+1 >= argc) {
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
@ -197,18 +193,18 @@ int main(int argc, char* argv[])
// open output files
if (translationFlag) {
string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
extractFileInv.Open(fileNameExtractInv.c_str());
string fileNameExtractInv = fileNameExtract + ".inv";
extractFile.open(fileNameExtract.c_str());
extractFileInv.open(fileNameExtractInv.c_str());
}
if (orientationFlag) {
string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
extractFileOrientation.Open(fileNameExtractOrientation.c_str());
string fileNameExtractOrientation = fileNameExtract + ".o";
extractFileOrientation.open(fileNameExtractOrientation.c_str());
}
if (sentenceIdFlag) {
string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
string fileNameExtractSentenceId = fileNameExtract + ".sid";
extractFileSentenceId.open(fileNameExtractSentenceId.c_str());
}
int i=0;
@ -243,12 +239,12 @@ int main(int argc, char* argv[])
//az: only close if we actually opened it
if (!onlyOutputSpanInfo) {
if (translationFlag) {
extractFile.Close();
extractFileInv.Close();
extractFile.close();
extractFileInv.close();
}
if (orientationFlag) extractFileOrientation.Close();
if (orientationFlag) extractFileOrientation.close();
if (sentenceIdFlag) {
extractFileSentenceId.Close();
extractFileSentenceId.close();
}
}
}

View File

@ -32,7 +32,6 @@
#include "PhraseAlignment.h"
#include "score.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
using namespace std;
@ -189,10 +188,9 @@ int main(int argc, char* argv[])
phraseTableFile = &cout;
}
else {
Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
bool success = outputFile->Open(fileNamePhraseTable);
if (!success) {
ofstream *outputFile = new ofstream();
outputFile->open(fileNamePhraseTable);
if (outputFile->fail()) {
cerr << "ERROR: could not open file phrase table file "
<< fileNamePhraseTable << endl;
exit(1);
@ -247,6 +245,7 @@ int main(int argc, char* argv[])
phraseTableFile->flush();
if (phraseTableFile != &cout) {
(dynamic_cast<ofstream*>(phraseTableFile))->close();
delete phraseTableFile;
}
@ -259,9 +258,9 @@ int main(int argc, char* argv[])
void writeCountOfCounts( const char* fileNameCountOfCounts )
{
// open file
Moses::OutputFileStream countOfCountsFile;
bool success = countOfCountsFile.Open(fileNameCountOfCounts);
if (!success) {
ofstream countOfCountsFile;
countOfCountsFile.open(fileNameCountOfCounts);
if (countOfCountsFile.fail()) {
cerr << "ERROR: could not open count-of-counts file "
<< fileNameCountOfCounts << endl;
return;
@ -274,7 +273,7 @@ void writeCountOfCounts( const char* fileNameCountOfCounts )
for(int i=1; i<=COC_MAX; i++) {
countOfCountsFile << countOfCounts[ i ] << endl;
}
countOfCountsFile.Close();
countOfCountsFile.close();
}
void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )

View File

@ -19,7 +19,7 @@ if ($SCRIPTS_ROOTDIR eq '') {
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_CORPUS,
my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_CORPUS,
$_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH,
$_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_VERBOSE, $_ALIGNMENT,
$_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS,
@ -40,7 +40,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
my $debug = 0; # debug this script, do not delete any files in debug mode
# the following line is set installation time by 'make release'. BEWARE!
my $BINDIR="/Users/hieuhoang/workspace/bin/";
my $BINDIR="/Users/hieuhoang/workspace/bin/training-tools";
$_HELP = 1
unless &GetOptions('root-dir=s' => \$_ROOT_DIR,
@ -58,7 +58,6 @@ $_HELP = 1
'temp-dir=s' => \$_TEMP_DIR,
'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
'sort-batch-size=s' => \$_SORT_BATCH_SIZE,
'sort-compress=s' => \$_SORT_COMPRESS,
'extract-file=s' => \$_EXTRACT_FILE,
'alignment=s' => \$_ALIGNMENT,
'alignment-file=s' => \$_ALIGNMENT_FILE,
@ -176,8 +175,6 @@ foreach my $step (@step_conf) {
}
}
# don't fork
my $___NOFORK = !defined $_PARALLEL;
# supporting binaries from other packages
@ -210,24 +207,14 @@ if(!defined $_MGIZA ){
my $MKCLS = "$BINDIR/mkcls";
# supporting scripts/binaries from this package
# parallel extract
my $SPLIT_EXEC = `gsplit --help 2>/dev/null`;
if($SPLIT_EXEC) {
$SPLIT_EXEC = 'gsplit';
my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
my $RULE_EXTRACT;
if (defined($_GHKM)) {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-ghkm/tools/extract-ghkm";
}
else {
$SPLIT_EXEC = 'split';
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules";
}
my $SORT_EXEC = `gsort --help 2>/dev/null`;
if($SORT_EXEC) {
$SORT_EXEC = 'gsort';
}
else {
$SORT_EXEC = 'sort';
}
my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score";
my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore";
my $EPPEX = "$SCRIPTS_ROOTDIR/training/eppex/eppex";
@ -327,9 +314,6 @@ $__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
my $__SORT_BATCH_SIZE = "";
$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE;
my $__SORT_COMPRESS = "";
$__SORT_COMPRESS = "--compress-program=$_SORT_COMPRESS" if $_SORT_COMPRESS;
my $___CONTINUE = 0;
$___CONTINUE = $_CONTINUE if $_CONTINUE;
@ -342,33 +326,6 @@ $___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH;
$___LEXICAL_WEIGHTING = 0 if $_NO_LEXICAL_WEIGHTING;
$___LEXICAL_FILE = $_LEXICAL_FILE if $_LEXICAL_FILE;
my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
if ($___NOFORK != 0)
{
$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl 1 $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $PHRASE_EXTRACT";
}
else
{
$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl 3 $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $PHRASE_EXTRACT";
}
my $RULE_EXTRACT;
if (defined($_GHKM)) {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-ghkm/tools/extract-ghkm";
}
else {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules";
}
if ($___NOFORK != 0)
{
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl 1 $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $RULE_EXTRACT";
}
else
{
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl 3 $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $RULE_EXTRACT";
}
my $___PHRASE_SCORER = "phrase-extract";
$___PHRASE_SCORER = "memscore" if defined $_MEMSCORE;
my $___MEMSCORE_OPTIONS = "-s ml -s lexweights \$LEX_E2F -r ml -r lexweights \$LEX_F2E -s const 2.718";
@ -399,6 +356,9 @@ $___PARTS = $_PARTS if $_PARTS;
my $___DIRECTION = 0;
$___DIRECTION = $_DIRECTION if $_DIRECTION;
# don't fork
my $___NOFORK = !defined $_PARALLEL;
my $___ONLY_PRINT_GIZA = 0;
$___ONLY_PRINT_GIZA = 1 if $_ONLY_PRINT_GIZA;
@ -1395,23 +1355,27 @@ sub extract_phrase {
$cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $max_length";
}
if ($reordering_flag) {
if ($reordering_flag) {
$cmd .= " orientation";
$cmd .= get_extract_reordering_flags();
$cmd .= " --NoTTable" if !$ttable_flag;
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
}
$cmd .= " --GZOutput ";
map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
print STDERR "$cmd\n";
safesystem("$cmd") or die "ERROR: Phrase extraction failed (missing input files?)";
foreach my $f (@tempfiles) {
unlink $f;
}
if (! $___DONT_ZIP) {
safesystem("gzip $extract_file.o") if -e "$extract_file.o";
safesystem("gzip $extract_file.sid") if -e "$extract_file.sid";
if ($ttable_flag) {
safesystem("gzip $extract_file.inv") or die("ERROR");
safesystem("gzip $extract_file") or die("ERROR");
}
}
}
### (6) PHRASE SCORING
@ -1495,20 +1459,20 @@ sub score_phrase_phrase_extract {
}
my $extract = "$extract_filename.sorted";
#if (!($___CONTINUE && -e "$extract_filename.sorted")) {
# # sorting
# print STDERR "(6.".($substep++).") sorting $direction @ ".`date`;
# if (-e "$extract_filename.gz") {
# safesystem("gunzip < $extract_filename.gz | LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS -T $___TEMP_DIR > $extract_filename.sorted") or die("ERROR");
# }
# else {
# safesystem("LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS -T $___TEMP_DIR $extract_filename > $extract_filename.sorted") or die("ERROR");
# }
#}
if (!($___CONTINUE && -e "$extract_filename.sorted")) {
# sorting
print STDERR "(6.".($substep++).") sorting $direction @ ".`date`;
if (-e "$extract_filename.gz") {
safesystem("gunzip < $extract_filename.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_filename.sorted") or die("ERROR");
}
else {
safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $extract_filename > $extract_filename.sorted") or die("ERROR");
}
}
print STDERR "(6.".($substep++).") creating table half $ttable_file.half.$direction @ ".`date`;
my $cmd = "$PHRASE_SCORE $extract.gz $lexical_file.$direction $ttable_file.half.$direction.gz $inverse";
my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
$cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
@ -1524,10 +1488,8 @@ sub score_phrase_phrase_extract {
# sorting inverse phrase-table-half to sync up with regular one
if ($direction eq "e2f" && ! ($___CONTINUE && -e "$ttable_file.half.e2f.sorted")) {
print STDERR "(6." . ($substep++) . ") sorting inverse e2f table@ ".`date`;
$cmd = "zcat $ttable_file.half.e2f.gz | LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS -T $___TEMP_DIR | gzip -c > $ttable_file.half.e2f.sorted.gz";
print "Executing: $cmd \n";
safesystem($cmd) or die("ERROR");
if (! $debug) { safesystem("rm -f $ttable_file.half.e2f.gz") or die("ERROR"); }
safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $ttable_file.half.e2f > $ttable_file.half.e2f.sorted") or die("ERROR");
if (! $debug) { safesystem("rm -f $ttable_file.half.e2f") or die("ERROR"); }
}
exit();
@ -1554,7 +1516,7 @@ sub score_phrase_phrase_extract {
# merging the two halves
print STDERR "(6.6) consolidating the two halves @ ".`date`;
return if $___CONTINUE && -e "$ttable_file.gz";
my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.sorted.gz $ttable_file.gz";
my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --LogProb" if $LOG_PROB;
$cmd .= " --NegLogProb" if $NEG_LOG_PROB;
@ -1565,6 +1527,9 @@ sub score_phrase_phrase_extract {
$cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
if (! $___DONT_ZIP) {
safesystem("gzip $ttable_file") || die("ERROR: could not gzip $ttable_file");
}
}
sub score_phrase_memscore {
@ -1578,7 +1543,7 @@ sub score_phrase_memscore {
# The output is sorted to avoid breaking scripts that rely on the
# sorting behaviour of the previous scoring algorithm.
my $cmd = "$MEMSCORE $options | LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS -T $___TEMP_DIR | gzip >$ttable_file.gz";
my $cmd = "$MEMSCORE $options | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip >$ttable_file.gz";
if (-e "$extract_file.gz") {
$cmd = "$ZCAT $extract_file.gz | ".$cmd;
} else {
@ -1633,11 +1598,11 @@ sub get_reordering_factored {
sub get_reordering {
my ($extract_file,$reo_model_path) = @_;
if (-e "$extract_file.o.sorted.gz") {
# do nothing
if (-e "$extract_file.o.gz") {
safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_file.o.sorted") or die("ERROR");
}
else {
die("ERROR: $extract_file.o.sorted.gz does not exist");
safesystem("LC_ALL=C sort -T $___TEMP_DIR $extract_file.o > $extract_file.o.sorted") or die("ERROR");
}
my $smooth = $___REORDERING_SMOOTH;
@ -1645,20 +1610,22 @@ sub get_reordering {
print STDERR "(7.2) building tables @ ".`date`;
#create cmd string for lexical reordering scoring
my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path";
my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted $smooth $reo_model_path";
$cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/);
for my $mtype (keys %REORDERING_MODEL_TYPES) {
$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
foreach my $model (@REORDERING_MODELS) {
if ($model->{"type"} eq $mtype) {
$cmd .= " ".$model->{"filename"};
}
}
$cmd .= "\"";
}
$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
foreach my $model (@REORDERING_MODELS) {
if ($model->{"type"} eq $mtype) {
$cmd .= " ".$model->{"filename"};
}
}
$cmd .= "\"";
}
#Call the lexical reordering scorer
safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
#Call the lexical reordering scorer
safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
if (! $debug) { safesystem("rm $extract_file.o.sorted") or die("ERROR");}
}