From ba99159763658a33c10d9a909377639bda53330f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 May 2012 14:41:49 +0100 Subject: [PATCH 01/38] Gzip consolidate on the fly --- scripts/training/train-model.perl.missing_bin_dir | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index d3748fdc9..a1f2dd2f8 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1516,7 +1516,7 @@ sub score_phrase_phrase_extract { # merging the two halves print STDERR "(6.6) consolidating the two halves @ ".`date`; return if $___CONTINUE && -e "$ttable_file.gz"; - my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file"; + my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file.gz"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --LogProb" if $LOG_PROB; $cmd .= " --NegLogProb" if $NEG_LOG_PROB; @@ -1527,9 +1527,6 @@ sub score_phrase_phrase_extract { $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY; safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed"; if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); } - if (! $___DONT_ZIP) { - safesystem("gzip $ttable_file") || die("ERROR: could not gzip $ttable_file"); - } } sub score_phrase_memscore { From 3d67e33b9e88b29216709590e6bead0eeba9b0c3 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 May 2012 15:10:21 +0100 Subject: [PATCH 02/38] GZip extract on the fly --- scripts/training/phrase-extract/Jamfile | 2 +- scripts/training/phrase-extract/extract.cpp | 34 +++++++++++-------- .../training/train-model.perl.missing_bin_dir | 12 +++---- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/scripts/training/phrase-extract/Jamfile b/scripts/training/phrase-extract/Jamfile index 0872130f9..9c077fb12 100644 --- a/scripts/training/phrase-extract/Jamfile +++ b/scripts/training/phrase-extract/Jamfile @@ -10,7 +10,7 @@ obj XmlTree.o : XmlTree.cpp : . ; alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : . ; alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : . ; -exe extract : tables-core.o SentenceAlignment.o extract.cpp InputFileStream ../../..//boost_iostreams ; +exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ; exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../../../moses/src//ThreadPool ../../..//boost_iostreams ; diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp index f6d6cbb9b..16b413da9 100644 --- a/scripts/training/phrase-extract/extract.cpp +++ b/scripts/training/phrase-extract/extract.cpp @@ -22,6 +22,7 @@ #include "SentenceAlignment.h" #include "tables-core.h" #include "InputFileStream.h" +#include "OutputFileStream.h" using namespace std; @@ -82,15 +83,16 @@ bool hierModel = false; REO_MODEL_TYPE hierType = REO_MSD; -ofstream extractFile; -ofstream extractFileInv; -ofstream extractFileOrientation; -ofstream extractFileSentenceId; +Moses::OutputFileStream extractFile; +Moses::OutputFileStream extractFileInv; +Moses::OutputFileStream extractFileOrientation; +Moses::OutputFileStream extractFileSentenceId; int maxPhraseLength; bool orientationFlag = false; bool translationFlag = true; bool sentenceIdFlag = false; //create extract file with sentence id bool onlyOutputSpanInfo = false; +bool gzOutput = false; int main(int argc, char* argv[]) { @@ -116,6 +118,8 @@ int main(int argc, char* argv[]) translationFlag = false; } else if (strcmp(argv[i], "--SentenceId") == 0) { sentenceIdFlag = true; + } else if (strcmp(argv[i], "--GZOutput") == 0) { + gzOutput = true; } else if(strcmp(argv[i],"--model") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, no model's information provided to the option --model " << endl; @@ -193,18 +197,18 @@ int main(int argc, char* argv[]) // open output files if (translationFlag) { - string fileNameExtractInv = fileNameExtract + ".inv"; - extractFile.open(fileNameExtract.c_str()); - extractFileInv.open(fileNameExtractInv.c_str()); + string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":""); + extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str()); + extractFileInv.Open(fileNameExtractInv.c_str()); } if (orientationFlag) { - string fileNameExtractOrientation = fileNameExtract + ".o"; - extractFileOrientation.open(fileNameExtractOrientation.c_str()); + string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":""); + extractFileOrientation.Open(fileNameExtractOrientation.c_str()); } if (sentenceIdFlag) { - string fileNameExtractSentenceId = fileNameExtract + ".sid"; - extractFileSentenceId.open(fileNameExtractSentenceId.c_str()); + string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":""); + extractFileSentenceId.Open(fileNameExtractSentenceId.c_str()); } int i=0; @@ -239,12 +243,12 @@ int main(int argc, char* argv[]) //az: only close if we actually opened it if (!onlyOutputSpanInfo) { if (translationFlag) { - extractFile.close(); - extractFileInv.close(); + extractFile.Close(); + extractFileInv.Close(); } - if (orientationFlag) extractFileOrientation.close(); + if (orientationFlag) extractFileOrientation.Close(); if (sentenceIdFlag) { - extractFileSentenceId.close(); + extractFileSentenceId.Close(); } } } diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index a1f2dd2f8..82b4fbce8 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1362,20 +1362,16 @@ sub extract_phrase { $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS); } } + + $cmd .= " --GZOutput "; + map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a); print STDERR "$cmd\n"; safesystem("$cmd") or die "ERROR: Phrase extraction failed (missing input files?)"; foreach my $f (@tempfiles) { unlink $f; } - if (! $___DONT_ZIP) { - safesystem("gzip $extract_file.o") if -e "$extract_file.o"; - safesystem("gzip $extract_file.sid") if -e "$extract_file.sid"; - if ($ttable_flag) { - safesystem("gzip $extract_file.inv") or die("ERROR"); - safesystem("gzip $extract_file") or die("ERROR"); - } - } + } ### (6) PHRASE SCORING From 947bee50b3b8d2d9360de011760466aa06a18362 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 May 2012 15:36:34 +0100 Subject: [PATCH 03/38] GZip extract.sorted --- .../training/train-model.perl.missing_bin_dir | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 82b4fbce8..49d1d37ed 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1453,16 +1453,16 @@ sub score_phrase_phrase_extract { $inverse = " --Inverse"; $extract_filename = $extract_file.".inv"; } - my $extract = "$extract_filename.sorted"; + my $extract = "$extract_filename.sorted.gz"; - if (!($___CONTINUE && -e "$extract_filename.sorted")) { + if (!($___CONTINUE && -e "$extract_filename.sorted.gz")) { # sorting print STDERR "(6.".($substep++).") sorting $direction @ ".`date`; if (-e "$extract_filename.gz") { - safesystem("gunzip < $extract_filename.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_filename.sorted") or die("ERROR"); + safesystem("gunzip < $extract_filename.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip -c > $extract_filename.sorted.gz") or die("ERROR"); } else { - safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $extract_filename > $extract_filename.sorted") or die("ERROR"); + safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $extract_filename | gzip -c > $extract_filename.sorted.gz") or die("ERROR"); } } @@ -1592,10 +1592,10 @@ sub get_reordering_factored { sub get_reordering { my ($extract_file,$reo_model_path) = @_; if (-e "$extract_file.o.gz") { - safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_file.o.sorted") or die("ERROR"); + safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip -c > $extract_file.o.sorted.gz") or die("ERROR"); } else { - safesystem("LC_ALL=C sort -T $___TEMP_DIR $extract_file.o > $extract_file.o.sorted") or die("ERROR"); + safesystem("LC_ALL=C sort -T $___TEMP_DIR $extract_file.o | gzip -c > $extract_file.o.sorted.gz") or die("ERROR"); } my $smooth = $___REORDERING_SMOOTH; @@ -1603,7 +1603,7 @@ sub get_reordering { print STDERR "(7.2) building tables @ ".`date`; #create cmd string for lexical reordering scoring - my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted $smooth $reo_model_path"; + my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path"; $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/); for my $mtype (keys %REORDERING_MODEL_TYPES) { $cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}"; @@ -1618,7 +1618,7 @@ sub get_reordering { #Call the lexical reordering scorer safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed"; - if (! $debug) { safesystem("rm $extract_file.o.sorted") or die("ERROR");} + if (! $debug) { safesystem("rm $extract_file.o.sorted.gz") or die("ERROR");} } From 3de14f62795d2cc06f7488731865008db4368a0b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 May 2012 15:55:03 +0100 Subject: [PATCH 04/38] Gzip phrase-table.half.e2f.sorted on the fly --- scripts/training/train-model.perl.missing_bin_dir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 49d1d37ed..19841b948 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1484,7 +1484,7 @@ sub score_phrase_phrase_extract { # sorting inverse phrase-table-half to sync up with regular one if ($direction eq "e2f" && ! ($___CONTINUE && -e "$ttable_file.half.e2f.sorted")) { print STDERR "(6." . ($substep++) . ") sorting inverse e2f table@ ".`date`; - safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $ttable_file.half.e2f > $ttable_file.half.e2f.sorted") or die("ERROR"); + safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $ttable_file.half.e2f | gzip -c > $ttable_file.half.e2f.sorted.gz") or die("ERROR"); if (! $debug) { safesystem("rm -f $ttable_file.half.e2f") or die("ERROR"); } } @@ -1512,7 +1512,7 @@ sub score_phrase_phrase_extract { # merging the two halves print STDERR "(6.6) consolidating the two halves @ ".`date`; return if $___CONTINUE && -e "$ttable_file.gz"; - my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file.gz"; + my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted.gz $ttable_file.gz"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --LogProb" if $LOG_PROB; $cmd .= " --NegLogProb" if $NEG_LOG_PROB; From 349409d1c6f80af89d48229537a713a6a95a168a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 May 2012 16:38:31 +0100 Subject: [PATCH 05/38] GZip scoring on the fly --- scripts/training/phrase-extract/Jamfile | 2 +- scripts/training/phrase-extract/score.cpp | 8 ++++---- scripts/training/train-model.perl.missing_bin_dir | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/training/phrase-extract/Jamfile b/scripts/training/phrase-extract/Jamfile index 9c077fb12..5ed3f20f1 100644 --- a/scripts/training/phrase-extract/Jamfile +++ b/scripts/training/phrase-extract/Jamfile @@ -16,7 +16,7 @@ exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o Sen exe extract-lex : extract-lex.cpp InputFileStream ; -exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp InputFileStream ../../..//boost_iostreams ; +exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ; exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ; diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp index af7401132..935bedaa5 100644 --- a/scripts/training/phrase-extract/score.cpp +++ b/scripts/training/phrase-extract/score.cpp @@ -32,6 +32,7 @@ #include "PhraseAlignment.h" #include "score.h" #include "InputFileStream.h" +#include "OutputFileStream.h" using namespace std; @@ -188,9 +189,9 @@ int main(int argc, char* argv[]) phraseTableFile = &cout; } else { - ofstream *outputFile = new ofstream(); - outputFile->open(fileNamePhraseTable); - if (outputFile->fail()) { + Moses::OutputFileStream *outputFile = new Moses::OutputFileStream(); + bool success = outputFile->Open(fileNamePhraseTable); + if (!success) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); @@ -245,7 +246,6 @@ int main(int argc, char* argv[]) phraseTableFile->flush(); if (phraseTableFile != &cout) { - (dynamic_cast(phraseTableFile))->close(); delete phraseTableFile; } diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 19841b948..a4548d872 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1468,7 +1468,7 @@ sub score_phrase_phrase_extract { print STDERR "(6.".($substep++).") creating table half $ttable_file.half.$direction @ ".`date`; - my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse"; + my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction.gz $inverse"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT; $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY; @@ -1484,7 +1484,7 @@ sub score_phrase_phrase_extract { # sorting inverse phrase-table-half to sync up with regular one if ($direction eq "e2f" && ! ($___CONTINUE && -e "$ttable_file.half.e2f.sorted")) { print STDERR "(6." . ($substep++) . ") sorting inverse e2f table@ ".`date`; - safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $ttable_file.half.e2f | gzip -c > $ttable_file.half.e2f.sorted.gz") or die("ERROR"); + safesystem("zcat $ttable_file.half.e2f.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip -c > $ttable_file.half.e2f.sorted.gz") or die("ERROR"); if (! $debug) { safesystem("rm -f $ttable_file.half.e2f") or die("ERROR"); } } @@ -1512,7 +1512,7 @@ sub score_phrase_phrase_extract { # merging the two halves print STDERR "(6.6) consolidating the two halves @ ".`date`; return if $___CONTINUE && -e "$ttable_file.gz"; - my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted.gz $ttable_file.gz"; + my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.sorted.gz $ttable_file.gz"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --LogProb" if $LOG_PROB; $cmd .= " --NegLogProb" if $NEG_LOG_PROB; From ce3ad73ebfaab3e5a1ab72e88b1c8042661e30da Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 May 2012 17:20:40 +0100 Subject: [PATCH 06/38] coc filename determined from phrase-table.half name, rather than passed as argument --- scripts/training/phrase-extract/score.cpp | 26 +++++++------------ .../training/train-model.perl.missing_bin_dir | 8 +++--- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp index 935bedaa5..1d081a054 100644 --- a/scripts/training/phrase-extract/score.cpp +++ b/scripts/training/phrase-extract/score.cpp @@ -57,7 +57,7 @@ public: vector tokenize( const char [] ); -void writeCountOfCounts( const char* fileNameCountOfCounts ); +void writeCountOfCounts( const string &fileNameCountOfCounts ); void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile); PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair ); void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile ); @@ -92,13 +92,13 @@ int main(int argc, char* argv[]) << "scoring methods for extracted rules\n"; if (argc < 4) { - cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring coc-file] [--KneserNey coc-file] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n"; + cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; - char* fileNameCountOfCounts; + string fileNameCountOfCounts; char* fileNameFunctionWords; for(int i=4; i Date: Wed, 23 May 2012 19:02:36 +0100 Subject: [PATCH 07/38] parallel extract. Works with reordering --- scripts/generic/extract-parallel.perl | 86 +++++++++++------ .../training/train-model.perl.missing_bin_dir | 92 +++++++++++-------- 2 files changed, 112 insertions(+), 66 deletions(-) diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index ec5366f2e..f60ed8470 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -6,11 +6,15 @@ use strict; use File::Basename; +sub RunFork($); +sub systemCheck($); sub NumStr($); print "Started ".localtime() ."\n"; my $numParallel= $ARGV[0]; +$numParallel = 1 if $numParallel < 1; + my $splitCmd= $ARGV[1]; my $sortCmd= $ARGV[2]; my $extractCmd= $ARGV[3]; @@ -29,25 +33,34 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i) my $TMPDIR=dirname($extract) ."/tmp.$$"; mkdir $TMPDIR; -my $totalLines = int(`wc -l $align`); +my $totalLines = int(`cat $align | wc -l`); my $linesPerSplit = int($totalLines / $numParallel) + 1; print "total=$totalLines line-per-split=$linesPerSplit \n"; +my @children; +my $pid; my $cmd; + if ($numParallel > 1) { $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $target $TMPDIR/target."; - print STDERR "Executing: $cmd \n"; - `$cmd`; + $pid = RunFork($cmd); + push(@children, $pid); $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $source $TMPDIR/source."; - print STDERR "Executing: $cmd \n"; - `$cmd`; + $pid = RunFork($cmd); + push(@children, $pid); $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $align $TMPDIR/align."; - print STDERR "Executing: $cmd \n"; - `$cmd`; + $pid = RunFork($cmd); + push(@children, $pid); + + # wait for everything is finished + foreach (@children) { + waitpid($_, 0); + } + } else { @@ -67,15 +80,13 @@ else } # run extract -my $isParent = 1; -my @childs; +@children = (); for (my $i = 0; $i < $numParallel; ++$i) { my $pid = fork(); if ($pid == 0) { # child - $isParent = 0; my $numStr = NumStr($i); my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $otherExtractArgs \n"; print STDERR $cmd; @@ -85,20 +96,13 @@ for (my $i = 0; $i < $numParallel; ++$i) } else { # parent - push(@childs, $pid); + push(@children, $pid); } } # wait for everything is finished -if ($isParent) -{ - foreach (@childs) { - waitpid($_, 0); - } -} -else -{ - die "shouldn't be here"; +foreach (@children) { + waitpid($_, 0); } # merge @@ -116,29 +120,52 @@ for (my $i = 0; $i < $numParallel; ++$i) $catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.sorted.gz \n"; $catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.inv.sorted.gz \n"; $catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.o.sorted.gz \n"; -print STDERR $catCmd; -print STDERR $catInvCmd; -print STDERR $catOCmd; -systemCheck($catCmd); -systemCheck($catInvCmd); + +@children = (); +$pid = RunFork($catCmd); +push(@children, $pid); + +$pid = RunFork($catInvCmd); +push(@children, $pid); my $numStr = NumStr(0); if (-e "$TMPDIR/extract.$numStr.o.gz") { - systemCheck($catOCmd); + $pid = RunFork($catOCmd); + push(@children, $pid); +} + +# wait for all sorting to finish +foreach (@children) { + waitpid($_, 0); } -$cmd = "rm -rf $TMPDIR \n"; -print STDERR $cmd; -`$cmd`; +#$cmd = "rm -rf $TMPDIR \n"; +#print STDERR $cmd; +#`$cmd`; print STDERR "Finished ".localtime() ."\n"; # ----------------------------------------- # ----------------------------------------- +sub RunFork($) +{ + my $cmd = shift; + + my $pid = fork(); + + if ($pid == 0) + { # child + print STDERR $cmd; + systemCheck($cmd); + exit(); + } + return $pid; +} + sub systemCheck($) { my $cmd = shift; @@ -171,4 +198,3 @@ sub NumStr($) return $numStr; } - diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index c3efa4d30..ff5d7454e 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -19,7 +19,7 @@ if ($SCRIPTS_ROOTDIR eq '') { $SCRIPTS_ROOTDIR =~ s/\/training$//; $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"}); -my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_CORPUS, +my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_CORPUS, $_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH, $_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_VERBOSE, $_ALIGNMENT, $_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS, @@ -36,6 +36,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_ $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS, $_ADDITIONAL_INI, $_DICTIONARY, $_EPPEX); +my $_CORES = 1; my $debug = 0; # debug this script, do not delete any files in debug mode @@ -58,6 +59,7 @@ $_HELP = 1 'temp-dir=s' => \$_TEMP_DIR, 'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE, 'sort-batch-size=s' => \$_SORT_BATCH_SIZE, + 'sort-compress=s' => \$_SORT_COMPRESS, 'extract-file=s' => \$_EXTRACT_FILE, 'alignment=s' => \$_ALIGNMENT, 'alignment-file=s' => \$_ALIGNMENT_FILE, @@ -114,7 +116,8 @@ $_HELP = 1 'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES, 'dictionary=s' => \$_DICTIONARY, 'eppex:s' => \$_EPPEX, - 'additional-ini=s' => \$_ADDITIONAL_INI + 'additional-ini=s' => \$_ADDITIONAL_INI, + 'cores=i' => \$_CORES ); if ($_HELP) { @@ -206,8 +209,36 @@ if(!defined $_MGIZA ){ my $MKCLS = "$BINDIR/mkcls"; +# parallel extract +my $SPLIT_EXEC = `gsplit --help 2>/dev/null`; +if($SPLIT_EXEC) { + $SPLIT_EXEC = 'gsplit'; +} +else { + $SPLIT_EXEC = 'split'; +} + +my $SORT_EXEC = `gsort --help 2>/dev/null`; +if($SORT_EXEC) { + $SORT_EXEC = 'gsort'; +} +else { + $SORT_EXEC = 'sort'; +} + +my $__SORT_BUFFER_SIZE = ""; +$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE; + +my $__SORT_BATCH_SIZE = ""; +$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE; + +my $__SORT_COMPRESS = ""; +$__SORT_COMPRESS = "--compress-program=$_SORT_COMPRESS" if $_SORT_COMPRESS; + # supporting scripts/binaries from this package my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract"; +$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $PHRASE_EXTRACT"; + my $RULE_EXTRACT; if (defined($_GHKM)) { $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-ghkm/tools/extract-ghkm"; @@ -215,6 +246,8 @@ if (defined($_GHKM)) { else { $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules"; } +$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $RULE_EXTRACT"; + my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score"; my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore"; my $EPPEX = "$SCRIPTS_ROOTDIR/training/eppex/eppex"; @@ -308,12 +341,6 @@ $_DONT_ZIP = $___DONT_ZIP unless $___DONT_ZIP; my $___TEMP_DIR = $___MODEL_DIR; $___TEMP_DIR = $_TEMP_DIR if $_TEMP_DIR; -my $__SORT_BUFFER_SIZE = ""; -$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE; - -my $__SORT_BATCH_SIZE = ""; -$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE; - my $___CONTINUE = 0; $___CONTINUE = $_CONTINUE if $_CONTINUE; @@ -1590,35 +1617,28 @@ sub get_reordering_factored { } sub get_reordering { - my ($extract_file,$reo_model_path) = @_; - if (-e "$extract_file.o.gz") { - safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip -c > $extract_file.o.sorted.gz") or die("ERROR"); - } - else { - safesystem("LC_ALL=C sort -T $___TEMP_DIR $extract_file.o | gzip -c > $extract_file.o.sorted.gz") or die("ERROR"); - } - - my $smooth = $___REORDERING_SMOOTH; - - print STDERR "(7.2) building tables @ ".`date`; - - #create cmd string for lexical reordering scoring - my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path"; - $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/); - for my $mtype (keys %REORDERING_MODEL_TYPES) { - $cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}"; - foreach my $model (@REORDERING_MODELS) { - if ($model->{"type"} eq $mtype) { - $cmd .= " ".$model->{"filename"}; - } + my ($extract_file,$reo_model_path) = @_; + my $smooth = $___REORDERING_SMOOTH; + + print STDERR "(7.2) building tables @ ".`date`; + + #create cmd string for lexical reordering scoring + my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path"; + $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/); + for my $mtype (keys %REORDERING_MODEL_TYPES) { + $cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}"; + foreach my $model (@REORDERING_MODELS) { + if ($model->{"type"} eq $mtype) { + $cmd .= " ".$model->{"filename"}; + } + } + $cmd .= "\""; } - $cmd .= "\""; - } - - #Call the lexical reordering scorer - safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed"; - - if (! $debug) { safesystem("rm $extract_file.o.sorted.gz") or die("ERROR");} + + #Call the lexical reordering scorer + safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed"; + + if (! $debug) { safesystem("rm $extract_file.o.sorted.gz") or die("ERROR");} } From f33538ea5546e402d92645c8fa95289cdd12c988 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 May 2012 19:12:23 +0100 Subject: [PATCH 08/38] Works with phrase-table scoring --- scripts/training/train-model.perl.missing_bin_dir | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index ff5d7454e..f599e00aa 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1482,17 +1482,6 @@ sub score_phrase_phrase_extract { } my $extract = "$extract_filename.sorted.gz"; - if (!($___CONTINUE && -e "$extract_filename.sorted.gz")) { - # sorting - print STDERR "(6.".($substep++).") sorting $direction @ ".`date`; - if (-e "$extract_filename.gz") { - safesystem("gunzip < $extract_filename.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip -c > $extract_filename.sorted.gz") or die("ERROR"); - } - else { - safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $extract_filename | gzip -c > $extract_filename.sorted.gz") or die("ERROR"); - } - } - print STDERR "(6.".($substep++).") creating table half $ttable_file.half.$direction @ ".`date`; my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction.gz $inverse"; @@ -1506,7 +1495,6 @@ sub score_phrase_phrase_extract { $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); print $cmd."\n"; safesystem($cmd) or die "ERROR: Scoring of phrases failed"; - if (! $debug) { safesystem("rm -f $extract") or die("ERROR"); } # sorting inverse phrase-table-half to sync up with regular one if ($direction eq "e2f" && ! ($___CONTINUE && -e "$ttable_file.half.e2f.sorted")) { From f91b19fa4126ef15f89dcc6c3e870bf6846836ff Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 23 May 2012 19:26:37 +0100 Subject: [PATCH 09/38] Works with phrase-table scoring --- scripts/generic/extract-parallel.perl | 8 ++++---- scripts/training/train-model.perl.missing_bin_dir | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index f60ed8470..b810d9672 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -141,10 +141,10 @@ foreach (@children) { waitpid($_, 0); } - -#$cmd = "rm -rf $TMPDIR \n"; -#print STDERR $cmd; -#`$cmd`; +# delete temporary files +$cmd = "rm -rf $TMPDIR \n"; +print STDERR $cmd; +`$cmd`; print STDERR "Finished ".localtime() ."\n"; diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index f599e00aa..0dda1017f 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1626,7 +1626,6 @@ sub get_reordering { #Call the lexical reordering scorer safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed"; - if (! $debug) { safesystem("rm $extract_file.o.sorted.gz") or die("ERROR");} } From d979b4454d04e75440ce4e1607a31e8c679a0632 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 24 May 2012 15:05:23 +0100 Subject: [PATCH 10/38] parallel scoring --- scripts/generic/score-parallel.perl | 287 ++++++++++++++++++ .../training/train-model.perl.missing_bin_dir | 29 +- 2 files changed, 304 insertions(+), 12 deletions(-) create mode 100755 scripts/generic/score-parallel.perl diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl new file mode 100755 index 000000000..062d0df9c --- /dev/null +++ b/scripts/generic/score-parallel.perl @@ -0,0 +1,287 @@ +#! /usr/bin/perl -w + +# example +# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0 +# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f --Inverse 1 + +use strict; +use File::Basename; + +sub RunFork($); +sub systemCheck($); +sub GetSourcePhrase($); +sub NumStr($); + +#my $EXTRACT_SPLIT_LINES = 5000000; +my $EXTRACT_SPLIT_LINES = 1000; + +print "Started ".localtime() ."\n"; + +my $numParallel = $ARGV[0]; +$numParallel = 1 if $numParallel < 1; + +my $sortCmd = $ARGV[1]; +my $scoreCmd = $ARGV[2]; + +my $extractFile = $ARGV[3]; # 1st arg of extract argument +my $lexFile = $ARGV[4]; +my $ptHalf = $ARGV[5]; # output + +my $otherExtractArgs= ""; +for (my $i = 6; $i < $#ARGV; ++$i) +{ + $otherExtractArgs .= $ARGV[$i] ." "; +} +#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs + +my $doSort = $ARGV[$#ARGV]; # last arg + +my $TMPDIR=dirname($ptHalf) ."/tmp.$$"; +mkdir $TMPDIR; + +my $cmd; + +my $fileCount = 0; +if ($numParallel <= 1) +{ # don't do parallel. Just link the extract file into place + $cmd = "ln -s $extractFile $TMPDIR/extract.0.gz"; + print STDERR "$cmd \n"; + systemCheck($cmd); + + $fileCount = 1; +} +else +{ # cut up extract file into smaller mini-extract files. + if ($extractFile =~ /\.gz$/) { + open(IN, "gunzip -c $extractFile |") || die "can't open pipe to $extractFile"; + } + else { + open(IN, $extractFile) || die "can't open $extractFile"; + } + + my $filePath = "$TMPDIR/extract.$fileCount.gz"; + open (OUT, "| gzip -c > $filePath") or die "error starting gzip $!"; + + my $lineCount = 0; + my $line; + my $prevSourcePhrase = ""; + while ($line=) + { + chomp($line); + ++$lineCount; + + if ($lineCount > $EXTRACT_SPLIT_LINES) + { # over line limit. Cut off at next source phrase change + my $sourcePhrase = GetSourcePhrase($line); + + if ($prevSourcePhrase eq "") + { # start comparing + $prevSourcePhrase = $sourcePhrase; + } + elsif ($sourcePhrase eq $prevSourcePhrase) + { # can't cut off yet. Do nothing + } + else + { # cut off, open next min-extract file & write to that instead + close OUT; + + $prevSourcePhrase = ""; + $lineCount = 0; + ++$fileCount; + my $filePath = $fileCount; + $filePath = "$TMPDIR/extract.$filePath.gz"; + open (OUT, "| gzip -c > $filePath") or die "error starting gzip $!"; + } + } + else + { # keep on writing to current mini-extract file + } + + print OUT "$line\n"; + + } + close OUT; + ++$fileCount; +} + + +# create run scripts +my @runFiles = (0..($numParallel-1)); +for (my $i = 0; $i < $numParallel; ++$i) +{ + my $path = "$TMPDIR/run.$i.sh"; + open(my $fh, ">", $path) or die "cannot open $path: $!"; + $runFiles[$i] = $fh; +} + +# write scoring of mini-extracts to run scripts +for (my $i = 0; $i < $fileCount; ++$i) +{ + my $numStr = NumStr($i); + + my $fileInd = $i % $numParallel; + my $fh = $runFiles[$fileInd]; + my $cmd = "$scoreCmd $TMPDIR/extract.$i.gz $lexFile $TMPDIR/phrase-table.half.$numStr.gz $otherExtractArgs\n"; + print $fh $cmd; +} + +# close run script files +for (my $i = 0; $i < $numParallel; ++$i) +{ + close($runFiles[$i]); + my $path = "$TMPDIR/run.$i.sh"; + systemCheck("chmod +x $path"); +} + +# run each score script in parallel +my @children; +for (my $i = 0; $i < $numParallel; ++$i) +{ + my $cmd = "$TMPDIR/run.$i.sh"; + my $pid = RunFork($cmd); + push(@children, $pid); +} + +# wait for everything is finished +foreach (@children) { + waitpid($_, 0); +} + +# merge & sort +$cmd = "\n\nOH SHIT. This should have been filled in \n\n"; +if ($fileCount == 1 && !$doSort) +{ + my $numStr = NumStr(0); + $cmd = "mv $TMPDIR/phrase-table.half.$numStr.gz $ptHalf.gz \n"; +} +else +{ + $cmd = "zcat $TMPDIR/phrase-table.half.*.gz"; + + if ($doSort) { + $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR "; + } + + $cmd .= " | gzip -c >"; + + if ($doSort) { + $cmd .= " $ptHalf.sorted.gz \n"; + } + else { + $cmd .= " $ptHalf.gz \n"; + } +} +print STDERR $cmd; +systemCheck($cmd); + +# merge coc +my $numStr = NumStr(0); +my $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc"; + +if (-e $cocPath) +{ + my @arrayCOC; + my $line; + + # 1st file + open(FHCOC, $cocPath) || die "can't open pipe to $cocPath"; + while ($line = ) + { + my $coc = int($line); + push(@arrayCOC, $coc); + } + close(FHCOC); + + # all other files + for (my $i = 1; $i < $fileCount; ++$i) + { + $numStr = NumStr($i); + $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc"; + open(FHCOC, $cocPath) || die "can't open pipe to $cocPath"; + my $arrayInd = 0; + while ($line = ) + { + my $coc = int($line); + $arrayCOC[$arrayInd] += $coc; + + ++$arrayInd; + } + + close(FHCOC); + } + + # output + $cocPath = "$ptHalf.coc"; + open(FHCOC, ">", $cocPath) or die "cannot open $cocPath: $!"; + for (my $i = 0; $i < @arrayCOC; ++$i) + { + print FHCOC $arrayCOC[$i]."\n"; + } + close(FHCOC); +} + +$cmd = "rm -rf $TMPDIR \n"; +print STDERR $cmd; +systemCheck($cmd); + +print STDERR "Finished ".localtime() ."\n"; + +# ----------------------------------------- +# ----------------------------------------- + +sub RunFork($) +{ + my $cmd = shift; + + my $pid = fork(); + + if ($pid == 0) + { # child + print STDERR $cmd; + systemCheck($cmd); + exit(); + } + return $pid; +} +sub systemCheck($) +{ + my $cmd = shift; + my $retVal = system($cmd); + if ($retVal != 0) + { + exit(1); + } +} + +sub GetSourcePhrase($) +{ + my $line = shift; + my $pos = index($line, "|||"); + my $sourcePhrase = substr($line, 0, $pos); + return $sourcePhrase; +} + + +sub NumStr($) +{ + my $i = shift; + my $numStr; + if ($i < 10) { + $numStr = "0000$i"; + } + elsif ($i < 100) { + $numStr = "000$i"; + } + elsif ($i < 1000) { + $numStr = "00$i"; + } + elsif ($i < 10000) { + $numStr = "0$i"; + } + else { + $numStr = $i; + } + return $numStr; +} + + diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 0dda1017f..f168f38cb 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -253,7 +253,10 @@ my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore"; my $EPPEX = "$SCRIPTS_ROOTDIR/training/eppex/eppex"; my $SYMAL = "$SCRIPTS_ROOTDIR/training/symal/symal"; my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/symal/giza2bal.pl"; + my $PHRASE_SCORE = "$SCRIPTS_ROOTDIR/training/phrase-extract/score"; +$PHRASE_SCORE = "$SCRIPTS_ROOTDIR/generic/score-parallel.perl $_CORES \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $PHRASE_SCORE"; + my $PHRASE_CONSOLIDATE = "$SCRIPTS_ROOTDIR/training/phrase-extract/consolidate"; # utilities @@ -1484,7 +1487,7 @@ sub score_phrase_phrase_extract { print STDERR "(6.".($substep++).") creating table half $ttable_file.half.$direction @ ".`date`; - my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction.gz $inverse"; + my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT; $cmd .= " --KneserNey" if $KNESER_NEY; @@ -1493,16 +1496,18 @@ sub score_phrase_phrase_extract { $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT; $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL; $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); - print $cmd."\n"; + + # sorting + if ($direction eq "e2f") { + $cmd .= " 1 "; + } + else { + $cmd .= " 0 "; + } + + print $cmd."\n"; safesystem($cmd) or die "ERROR: Scoring of phrases failed"; - # sorting inverse phrase-table-half to sync up with regular one - if ($direction eq "e2f" && ! ($___CONTINUE && -e "$ttable_file.half.e2f.sorted")) { - print STDERR "(6." . ($substep++) . ") sorting inverse e2f table@ ".`date`; - safesystem("zcat $ttable_file.half.e2f.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip -c > $ttable_file.half.e2f.sorted.gz") or die("ERROR"); - if (! $debug) { safesystem("rm -f $ttable_file.half.e2f") or die("ERROR"); } - } - exit(); } else @@ -1534,10 +1539,10 @@ sub score_phrase_phrase_extract { $cmd .= " --OnlyDirect" if $ONLY_DIRECT; $cmd .= " --NoPhraseCount" unless $PHRASE_COUNT; $cmd .= " --LowCountFeature" if $LOW_COUNT; - $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING; - $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY; + $cmd .= " --GoodTuring $ttable_file.half.f2e.coc" if $GOOD_TURING; + $cmd .= " --KneserNey $ttable_file.half.f2e.coc" if $KNESER_NEY; safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed"; - if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); } + #if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); } } sub score_phrase_memscore { From e42f8d36aecdd484dfd5b6ff9393235af9036c38 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 24 May 2012 15:20:23 +0100 Subject: [PATCH 11/38] parallel scoring --- scripts/generic/score-parallel.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 062d0df9c..fbb4d4d02 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -13,7 +13,7 @@ sub GetSourcePhrase($); sub NumStr($); #my $EXTRACT_SPLIT_LINES = 5000000; -my $EXTRACT_SPLIT_LINES = 1000; +my $EXTRACT_SPLIT_LINES = 1000000; print "Started ".localtime() ."\n"; From 2b8eeac75ee86cdd2f9259b73cf1f3aa43b3fdc9 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 24 May 2012 16:47:40 +0100 Subject: [PATCH 12/38] Minor error --- scripts/training/phrase-extract/score.cpp | 2 +- scripts/training/train-model.perl.missing_bin_dir | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp index 1d081a054..8bcc9be3b 100644 --- a/scripts/training/phrase-extract/score.cpp +++ b/scripts/training/phrase-extract/score.cpp @@ -265,7 +265,7 @@ void writeCountOfCounts( const string &fileNameCountOfCounts ) for(int i=1; i<=COC_MAX; i++) { countOfCountsFile << countOfCounts[ i ] << endl; } - countOfCountsFile.close(); + countOfCountsFile.Close(); } void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile ) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index f168f38cb..b50b5f633 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -41,7 +41,7 @@ my $_CORES = 1; my $debug = 0; # debug this script, do not delete any files in debug mode # the following line is set installation time by 'make release'. BEWARE! -my $BINDIR="/Users/hieuhoang/workspace/bin/training-tools"; +my $BINDIR="/home/hieu/workspace/bin/training-tools/"; $_HELP = 1 unless &GetOptions('root-dir=s' => \$_ROOT_DIR, From 042a9cca8d103d9d6daaef316c64ffac0f524efe Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 24 May 2012 17:10:55 +0100 Subject: [PATCH 13/38] delete pt.half after use --- scripts/training/train-model.perl.missing_bin_dir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index b50b5f633..0a15a72db 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1542,7 +1542,7 @@ sub score_phrase_phrase_extract { $cmd .= " --GoodTuring $ttable_file.half.f2e.coc" if $GOOD_TURING; $cmd .= " --KneserNey $ttable_file.half.f2e.coc" if $KNESER_NEY; safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed"; - #if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); } + if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); } } sub score_phrase_memscore { From fcf817fda26a010d9f7563c7b14b28534816d7c3 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 25 May 2012 12:42:11 +0100 Subject: [PATCH 14/38] add --parallel to sorting options --- .../training/train-model.perl.missing_bin_dir | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index f168f38cb..9d04cc1a1 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -19,7 +19,7 @@ if ($SCRIPTS_ROOTDIR eq '') { $SCRIPTS_ROOTDIR =~ s/\/training$//; $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"}); -my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_CORPUS, +my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_SORT_PARALLEL, $_CORPUS, $_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH, $_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_VERBOSE, $_ALIGNMENT, $_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS, @@ -58,8 +58,9 @@ $_HELP = 1 'model-dir=s' => \$_MODEL_DIR, 'temp-dir=s' => \$_TEMP_DIR, 'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE, - 'sort-batch-size=s' => \$_SORT_BATCH_SIZE, + 'sort-batch-size=i' => \$_SORT_BATCH_SIZE, 'sort-compress=s' => \$_SORT_COMPRESS, + 'sort-parallel=i' => \$_SORT_PARALLEL, 'extract-file=s' => \$_EXTRACT_FILE, 'alignment=s' => \$_ALIGNMENT, 'alignment-file=s' => \$_ALIGNMENT_FILE, @@ -233,11 +234,14 @@ my $__SORT_BATCH_SIZE = ""; $__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE; my $__SORT_COMPRESS = ""; -$__SORT_COMPRESS = "--compress-program=$_SORT_COMPRESS" if $_SORT_COMPRESS; +$__SORT_COMPRESS = "--compress-program $_SORT_COMPRESS" if $_SORT_COMPRESS; + +my $__SORT_PARALLEL = ""; +$__SORT_PARALLEL = "--parallel $_SORT_PARALLEL" if $_SORT_PARALLEL; # supporting scripts/binaries from this package my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract"; -$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $PHRASE_EXTRACT"; +$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_EXTRACT"; my $RULE_EXTRACT; if (defined($_GHKM)) { @@ -246,7 +250,7 @@ if (defined($_GHKM)) { else { $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules"; } -$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $RULE_EXTRACT"; +$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $RULE_EXTRACT"; my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score"; my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore"; @@ -255,7 +259,7 @@ my $SYMAL = "$SCRIPTS_ROOTDIR/training/symal/symal"; my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/symal/giza2bal.pl"; my $PHRASE_SCORE = "$SCRIPTS_ROOTDIR/training/phrase-extract/score"; -$PHRASE_SCORE = "$SCRIPTS_ROOTDIR/generic/score-parallel.perl $_CORES \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS\" $PHRASE_SCORE"; +$PHRASE_SCORE = "$SCRIPTS_ROOTDIR/generic/score-parallel.perl $_CORES \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_SCORE"; my $PHRASE_CONSOLIDATE = "$SCRIPTS_ROOTDIR/training/phrase-extract/consolidate"; From 8844be6f25bd87ae20ee19eda6e1aface0f27cc5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 25 May 2012 13:41:06 +0100 Subject: [PATCH 15/38] Move queryOnDiskPt from contrib to OnDiskPt folder. Easier to amend Jam file --- Jamroot | 2 +- OnDiskPt/Jamfile | 3 +++ {contrib/queryOnDiskPt => OnDiskPt}/queryOnDiskPt.cpp | 0 3 files changed, 4 insertions(+), 1 deletion(-) rename {contrib/queryOnDiskPt => OnDiskPt}/queryOnDiskPt.cpp (100%) diff --git a/Jamroot b/Jamroot index 3c0862ab9..8ed134521 100644 --- a/Jamroot +++ b/Jamroot @@ -104,7 +104,7 @@ build-project scripts ; #Regression tests (only does anything if --with-regtest is passed) build-project regression-testing ; -alias programs : lm//query lm//build_binary moses-chart-cmd/src//moses_chart moses-cmd/src//programs OnDiskPt//CreateOnDisk mert//programs contrib/server//mosesserver misc//programs ; +alias programs : lm//query lm//build_binary moses-chart-cmd/src//moses_chart moses-cmd/src//programs OnDiskPt//CreateOnDisk OnDiskPt//queryOnDiskPt mert//programs contrib/server//mosesserver misc//programs ; install-bin-libs programs ; install-headers headers-base : [ glob-tree *.h *.hh : jam-files dist kenlm moses ] : . ; diff --git a/OnDiskPt/Jamfile b/OnDiskPt/Jamfile index f9811c05b..9aa00fcae 100644 --- a/OnDiskPt/Jamfile +++ b/OnDiskPt/Jamfile @@ -1,2 +1,5 @@ lib OnDiskPt : OnDiskWrapper.cpp SourcePhrase.cpp TargetPhrase.cpp Word.cpp Phrase.cpp PhraseNode.cpp TargetPhraseCollection.cpp Vocab.cpp ../moses/src//headers ; + exe CreateOnDisk : Main.cpp ../moses/src//moses OnDiskPt ; +exe queryOnDiskPt : queryOnDiskPt.cpp ../moses/src//moses OnDiskPt ; + diff --git a/contrib/queryOnDiskPt/queryOnDiskPt.cpp b/OnDiskPt/queryOnDiskPt.cpp similarity index 100% rename from contrib/queryOnDiskPt/queryOnDiskPt.cpp rename to OnDiskPt/queryOnDiskPt.cpp From 7d602aff2fcf94e0d4bfdff0e99c162383731c76 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 25 May 2012 13:41:54 +0100 Subject: [PATCH 16/38] Move queryOnDiskPt from contrib to OnDiskPt folder. Easier to amend Jam file --- contrib/queryOnDiskPt/Jamfile | 41 -------------------------------- contrib/queryOnDiskPt/compile.sh | 6 ----- 2 files changed, 47 deletions(-) delete mode 100644 contrib/queryOnDiskPt/Jamfile delete mode 100755 contrib/queryOnDiskPt/compile.sh diff --git a/contrib/queryOnDiskPt/Jamfile b/contrib/queryOnDiskPt/Jamfile deleted file mode 100644 index 6b4895022..000000000 --- a/contrib/queryOnDiskPt/Jamfile +++ /dev/null @@ -1,41 +0,0 @@ -#If you get compilation errors here, make sure you have xmlrpc-c installed properly, including the abyss server option. - -import option ; -import path ; - -with-xmlrpc-c = [ option.get "with-xmlrpc-c" ] ; -if $(with-xmlrpc-c) { - build-moses-server = true ; - xmlrpc-command = $(with-xmlrpc-c)/bin/xmlrpc-c-config ; - if ! [ path.exists $(xmlrpc-command) ] { - exit Could not find $(xmlrpc-command) : 1 ; - } -} else { - xmlrpc-check = [ _shell "xmlrpc-c-config --features 2>/dev/null" : exit-status ] ; - if $(xmlrpc-check[2]) = 0 { - if [ MATCH "(abyss-server)" : $(xmlrpc-check[1]) ] { - build-moses-server = true ; - } else { - echo "Found xmlrpc-c but it does not have abyss-server. Skipping mosesserver." ; - } - } - xmlrpc-command = "xmlrpc-c-config" ; -} - -rule shell_or_die ( cmd ) { - local ret = [ _shell $(cmd) : exit-status ] ; - if $(ret[2]) != 0 { - exit "Failed to run $(cmd)" : 1 ; - } - return $(ret[1]) ; -} - -if $(build-moses-server) = true -{ - xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --libs" ] ; - xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --cflags" ] ; - - exe queryOnDiskPt : queryOnDiskPt.cpp ../../moses/src//moses ../../OnDiskPt//OnDiskPt : $(xmlrpc-linkflags) $(xmlrpc-cxxflags) ; -} else { - alias queryOnDiskPt ; -} diff --git a/contrib/queryOnDiskPt/compile.sh b/contrib/queryOnDiskPt/compile.sh deleted file mode 100755 index 1643e27d1..000000000 --- a/contrib/queryOnDiskPt/compile.sh +++ /dev/null @@ -1,6 +0,0 @@ -SRI=/Users/hieuhoang/workspace/srilm -IRST=/Users/hieuhoang/workspace/irstlm/trunk - -g++ -o queryOnDiskPt queryOnDiskPt.cpp ../../moses/src/PhraseDictionary.cpp -I../../moses/src/ -I../../ -L../../dist/lib/ -I../../OnDiskPt -lmert_lib -ldynsa -lz -lmoses_internal -lOnDiskPt -lLM -lkenlm -lkenutil -lRuleTable -lCYKPlusParser -lScope3Parser -L$SRI/lib/macosx/ -ldstruct -lflm -llattice -lmisc -loolm -L/opt/local/lib -lboost_thread-mt -L$IRST/lib -lirstlm - - From 721ce923ccaa209a4e73fbe9ddadebedfbd96892 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 25 May 2012 15:06:35 +0100 Subject: [PATCH 17/38] add -snt2cooc to train-model.perl. For giza's reduced memory snt2cooc.perl --- .../training/train-model.perl.missing_bin_dir | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index d3748fdc9..3d820a1a7 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -29,7 +29,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_ $_DECODING_GRAPH_BACKOFF, $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE, @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS, - $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_HMM_ALIGN, $_CONFIG, + $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS, $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, @@ -72,6 +72,7 @@ $_HELP = 1 'help' => \$_HELP, 'mgiza' => \$_MGIZA, # multi-thread 'mgiza-cpus=i' => \$_MGIZA_CPUS, # multi-thread + 'snt2cooc=s' => \$_SNT2COOC, # override snt2cooc exe. For when you want to run reduced memory snt2cooc.perl from mgiza 'hmm-align' => \$_HMM_ALIGN, 'final-alignment-model=s' => \$_FINAL_ALIGNMENT_MODEL, # use word alignment model 1/2/hmm/3/4/5 as final (default is 4); value 'hmm' equivalent to the --hmm-align switch 'debug' => \$debug, @@ -185,25 +186,28 @@ my $SNT2COOC; if(!defined $_MGIZA ){ $GIZA = "$BINDIR/GIZA++"; if (-x "$BINDIR/snt2cooc.out") { - $SNT2COOC = "$BINDIR/snt2cooc.out"; + $SNT2COOC = "$BINDIR/snt2cooc.out"; } elsif (-x "$BINDIR/snt2cooc") { # Since "snt2cooc.out" and "snt2cooc" work the same $SNT2COOC = "$BINDIR/snt2cooc"; } print STDERR "Using single-thread GIZA\n"; } else { - $GIZA = "$BINDIR/mgiza"; + $GIZA = "$BINDIR/mgiza"; if (-x "$BINDIR/snt2cooc") { - $SNT2COOC = "$BINDIR/snt2cooc"; - } elsif (-x "$BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $BINDIR - $SNT2COOC = "$BINDIR/snt2cooc.out"; - } + $SNT2COOC = "$BINDIR/snt2cooc"; + } elsif (-x "$BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $BINDIR + $SNT2COOC = "$BINDIR/snt2cooc.out"; + } print STDERR "Using multi-thread GIZA\n"; - if (!defined($_MGIZA_CPUS)) { - $_MGIZA_CPUS=4; - } - die("ERROR: Cannot find $MGIZA_MERGE_ALIGN") unless (-x $MGIZA_MERGE_ALIGN); + if (!defined($_MGIZA_CPUS)) { + $_MGIZA_CPUS=4; + } + die("ERROR: Cannot find $MGIZA_MERGE_ALIGN") unless (-x $MGIZA_MERGE_ALIGN); } +# override +$SNT2COOC = "$BINDIR/$_SNT2COOC" if defined($_SNT2COOC); + my $MKCLS = "$BINDIR/mkcls"; # supporting scripts/binaries from this package From 2fab137aaeeda8077734e4c6e5627bfb44d27691 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 25 May 2012 17:24:08 +0100 Subject: [PATCH 18/38] Fix compile error. --- OnDiskPt/queryOnDiskPt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OnDiskPt/queryOnDiskPt.cpp b/OnDiskPt/queryOnDiskPt.cpp index f7435649a..9a2d97680 100644 --- a/OnDiskPt/queryOnDiskPt.cpp +++ b/OnDiskPt/queryOnDiskPt.cpp @@ -6,7 +6,7 @@ #include #include -#include "util.h" +#include "Util.h" #include "OnDiskWrapper.h" #include "SourcePhrase.h" From 90c0bc9f5ceec4e7d33386ec597fd753e7d23d4a Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 25 May 2012 17:29:47 +0100 Subject: [PATCH 19/38] Add an optional PCFG scoring feature for target syntax models (similar to the p_cfg feature used in Marcu, Wang, Echihabi, and Knight (2006)). --- scripts/Jamfile | 2 + scripts/ems/experiment.meta | 15 +- scripts/ems/experiment.perl | 2 + .../training/phrase-extract/ExtractedRule.h | 2 + scripts/training/phrase-extract/Jamfile | 2 + .../phrase-extract/PhraseAlignment.cpp | 7 +- .../training/phrase-extract/PhraseAlignment.h | 1 + .../phrase-extract/RuleExtractionOptions.h | 2 + .../training/phrase-extract/SyntaxTree.cpp | 3 +- scripts/training/phrase-extract/SyntaxTree.h | 11 +- scripts/training/phrase-extract/XmlTree.cpp | 9 +- .../extract-ghkm/AlignmentGraph.cpp | 4 + .../extract-ghkm/ExtractGHKM.cpp | 5 + .../phrase-extract/extract-ghkm/Node.h | 6 +- .../phrase-extract/extract-ghkm/Options.h | 2 + .../phrase-extract/extract-ghkm/ParseTree.h | 7 +- .../phrase-extract/extract-ghkm/ScfgRule.cpp | 1 + .../phrase-extract/extract-ghkm/ScfgRule.h | 2 + .../extract-ghkm/ScfgRuleWriter.cpp | 69 ++++---- .../extract-ghkm/ScfgRuleWriter.h | 4 +- .../phrase-extract/extract-ghkm/Subgraph.cpp | 16 ++ .../phrase-extract/extract-ghkm/Subgraph.h | 8 +- .../extract-ghkm/XmlTreeParser.cpp | 1 + .../training/phrase-extract/extract-rules.cpp | 32 +++- .../phrase-extract/pcfg-common/Jamfile | 1 + .../phrase-extract/pcfg-common/exception.h | 41 +++++ .../phrase-extract/pcfg-common/numbered_set.h | 109 +++++++++++++ .../phrase-extract/pcfg-common/pcfg.cc | 106 ++++++++++++ .../phrase-extract/pcfg-common/pcfg.h | 61 +++++++ .../phrase-extract/pcfg-common/pcfg_tree.h | 77 +++++++++ .../phrase-extract/pcfg-common/syntax_tree.h | 91 +++++++++++ .../phrase-extract/pcfg-common/tool.cc | 80 +++++++++ .../phrase-extract/pcfg-common/tool.h | 91 +++++++++++ .../phrase-extract/pcfg-common/typedef.h | 37 +++++ .../pcfg-common/xml_tree_parser.cc | 85 ++++++++++ .../pcfg-common/xml_tree_parser.h | 56 +++++++ .../pcfg-common/xml_tree_writer.h | 127 +++++++++++++++ .../phrase-extract/pcfg-extract/Jamfile | 1 + .../phrase-extract/pcfg-extract/main.cc | 25 +++ .../phrase-extract/pcfg-extract/options.h | 36 +++++ .../pcfg-extract/pcfg_extract.cc | 131 +++++++++++++++ .../pcfg-extract/pcfg_extract.h | 42 +++++ .../pcfg-extract/rule_collection.cc | 58 +++++++ .../pcfg-extract/rule_collection.h | 59 +++++++ .../pcfg-extract/rule_extractor.cc | 51 ++++++ .../pcfg-extract/rule_extractor.h | 45 ++++++ .../phrase-extract/pcfg-score/Jamfile | 1 + .../phrase-extract/pcfg-score/main.cc | 25 +++ .../phrase-extract/pcfg-score/options.h | 36 +++++ .../phrase-extract/pcfg-score/pcfg_score.cc | 152 ++++++++++++++++++ .../phrase-extract/pcfg-score/pcfg_score.h | 42 +++++ .../phrase-extract/pcfg-score/tree_scorer.cc | 68 ++++++++ .../phrase-extract/pcfg-score/tree_scorer.h | 47 ++++++ scripts/training/phrase-extract/score.cpp | 23 +++ .../training/train-model.perl.missing_bin_dir | 4 + 55 files changed, 1970 insertions(+), 51 deletions(-) create mode 100644 scripts/training/phrase-extract/pcfg-common/Jamfile create mode 100644 scripts/training/phrase-extract/pcfg-common/exception.h create mode 100644 scripts/training/phrase-extract/pcfg-common/numbered_set.h create mode 100644 scripts/training/phrase-extract/pcfg-common/pcfg.cc create mode 100644 scripts/training/phrase-extract/pcfg-common/pcfg.h create mode 100644 scripts/training/phrase-extract/pcfg-common/pcfg_tree.h create mode 100644 scripts/training/phrase-extract/pcfg-common/syntax_tree.h create mode 100644 scripts/training/phrase-extract/pcfg-common/tool.cc create mode 100644 scripts/training/phrase-extract/pcfg-common/tool.h create mode 100644 scripts/training/phrase-extract/pcfg-common/typedef.h create mode 100644 scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc create mode 100644 scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h create mode 100644 scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/Jamfile create mode 100644 scripts/training/phrase-extract/pcfg-extract/main.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/options.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_collection.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_collection.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_extractor.h create mode 100644 scripts/training/phrase-extract/pcfg-score/Jamfile create mode 100644 scripts/training/phrase-extract/pcfg-score/main.cc create mode 100644 scripts/training/phrase-extract/pcfg-score/options.h create mode 100644 scripts/training/phrase-extract/pcfg-score/pcfg_score.cc create mode 100644 scripts/training/phrase-extract/pcfg-score/pcfg_score.h create mode 100644 scripts/training/phrase-extract/pcfg-score/tree_scorer.cc create mode 100644 scripts/training/phrase-extract/pcfg-score/tree_scorer.h diff --git a/scripts/Jamfile b/scripts/Jamfile index 6fb9bad39..b9eefcffe 100644 --- a/scripts/Jamfile +++ b/scripts/Jamfile @@ -42,6 +42,8 @@ if $(location) { install compactify : training/compact-rule-table//compactify : $(location)/training/compact-rule-table/tools ; install phrase-extract : training/phrase-extract//programs : $(location)/training/phrase-extract ; + install pcfg-extract : training/phrase-extract/pcfg-extract//pcfg-extract : $(location)/training/phrase-extract/pcfg-extract ; + install pcfg-score : training/phrase-extract/pcfg-score//pcfg-score : $(location)/training/phrase-extract/pcfg-score ; install lexical-reordering : training/lexical-reordering//score : $(location)/training/lexical-reordering ; install symal : training/symal//symal : $(location)/training/symal ; diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 51ac0f67a..b33c589d2 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -344,8 +344,21 @@ parse-relax pass-unless: input-parse-relaxer output-parse-relaxer template-if: input-parse-relaxer IN.$input-extension OUT.$input-extension template-if: output-parse-relaxer IN.$output-extension OUT.$output-extension +pcfg-extract + in: parse-relaxed-corpus + out: pcfg + default-name: model/pcfg + ignore-unless: use-pcfg-feature + rerun-on-change: use-pcfg-feature + template: $moses-script-dir/training/phrase-extract/pcfg-extract/pcfg-extract < IN.$output-extension > OUT.$output-extension +pcfg-score + in: parse-relaxed-corpus pcfg + out: scored-corpus + default-name: model/scored-corpus + pass-unless: use-pcfg-feature + template: ln -s IN.$input-extension OUT.$input-extension ; $moses-script-dir/training/phrase-extract/pcfg-score/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension extract-phrases - in: word-alignment parse-relaxed-corpus + in: word-alignment scored-corpus out: extracted-phrases rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm default-name: model/extract diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 59bd2788f..0c61a2a05 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2007,6 +2007,7 @@ sub get_training_setting { my $target_syntax = &get("GENERAL:output-parser"); my $score_settings = &get("TRAINING:score-settings"); my $parallel = &get("TRAINING:parallel"); + my $pcfg = &get("TRAINING:use-pcfg-feature"); my $xml = $source_syntax || $target_syntax; @@ -2029,6 +2030,7 @@ sub get_training_setting { $cmd .= "-glue-grammar " if $hierarchical; $cmd .= "-score-options '".$score_settings."' " if $score_settings; $cmd .= "-parallel " if $parallel; + $cmd .= "-pcfg " if $pcfg; # factored training if (&backoff_and_get("TRAINING:input-factors")) { diff --git a/scripts/training/phrase-extract/ExtractedRule.h b/scripts/training/phrase-extract/ExtractedRule.h index 170ccf892..be6e30836 100644 --- a/scripts/training/phrase-extract/ExtractedRule.h +++ b/scripts/training/phrase-extract/ExtractedRule.h @@ -43,6 +43,7 @@ public: int startS; int endS; float count; + double pcfgScore; std::map > m_ntLengths; @@ -58,6 +59,7 @@ public: , startS(sS) , endS(eS) , count(0) + , pcfgScore(0.0) {} void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) diff --git a/scripts/training/phrase-extract/Jamfile b/scripts/training/phrase-extract/Jamfile index 5ed3f20f1..9be67e80a 100644 --- a/scripts/training/phrase-extract/Jamfile +++ b/scripts/training/phrase-extract/Jamfile @@ -33,3 +33,5 @@ alias programs : extract extract-rules extract-lex score consolidate consolidate install legacy : programs : . EXE ; build-project extract-ghkm ; +build-project pcfg-extract ; +build-project pcfg-score ; diff --git a/scripts/training/phrase-extract/PhraseAlignment.cpp b/scripts/training/phrase-extract/PhraseAlignment.cpp index c0bfbde3e..ceb74f04c 100644 --- a/scripts/training/phrase-extract/PhraseAlignment.cpp +++ b/scripts/training/phrase-extract/PhraseAlignment.cpp @@ -13,6 +13,8 @@ #include "tables-core.h" #include "score.h" +#include + using namespace std; extern Vocabulary vcbT; @@ -111,6 +113,9 @@ void PhraseAlignment::create( char line[], int lineID ) } else if (item == 5) { // non-term lengths addNTLength(token[j]); + } else if (item == 6) { // target syntax PCFG score + float pcfgScore = std::atof(token[j].c_str()); + pcfgSum = pcfgScore * count; } } @@ -119,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID ) if (item == 3) { count = 1.0; } - if (item < 3 || item > 5) { + if (item < 3 || item > 6) { cerr << "ERROR: faulty line " << lineID << ": " << line << endl; } } diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h index 8b8f5115c..8bd83503d 100644 --- a/scripts/training/phrase-extract/PhraseAlignment.h +++ b/scripts/training/phrase-extract/PhraseAlignment.h @@ -25,6 +25,7 @@ protected: void createAlignVec(size_t sourceSize, size_t targetSize); void addNTLength(const std::string &tok); public: + float pcfgSum; float count; std::vector< std::set > alignedToT; std::vector< std::set > alignedToS; diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h index 70bb548c9..f9123de86 100644 --- a/scripts/training/phrase-extract/RuleExtractionOptions.h +++ b/scripts/training/phrase-extract/RuleExtractionOptions.h @@ -45,6 +45,7 @@ public: bool targetSyntax; bool duplicateRules; bool fractionalCounting; + bool pcfgScore; bool outputNTLengths; bool gzOutput; @@ -74,6 +75,7 @@ public: , targetSyntax(false) , duplicateRules(true) , fractionalCounting(true) + , pcfgScore(false) , outputNTLengths(false) , gzOutput(false) {} diff --git a/scripts/training/phrase-extract/SyntaxTree.cpp b/scripts/training/phrase-extract/SyntaxTree.cpp index e181b1e8a..f2783ffd2 100644 --- a/scripts/training/phrase-extract/SyntaxTree.cpp +++ b/scripts/training/phrase-extract/SyntaxTree.cpp @@ -42,11 +42,12 @@ void SyntaxTree::Clear() m_index.clear(); } -void SyntaxTree::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); + return newNode; } ParentNodes SyntaxTree::Parse() diff --git a/scripts/training/phrase-extract/SyntaxTree.h b/scripts/training/phrase-extract/SyntaxTree.h index 0ca5ca472..17c106b49 100644 --- a/scripts/training/phrase-extract/SyntaxTree.h +++ b/scripts/training/phrase-extract/SyntaxTree.h @@ -34,12 +34,14 @@ protected: std::string m_label; std::vector< SyntaxNode* > m_children; SyntaxNode* m_parent; + float m_pcfgScore; public: SyntaxNode( int startPos, int endPos, std::string label ) :m_start(startPos) ,m_end(endPos) ,m_label(label) ,m_parent(0) + ,m_pcfgScore(0.0f) {} int GetStart() const { return m_start; @@ -50,6 +52,12 @@ public: std::string GetLabel() const { return m_label; } + float GetPcfgScore() const { + return m_pcfgScore; + } + void SetPcfgScore(float score) { + m_pcfgScore = score; + } SyntaxNode *GetParent() { return m_parent; } @@ -89,11 +97,12 @@ public: } ~SyntaxTree(); + SyntaxNode *AddNode( int startPos, int endPos, std::string label ); + SyntaxNode *GetTop() { return m_top; } - void AddNode( int startPos, int endPos, std::string label ); ParentNodes Parse(); bool HasNode( int startPos, int endPos ) const; const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; diff --git a/scripts/training/phrase-extract/XmlTree.cpp b/scripts/training/phrase-extract/XmlTree.cpp index 716414f86..b22c159a1 100644 --- a/scripts/training/phrase-extract/XmlTree.cpp +++ b/scripts/training/phrase-extract/XmlTree.cpp @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include "SyntaxTree.h" #include "XmlException.h" @@ -345,13 +345,18 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label string label = ParseXmlTagAttribute(tagContent,"label"); labelCollection.insert( label ); + string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg"); + float pcfgScore = pcfgString == "" ? 0.0f + : std::atof(pcfgString.c_str()); + // report what we have processed so far if (0) { cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; cerr << "XML TAG LABEL IS: '" << label << "'" << endl; cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } - tree.AddNode( startPos, endPos-1, label ); + SyntaxNode *node = tree.AddNode( startPos, endPos-1, label ); + node->SetPcfgScore(pcfgScore); } } } diff --git a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 0ecffae5c..6bd32a13b 100644 --- a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -212,6 +212,10 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root) std::auto_ptr n(new Node(root->GetLabel(), nodeType)); + if (nodeType == TREE) { + n->SetPcfgScore(root->GetPcfgScore()); + } + const std::vector &children = root->GetChildren(); std::vector childNodes; childNodes.reserve(children.size()); diff --git a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 008026e1a..397ce1e3c 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -285,6 +285,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], "set maximum allowed scope") ("Minimal", "extract minimal rules only") + ("PCFG", + "include score based on PCFG scores in target corpus") ("UnknownWordLabel", po::value(&options.unknownWordFile), "write unknown word labels to named file") @@ -361,6 +363,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], if (vm.count("Minimal")) { options.minimal = true; } + if (vm.count("PCFG")) { + options.pcfg = true; + } if (vm.count("UnpairedExtractFormat")) { options.unpairedExtractFormat = true; } diff --git a/scripts/training/phrase-extract/extract-ghkm/Node.h b/scripts/training/phrase-extract/extract-ghkm/Node.h index 228fdc812..775473362 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Node.h +++ b/scripts/training/phrase-extract/extract-ghkm/Node.h @@ -41,8 +41,7 @@ class Node Node(const std::string &label, NodeType type) : m_label(label) , m_type(type) - , m_children() - , m_parents() {} + , m_pcfgScore(0.0f) {} ~Node(); @@ -50,12 +49,14 @@ class Node NodeType GetType() const { return m_type; } const std::vector &GetChildren() const { return m_children; } const std::vector &GetParents() const { return m_parents; } + float GetPcfgScore() const { return m_pcfgScore; } const Span &GetSpan() const { return m_span; } const Span &GetComplementSpan() const { return m_complementSpan; } const std::vector &GetRules() const { return m_rules; } void SetChildren(const std::vector &c) { m_children = c; } void SetParents(const std::vector &p) { m_parents = p; } + void SetPcfgScore(float s) { m_pcfgScore = s; } void SetSpan(const Span &s) { m_span = s; } void SetComplementSpan(const Span &cs) { m_complementSpan = cs; } @@ -92,6 +93,7 @@ class Node NodeType m_type; std::vector m_children; std::vector m_parents; + float m_pcfgScore; Span m_span; Span m_complementSpan; std::vector m_rules; diff --git a/scripts/training/phrase-extract/extract-ghkm/Options.h b/scripts/training/phrase-extract/extract-ghkm/Options.h index 108e19d66..c4b57f311 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Options.h +++ b/scripts/training/phrase-extract/extract-ghkm/Options.h @@ -36,6 +36,7 @@ struct Options { , maxRuleSize(3) , maxScope(3) , minimal(false) + , pcfg(false) , unpairedExtractFormat(false) {} // Positional options @@ -53,6 +54,7 @@ struct Options { int maxRuleSize; int maxScope; bool minimal; + bool pcfg; bool unpairedExtractFormat; std::string unknownWordFile; }; diff --git a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h index ec6fc147a..273e2e04e 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h +++ b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h @@ -32,17 +32,19 @@ class ParseTree public: ParseTree(const std::string &label) : m_label(label) - , m_children() - , m_parent() {} + , m_parent(0) + , m_pcfgScore(0.0) {} ~ParseTree(); const std::string &GetLabel() const { return m_label; } const std::vector &GetChildren() const { return m_children; } const ParseTree *GetParent() const { return m_parent; } + float GetPcfgScore() const { return m_pcfgScore; } void SetParent(ParseTree *); void SetChildren(const std::vector &); + void SetPcfgScore(float score) { m_pcfgScore = score; } void AddChild(ParseTree *); @@ -59,6 +61,7 @@ class ParseTree std::string m_label; std::vector m_children; ParseTree *m_parent; + float m_pcfgScore; // log probability }; template diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp index 8473e4283..5dc70052c 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -30,6 +30,7 @@ namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment) : m_sourceLHS("X", NonTerminal) , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) + , m_pcfgScore(fragment.GetPcfgScore()) { // Source RHS diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h index 1ed534d9e..2405d8fa3 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h @@ -57,6 +57,7 @@ class ScfgRule const std::vector &GetSourceRHS() const { return m_sourceRHS; } const std::vector &GetTargetRHS() const { return m_targetRHS; } const Alignment &GetAlignment() const { return m_alignment; } + float GetPcfgScore() const { return m_pcfgScore; } int Scope() const; @@ -68,6 +69,7 @@ class ScfgRule std::vector m_sourceRHS; std::vector m_targetRHS; Alignment m_alignment; + float m_pcfgScore; }; } // namespace GHKM diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp index 4be3f048d..d5d16b790 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp @@ -24,6 +24,7 @@ #include "ScfgRule.h" #include +#include #include #include #include @@ -34,14 +35,43 @@ namespace GHKM { void ScfgRuleWriter::Write(const ScfgRule &rule) { + std::ostringstream sourceSS; + std::ostringstream targetSS; + if (m_options.unpairedExtractFormat) { - WriteUnpairedFormat(rule); + WriteUnpairedFormat(rule, sourceSS, targetSS); } else { - WriteStandardFormat(rule); + WriteStandardFormat(rule, sourceSS, targetSS); } + + // Write the rule to the forward and inverse extract files. + m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||"; + m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||"; + + const Alignment &alignment = rule.GetAlignment(); + for (Alignment::const_iterator p = alignment.begin(); + p != alignment.end(); ++p) { + m_fwd << " " << p->first << "-" << p->second; + m_inv << " " << p->second << "-" << p->first; + } + + // Write a count of 1 and an empty NT length column to the forward extract + // file. + // TODO Add option to write NT length? + m_fwd << " ||| 1 ||| |||"; + if (m_options.pcfg) { + // Write the PCFG score. + m_fwd << " " << std::exp(rule.GetPcfgScore()); + } + m_fwd << std::endl; + + // Write a count of 1 to the inverse extract file. + m_inv << " ||| 1" << std::endl; } -void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule) +void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule, + std::ostream &sourceSS, + std::ostream &targetSS) { const std::vector &sourceRHS = rule.GetSourceRHS(); const std::vector &targetRHS = rule.GetTargetRHS(); @@ -60,9 +90,6 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule) } } - std::ostringstream sourceSS; - std::ostringstream targetSS; - // Write the source side of the rule to sourceSS. int i = 0; for (std::vector::const_iterator p(sourceRHS.begin()); @@ -88,27 +115,14 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule) targetSS << " "; } WriteSymbol(rule.GetTargetLHS(), targetSS); - - // Write the rule to the forward and inverse extract files. - m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||"; - m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||"; - for (Alignment::const_iterator p(alignment.begin()); - p != alignment.end(); ++p) { - m_fwd << " " << p->first << "-" << p->second; - m_inv << " " << p->second << "-" << p->first; - } - m_fwd << " ||| 1" << std::endl; - m_inv << " ||| 1" << std::endl; } -void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule) +void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule, + std::ostream &sourceSS, + std::ostream &targetSS) { const std::vector &sourceRHS = rule.GetSourceRHS(); const std::vector &targetRHS = rule.GetTargetRHS(); - const Alignment &alignment = rule.GetAlignment(); - - std::ostringstream sourceSS; - std::ostringstream targetSS; // Write the source side of the rule to sourceSS. int i = 0; @@ -127,17 +141,6 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule) targetSS << " "; } WriteSymbol(rule.GetTargetLHS(), targetSS); - - // Write the rule to the forward and inverse extract files. - m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||"; - m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||"; - for (Alignment::const_iterator p(alignment.begin()); - p != alignment.end(); ++p) { - m_fwd << " " << p->first << "-" << p->second; - m_inv << " " << p->second << "-" << p->first; - } - m_fwd << " ||| 1" << std::endl; - m_inv << " ||| 1" << std::endl; } void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out) diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h index 738d09ce9..b92a432a1 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h @@ -45,8 +45,8 @@ class ScfgRuleWriter ScfgRuleWriter(const ScfgRuleWriter &); ScfgRuleWriter &operator=(const ScfgRuleWriter &); - void WriteStandardFormat(const ScfgRule &); - void WriteUnpairedFormat(const ScfgRule &); + void WriteStandardFormat(const ScfgRule &, std::ostream &, std::ostream &); + void WriteUnpairedFormat(const ScfgRule &, std::ostream &, std::ostream &); void WriteSymbol(const Symbol &, std::ostream &); std::ostream &m_fwd; diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp index e5aedbb16..e048f2c55 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp @@ -101,5 +101,21 @@ int Subgraph::CalcDepth(const Node *n) const return maxChildDepth + 1; } +float Subgraph::CalcPcfgScore() const +{ + if (m_root->GetType() != TREE || m_leaves.empty()) { + return 0.0f; + } + float score = m_root->GetPcfgScore(); + for (std::set::const_iterator p = m_leaves.begin(); + p != m_leaves.end(); ++p) { + const Node *leaf = *p; + if (leaf->GetType() == TREE) { + score -= leaf->GetPcfgScore(); + } + } + return score; +} + } // namespace Moses } // namespace GHKM diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h index e84903502..ede1233e9 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h +++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h @@ -38,7 +38,8 @@ class Subgraph : m_root(root) , m_depth(0) , m_size(root->GetType() == TREE ? 1 : 0) - , m_nodeCount(1) {} + , m_nodeCount(1) + , m_pcfgScore(0.0f) {} Subgraph(const Node *root, const std::set &leaves) : m_root(root) @@ -46,10 +47,12 @@ class Subgraph , m_depth(-1) , m_size(-1) , m_nodeCount(-1) + , m_pcfgScore(0.0f) { m_depth = CalcDepth(m_root); m_size = CalcSize(m_root); m_nodeCount = CountNodes(m_root); + m_pcfgScore = CalcPcfgScore(); } const Node *GetRoot() const { return m_root; } @@ -57,6 +60,7 @@ class Subgraph int GetDepth() const { return m_depth; } int GetSize() const { return m_size; } int GetNodeCount() const { return m_nodeCount; } + float GetPcfgScore() const { return m_pcfgScore; } bool IsTrivial() const { return m_leaves.empty(); } @@ -66,6 +70,7 @@ class Subgraph void GetTargetLeaves(const Node *, std::vector &) const; int CalcDepth(const Node *) const; int CalcSize(const Node *) const; + float CalcPcfgScore() const; int CountNodes(const Node *) const; const Node *m_root; @@ -73,6 +78,7 @@ class Subgraph int m_depth; int m_size; int m_nodeCount; + float m_pcfgScore; }; } // namespace GHKM diff --git a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 31c0e3843..cc961dc0c 100644 --- a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -61,6 +61,7 @@ std::auto_ptr XmlTreeParser::ConvertTree( const std::vector &words) { std::auto_ptr root(new ParseTree(tree.GetLabel())); + root->SetPcfgScore(tree.GetPcfgScore()); const std::vector &children = tree.GetChildren(); if (children.empty()) { if (tree.GetStart() != tree.GetEnd()) { diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp index 2cc9dc54d..a00667b82 100644 --- a/scripts/training/phrase-extract/extract-rules.cpp +++ b/scripts/training/phrase-extract/extract-rules.cpp @@ -90,7 +90,7 @@ void addHieroRule( int startT, int endT, int startS, int endS void printHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, LabelIndex &labelIndex); string printTargetHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex); + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore); string printSourceHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, const LabelIndex &labelIndex); void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS @@ -257,6 +257,8 @@ int main(int argc, char* argv[]) // if an source phrase is paired with two target phrases, then count(t|s) = 0.5 else if (strcmp(argv[i],"--NoFractionalCounting") == 0) { options.fractionalCounting = false; + } else if (strcmp(argv[i],"--PCFG") == 0) { + options.pcfgScore = true; } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { options.outputNTLengths = true; #ifdef WITH_THREADS @@ -517,7 +519,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, } string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex) + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore) { HoleList::iterator iterHoleList = holeColl.GetHoles().begin(); assert(iterHoleList != holeColl.GetHoles().end()); @@ -545,6 +547,11 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in out += "[" + sourceLabel + "][" + targetLabel + "] "; + if (m_options.pcfgScore) { + double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore(); + logPCFGScore -= score; + } + currPos = hole.GetEnd(1); hole.SetPos(outPos, 1); ++iterHoleList; @@ -658,8 +665,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex); // target - rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex) + if (m_options.pcfgScore) { + double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore(); + rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore) + " [" + targetLabel + "]"; + rule.pcfgScore = std::exp(logPCFGScore); + } else { + double logPCFGScore = 0.0f; + rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore) + + " [" + targetLabel + "]"; + } // source // holeColl.SortSourceHoles(); @@ -877,6 +892,11 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist rule.target += m_sentence->target[ti] + " "; rule.target += "[" + targetLabel + "]"; + if (m_options.pcfgScore) { + double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore(); + rule.pcfgScore = std::exp(logPCFGScore); + } + // alignment for(int ti=startT; ti<=endT; ti++) { for(unsigned int i=0; ialignedToT[ti].size(); i++) { @@ -957,11 +977,13 @@ void ExtractTask::writeRulesToFile() out << rule->source << " ||| " << rule->target << " ||| " << rule->alignment << " ||| " - << rule->count; + << rule->count << " ||| "; if (m_options.outputNTLengths) { - out << " ||| "; rule->OutputNTLengths(out); } + if (m_options.pcfgScore) { + out << " ||| " << rule->pcfgScore; + } out << "\n"; if (!m_options.onlyDirectFlag) { diff --git a/scripts/training/phrase-extract/pcfg-common/Jamfile b/scripts/training/phrase-extract/pcfg-common/Jamfile new file mode 100644 index 000000000..3dc272a56 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/Jamfile @@ -0,0 +1 @@ +lib pcfg_common : [ glob *.cc ] ..//trees ; diff --git a/scripts/training/phrase-extract/pcfg-common/exception.h b/scripts/training/phrase-extract/pcfg-common/exception.h new file mode 100644 index 000000000..3dbd59d0e --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/exception.h @@ -0,0 +1,41 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXCEPTION_H_ +#define PCFG_EXCEPTION_H_ + +#include + +namespace Moses { +namespace PCFG { + +class Exception { + public: + Exception(const char *msg) : msg_(msg) {} + Exception(const std::string &msg) : msg_(msg) {} + const std::string &msg() const { return msg_; } + private: + std::string msg_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/numbered_set.h b/scripts/training/phrase-extract/pcfg-common/numbered_set.h new file mode 100644 index 000000000..f88d710ed --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/numbered_set.h @@ -0,0 +1,109 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_NUMBERED_SET_H_ +#define PCFG_NUMBERED_SET_H_ + +#include "exception.h" + +#include + +#include +#include +#include + +namespace Moses { +namespace PCFG { + +// Stores a set of elements of type T, each of which is allocated an integral +// ID of type I. IDs are contiguous starting at 0. Individual elements cannot +// be removed once inserted (but the whole set can be cleared). +template +class NumberedSet { + private: + typedef boost::unordered_map ElementToIdMap; + typedef std::vector IdToElementMap; + + public: + typedef I IdType; + typedef typename IdToElementMap::const_iterator const_iterator; + + NumberedSet() {} + + const_iterator begin() const { return id_to_element_.begin(); } + const_iterator end() const { return id_to_element_.end(); } + + // Static value + static I NullId() { return std::numeric_limits::max(); } + + bool Empty() const { return id_to_element_.empty(); } + size_t Size() const { return id_to_element_.size(); } + + // Insert the given object and return its ID. + I Insert(const T &); + + I Lookup(const T &) const; + const T &Lookup(I) const; + + void Clear(); + + private: + ElementToIdMap element_to_id_; + IdToElementMap id_to_element_; +}; + +template +I NumberedSet::Lookup(const T &s) const { + typename ElementToIdMap::const_iterator p = element_to_id_.find(s); + return (p == element_to_id_.end()) ? NullId() : p->second; +} + +template +const T &NumberedSet::Lookup(I id) const { + if (id < 0 || id >= id_to_element_.size()) { + std::ostringstream msg; + msg << "Value not found: " << id; + throw Exception(msg.str()); + } + return *(id_to_element_[id]); +} + +template +I NumberedSet::Insert(const T &x) { + std::pair value(x, id_to_element_.size()); + std::pair result = + element_to_id_.insert(value); + if (result.second) { + // x is a new element. + id_to_element_.push_back(&result.first->first); + } + return result.first->second; +} + +template +void NumberedSet::Clear() { + element_to_id_.clear(); + id_to_element_.clear(); +} + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.cc b/scripts/training/phrase-extract/pcfg-common/pcfg.cc new file mode 100644 index 000000000..d045b820b --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/pcfg.cc @@ -0,0 +1,106 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg.h" + +#include "exception.h" + +#include +#include + +#include + +namespace Moses { +namespace PCFG { + +void Pcfg::Add(const Key &key, double score) { + rules_[key] = score; +} + +bool Pcfg::Lookup(const Key &key, double &score) const { + Map::const_iterator p = rules_.find(key); + if (p == rules_.end()) { + return false; + } + score = p->second; + return true; +} + +void Pcfg::Read(std::istream &input, Vocabulary &vocab) { + std::string line; + std::string lhs_string; + std::vector rhs_strings; + std::string score_string; + Key key; + while (std::getline(input, line)) { + // Read LHS. + size_t pos = line.find("|||"); + if (pos == std::string::npos) { + throw Exception("missing first delimiter"); + } + lhs_string = line.substr(0, pos); + boost::trim(lhs_string); + + // Read RHS. + size_t begin = pos+3; + pos = line.find("|||", begin); + if (pos == std::string::npos) { + throw Exception("missing second delimiter"); + } + std::string rhs_text = line.substr(begin, pos-begin); + boost::trim(rhs_text); + rhs_strings.clear(); + boost::split(rhs_strings, rhs_text, boost::algorithm::is_space(), + boost::algorithm::token_compress_on); + + // Read score. + score_string = line.substr(pos+3); + boost::trim(score_string); + + // Construct key. + key.clear(); + key.reserve(rhs_strings.size()+1); + key.push_back(vocab.Insert(lhs_string)); + for (std::vector::const_iterator p = rhs_strings.begin(); + p != rhs_strings.end(); ++p) { + key.push_back(vocab.Insert(*p)); + } + + // Add rule. + double score = boost::lexical_cast(score_string); + Add(key, score); + } +} + +void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const { + for (const_iterator p = begin(); p != end(); ++p) { + const Key &key = p->first; + double score = p->second; + std::vector::const_iterator q = key.begin(); + std::vector::const_iterator end = key.end(); + output << vocab.Lookup(*q++) << " |||"; + while (q != end) { + output << " " << vocab.Lookup(*q++); + } + output << " ||| " << score << std::endl; + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.h b/scripts/training/phrase-extract/pcfg-common/pcfg.h new file mode 100644 index 000000000..757eea449 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/pcfg.h @@ -0,0 +1,61 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_PCFG_H_ +#define PCFG_PCFG_H_ + +#include "typedef.h" + +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +class Pcfg { + public: + typedef std::vector Key; + typedef std::map Map; + typedef Map::iterator iterator; + typedef Map::const_iterator const_iterator; + + Pcfg() {} + + iterator begin() { return rules_.begin(); } + const_iterator begin() const { return rules_.begin(); } + + iterator end() { return rules_.end(); } + const_iterator end() const { return rules_.end(); } + + void Add(const Key &, double); + bool Lookup(const Key &, double &) const; + void Read(std::istream &, Vocabulary &); + void Write(const Vocabulary &, std::ostream &) const; + + private: + Map rules_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h new file mode 100644 index 000000000..bdac64dfc --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h @@ -0,0 +1,77 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_PCFG_TREE_H_ +#define PCFG_PCFG_TREE_H_ + +#include "syntax_tree.h" +#include "xml_tree_writer.h" + +#include + +namespace Moses { +namespace PCFG { + +template +class PcfgTreeBase : public SyntaxTreeBase { + public: + typedef std::string LabelType; + typedef SyntaxTreeBase BaseType; + + PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {} + + double score() const { return score_; } + void set_score(double s) { score_ = s; } + + private: + double score_; +}; + +class PcfgTree : public PcfgTreeBase { + public: + typedef PcfgTreeBase BaseType; + PcfgTree(const BaseType::LabelType &label) : BaseType(label) {} +}; + +// Specialise XmlOutputHandler for PcfgTree. +template<> +class XmlOutputHandler { + public: + typedef std::map AttributeMap; + + void GetLabel(const PcfgTree &tree, std::string &label) const { + label = tree.label(); + } + + void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const { + attribute_map.clear(); + double score = tree.score(); + if (score != 0.0) { + std::ostringstream out; + out << tree.score(); + attribute_map["pcfg"] = out.str(); + } + } +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/syntax_tree.h b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h new file mode 100644 index 000000000..37f72dd58 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h @@ -0,0 +1,91 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SYNTAX_TREE_H_ +#define PCFG_SYNTAX_TREE_H_ + +#include +#include + +namespace Moses { +namespace PCFG { + +// Base class for SyntaxTree, AgreementTree, and friends. +template +class SyntaxTreeBase { + public: + // Constructors + SyntaxTreeBase(const T &label) + : label_(label) + , children_() + , parent_(0) {} + + SyntaxTreeBase(const T &label, const std::vector &children) + : label_(label) + , children_(children) + , parent_(0) {} + + // Destructor + virtual ~SyntaxTreeBase(); + + const T &label() const { return label_; } + const DerivedType *parent() const { return parent_; } + DerivedType *parent() { return parent_; } + const std::vector &children() const { return children_; } + std::vector &children() { return children_; } + + void set_label(const T &label) { label_ = label; } + void set_parent(DerivedType *parent) { parent_ = parent; } + void set_children(const std::vector &c) { children_ = c; } + + bool IsLeaf() const { return children_.empty(); } + + bool IsPreterminal() const { + return children_.size() == 1 && children_[0]->IsLeaf(); + } + + void AddChild(DerivedType *child) { children_.push_back(child); } + + private: + T label_; + std::vector children_; + DerivedType *parent_; +}; + +template +class SyntaxTree : public SyntaxTreeBase > { + public: + typedef SyntaxTreeBase > BaseType; + SyntaxTree(const T &label) : BaseType(label) {} + SyntaxTree(const T &label, const std::vector &children) + : BaseType(label, children) {} +}; + +template +SyntaxTreeBase::~SyntaxTreeBase() { + for (size_t i = 0; i < children_.size(); ++i) { + delete children_[i]; + } +} + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/tool.cc b/scripts/training/phrase-extract/pcfg-common/tool.cc new file mode 100644 index 000000000..bebd220e1 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/tool.cc @@ -0,0 +1,80 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "tool.h" + +#include + +namespace Moses { +namespace PCFG { + +std::istream &Tool::OpenInputOrDie(const std::string &filename) { + // TODO Check that function is only called once? + if (filename.empty() || filename == "-") { + input_ptr_ = &(std::cin); + } else { + input_file_stream_.open(filename.c_str()); + if (!input_file_stream_) { + std::ostringstream msg; + msg << "failed to open input file: " << filename; + Error(msg.str()); + } + input_ptr_ = &input_file_stream_; + } + return *input_ptr_; +} + +std::ostream &Tool::OpenOutputOrDie(const std::string &filename) { + // TODO Check that function is only called once? + if (filename.empty() || filename == "-") { + output_ptr_ = &(std::cout); + } else { + output_file_stream_.open(filename.c_str()); + if (!output_file_stream_) { + std::ostringstream msg; + msg << "failed to open output file: " << filename; + Error(msg.str()); + } + output_ptr_ = &output_file_stream_; + } + return *output_ptr_; +} + +void Tool::OpenNamedInputOrDie(const std::string &filename, + std::ifstream &stream) { + stream.open(filename.c_str()); + if (!stream) { + std::ostringstream msg; + msg << "failed to open input file: " << filename; + Error(msg.str()); + } +} + +void Tool::OpenNamedOutputOrDie(const std::string &filename, + std::ofstream &stream) { + stream.open(filename.c_str()); + if (!stream) { + std::ostringstream msg; + msg << "failed to open output file: " << filename; + Error(msg.str()); + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-common/tool.h b/scripts/training/phrase-extract/pcfg-common/tool.h new file mode 100644 index 000000000..0af342569 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/tool.h @@ -0,0 +1,91 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_TOOL_H_ +#define PCFG_TOOL_H_ + +#include + +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +class Tool { + public: + virtual ~Tool() {} + + const std::string &name() const { return name_; } + + virtual int Main(int argc, char *argv[]) = 0; + + protected: + Tool(const std::string &name) : name_(name) {} + + // Returns the boost::program_options style that should be used by all tools. + static int CommonOptionStyle() { + namespace cls = boost::program_options::command_line_style; + return cls::default_style & (~cls::allow_guessing); + } + + void Warn(const std::string &msg) const { + std::cerr << name_ << ": warning: " << msg << std::endl; + } + + void Error(const std::string &msg) const { + std::cerr << name_ << ": error: " << msg << std::endl; + std::exit(1); + } + + // Initialises the tool's main input stream and returns a reference that is + // valid for the remainder of the tool's lifetime. If filename is empty or + // "-" then input is standard input; otherwise it is the named file. Calls + // Error() if the file cannot be opened for reading. + std::istream &OpenInputOrDie(const std::string &filename); + + // Initialises the tool's main output stream and returns a reference that is + // valid for the remainder of the tool's lifetime. If filename is empty or + // "-" then output is standard output; otherwise it is the named file. Calls + // Error() if the file cannot be opened for writing. + std::ostream &OpenOutputOrDie(const std::string &filename); + + // Opens the named input file using the supplied ifstream. Calls Error() if + // the file cannot be opened for reading. + void OpenNamedInputOrDie(const std::string &, std::ifstream &); + + // Opens the named output file using the supplied ofstream. Calls Error() if + // the file cannot be opened for writing. + void OpenNamedOutputOrDie(const std::string &, std::ofstream &); + + private: + std::string name_; + std::istream *input_ptr_; + std::ifstream input_file_stream_; + std::ostream *output_ptr_; + std::ofstream output_file_stream_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/typedef.h b/scripts/training/phrase-extract/pcfg-common/typedef.h new file mode 100644 index 000000000..49a12d681 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/typedef.h @@ -0,0 +1,37 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_TYPEDEF_H_ +#define PCFG_TYPEDEF_H_ + +#include "numbered_set.h" +#include "syntax_tree.h" + +#include + +namespace Moses { +namespace PCFG { + +typedef NumberedSet Vocabulary; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc new file mode 100644 index 000000000..5c596a0fb --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc @@ -0,0 +1,85 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "xml_tree_parser.h" + +#include "exception.h" +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + +#include +#include + +namespace Moses { +namespace PCFG { + +XmlTreeParser::XmlTreeParser() +{ +} + +std::auto_ptr XmlTreeParser::Parse(const std::string &line) +{ + m_line = line; + m_tree.Clear(); + try { + if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) { + throw Exception(""); + } + } catch (const XmlException &e) { + throw Exception(e.getMsg()); + } + m_tree.ConnectNodes(); + SyntaxNode *root = m_tree.GetTop(); + assert(root); + m_words = tokenize(m_line.c_str()); + return ConvertTree(*root, m_words); +} + +// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree. +std::auto_ptr XmlTreeParser::ConvertTree( + const SyntaxNode &tree, + const std::vector &words) +{ + std::auto_ptr root(new PcfgTree(tree.GetLabel())); + const std::vector &children = tree.GetChildren(); + if (children.empty()) { + if (tree.GetStart() != tree.GetEnd()) { + std::ostringstream msg; + msg << "leaf node covers multiple words (" << tree.GetStart() + << "-" << tree.GetEnd() << "): this is currently unsupported"; + throw Exception(msg.str()); + } + std::auto_ptr leaf(new PcfgTree(words[tree.GetStart()])); + leaf->set_parent(root.get()); + root->AddChild(leaf.release()); + } else { + for (std::vector::const_iterator p = children.begin(); + p != children.end(); ++p) { + assert(*p); + std::auto_ptr child = ConvertTree(**p, words); + child->set_parent(root.get()); + root->AddChild(child.release()); + } + } + return root; +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h new file mode 100644 index 000000000..6b418c44e --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h @@ -0,0 +1,56 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_XML_TREE_PARSER_H_ +#define PCFG_XML_TREE_PARSER_H_ + +#include "pcfg_tree.h" +#include "SyntaxTree.h" + +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +// Parses a string in Moses' XML parse tree format and returns a PcfgTree +// object. +class XmlTreeParser { + public: + XmlTreeParser(); + std::auto_ptr Parse(const std::string &); + private: + std::auto_ptr ConvertTree(const SyntaxNode &, + const std::vector &); + + std::set m_labelSet; + std::map m_topLabelSet; + std::string m_line; + ::SyntaxTree m_tree; + std::vector m_words; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h new file mode 100644 index 000000000..347c352bb --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h @@ -0,0 +1,127 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_XML_TREE_WRITER_H_ +#define PCFG_XML_TREE_WRITER_H_ + +#include "syntax_tree.h" + +#include "XmlTree.h" + +#include +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +template +class XmlOutputHandler { + public: + typedef std::map AttributeMap; + + void GetLabel(const InputTree &, std::string &) const; + void GetAttributes(const InputTree &, AttributeMap &) const; +}; + +template +class XmlTreeWriter : public XmlOutputHandler { + public: + typedef XmlOutputHandler Base; + void Write(const InputTree &, std::ostream &) const; + private: + std::string Escape(const std::string &) const; +}; + +template +void XmlTreeWriter::Write(const InputTree &tree, + std::ostream &out) const { + assert(!tree.IsLeaf()); + + // Opening tag + + std::string label; + Base::GetLabel(tree, label); + out << "first << "=\"" << p->second << "\""; + } + + out << ">"; + + // Children + + const std::vector &children = tree.children(); + for (typename std::vector::const_iterator p = children.begin(); + p != children.end(); ++p) { + InputTree &child = **p; + if (child.IsLeaf()) { + Base::GetLabel(child, label); + out << " " << Escape(label); + } else { + out << " "; + Write(**p, out); + } + } + + // Closing tag + out << " "; + + if (tree.parent() == 0) { + out << std::endl; + } +} + +// Escapes XML special characters. +template +std::string XmlTreeWriter::Escape(const std::string &s) const { + std::string t; + size_t len = s.size(); + t.reserve(len); + for (size_t i = 0; i < len; ++i) { + if (s[i] == '<') { + t += "<"; + } else if (s[i] == '>') { + t += ">"; + } else if (s[i] == '&') { + t += "&"; + } else if (s[i] == '\'') { + t += "'"; + } else if (s[i] == '"') { + t += """; + } else { + t += s[i]; + } + } + return t; +} + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/Jamfile b/scripts/training/phrase-extract/pcfg-extract/Jamfile new file mode 100644 index 000000000..be91d6d2f --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/Jamfile @@ -0,0 +1 @@ +exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ; diff --git a/scripts/training/phrase-extract/pcfg-extract/main.cc b/scripts/training/phrase-extract/pcfg-extract/main.cc new file mode 100644 index 000000000..47b45afc3 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/main.cc @@ -0,0 +1,25 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_extract.h" + +int main(int argc, char *argv[]) { + Moses::PCFG::PcfgExtract tool; + return tool.Main(argc, argv); +} diff --git a/scripts/training/phrase-extract/pcfg-extract/options.h b/scripts/training/phrase-extract/pcfg-extract/options.h new file mode 100644 index 000000000..3acb31b58 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/options.h @@ -0,0 +1,36 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_OPTIONS_H_ +#define PCFG_EXTRACT_OPTIONS_H_ + +#include + +namespace Moses { +namespace PCFG { + +struct Options { + std::string corpus_file; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc new file mode 100644 index 000000000..151c9959c --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -0,0 +1,131 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_extract.h" + +#include "options.h" +#include "rule_collection.h" +#include "rule_extractor.h" + +#include "pcfg-common/exception.h" +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/syntax_tree.h" +#include "pcfg-common/typedef.h" +#include "pcfg-common/xml_tree_parser.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +int PcfgExtract::Main(int argc, char *argv[]) { + // Process command-line options. + Options options; + ProcessOptions(argc, argv, options); + + // Extract PCFG rules from corpus. + Vocabulary non_term_vocab; + RuleExtractor rule_extractor(non_term_vocab); + RuleCollection rule_collection; + XmlTreeParser parser; + std::string line; + size_t line_num = 0; + std::auto_ptr tree; + while (std::getline(std::cin, line)) { + ++line_num; + try { + tree = parser.Parse(line); + } catch (Exception &e) { + std::ostringstream msg; + msg << "line " << line_num << ": " << e.msg(); + Error(msg.str()); + } + if (!tree.get()) { + std::ostringstream msg; + msg << "no tree at line " << line_num; + Warn(msg.str()); + continue; + } + rule_extractor.Extract(*tree, rule_collection); + } + + // Score rules and write PCFG to output. + Pcfg pcfg; + rule_collection.CreatePcfg(pcfg); + pcfg.Write(non_term_vocab, std::cout); + + return 0; +} + +void PcfgExtract::ProcessOptions(int argc, char *argv[], + Options &options) const { + namespace po = boost::program_options; + + std::ostringstream usage_top; + usage_top << "Usage: " << name() << "\n\n" << "Options"; + + // Declare the command line options that are visible to the user. + po::options_description visible(usage_top.str()); + visible.add_options() + ("help", "print help message and exit") + ; + + // Declare the command line options that are hidden from the user + // (these are used as positional options). + po::options_description hidden("Hidden options"); + hidden.add_options(); + + // Compose the full set of command-line options. + po::options_description cmd_line_options; + cmd_line_options.add(visible).add(hidden); + + // Register the positional options. + po::positional_options_description p; + + // Process the command-line. + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). + options(cmd_line_options).positional(p).run(), vm); + po::notify(vm); + } catch (const std::exception &e) { + std::ostringstream msg; + msg << e.what() << "\n\n" << visible; + Error(msg.str()); + } + + if (vm.count("help")) { + std::cout << visible << std::endl; + std::exit(0); + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h new file mode 100644 index 000000000..1af6cb4fe --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h @@ -0,0 +1,42 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_ +#define PCFG_EXTRACT_PCFG_EXTRACT_H_ + +#include "pcfg-common/tool.h" + +namespace Moses { +namespace PCFG { + +class Options; + +class PcfgExtract : public Tool { + public: + PcfgExtract() : Tool("pcfg-extract") {} + virtual int Main(int, char *[]); + private: + void ProcessOptions(int, char *[], Options &) const; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc new file mode 100644 index 000000000..503b1a9e6 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc @@ -0,0 +1,58 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "rule_collection.h" + +#include "pcfg-common/pcfg.h" + +#include + +namespace Moses { +namespace PCFG { + +void RuleCollection::Add(size_t lhs, const std::vector &rhs) { + ++collection_[lhs][rhs]; +} + +void RuleCollection::CreatePcfg(Pcfg &pcfg) { + std::vector key; + for (const_iterator p = begin(); p != end(); ++p) { + size_t lhs = p->first; + const RhsCountMap &rhs_counts = p->second; + size_t total = 0; + for (RhsCountMap::const_iterator q = rhs_counts.begin(); + q != rhs_counts.end(); ++q) { + total += q->second; + } + for (RhsCountMap::const_iterator q = rhs_counts.begin(); + q != rhs_counts.end(); ++q) { + const std::vector &rhs = q->first; + size_t count = q->second; + double score = std::log(static_cast(count) / + static_cast(total)); + key.clear(); + key.push_back(lhs); + key.insert(key.end(), rhs.begin(), rhs.end()); + pcfg.Add(key, score); + } + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.h b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h new file mode 100644 index 000000000..1b768dd21 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h @@ -0,0 +1,59 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_ +#define PCFG_EXTRACT_RULE_COLLECTION_H_ + +#include "pcfg-common/pcfg.h" + +#include + +#include + +namespace Moses { +namespace PCFG { + +// Contains PCFG rules and their counts. +class RuleCollection { + public: + typedef boost::unordered_map, size_t> RhsCountMap; + typedef boost::unordered_map Map; + typedef Map::iterator iterator; + typedef Map::const_iterator const_iterator; + + RuleCollection() {} + + iterator begin() { return collection_.begin(); } + const_iterator begin() const { return collection_.begin(); } + + iterator end() { return collection_.end(); } + const_iterator end() const { return collection_.end(); } + + void Add(size_t, const std::vector &); + void CreatePcfg(Pcfg &); + + private: + Map collection_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc new file mode 100644 index 000000000..48a82a6d0 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc @@ -0,0 +1,51 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "rule_extractor.h" + +#include "pcfg-common/pcfg_tree.h" + +namespace Moses { +namespace PCFG { + +RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab) + : non_term_vocab_(non_term_vocab) { +} + +void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const { + if (tree.IsPreterminal() || tree.IsLeaf()) { + return; + } + + size_t lhs = non_term_vocab_.Insert(tree.label()); + std::vector rhs; + + const std::vector &children = tree.children(); + rhs.reserve(children.size()); + for (std::vector::const_iterator p(children.begin()); + p != children.end(); ++p) { + const PcfgTree &child = **p; + rhs.push_back(non_term_vocab_.Insert(child.label())); + Extract(child, rc); + } + rc.Add(lhs, rhs); +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h new file mode 100644 index 000000000..6bcffbc61 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h @@ -0,0 +1,45 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_ +#define PCFG_EXTRACT_RULE_EXTRACTOR_H_ + +#include "rule_collection.h" + +#include "pcfg-common/typedef.h" + +namespace Moses { +namespace PCFG { + +class PcfgTree; + +// Extracts PCFG rules from syntax trees and adds them to a RuleCollection. +class RuleExtractor { + public: + RuleExtractor(Vocabulary &); + void Extract(const PcfgTree &, RuleCollection &) const; + private: + Vocabulary &non_term_vocab_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-score/Jamfile b/scripts/training/phrase-extract/pcfg-score/Jamfile new file mode 100644 index 000000000..7225381c0 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/Jamfile @@ -0,0 +1 @@ +exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ; diff --git a/scripts/training/phrase-extract/pcfg-score/main.cc b/scripts/training/phrase-extract/pcfg-score/main.cc new file mode 100644 index 000000000..da5392add --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/main.cc @@ -0,0 +1,25 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_score.h" + +int main(int argc, char *argv[]) { + Moses::PCFG::PcfgScore tool; + return tool.Main(argc, argv); +} diff --git a/scripts/training/phrase-extract/pcfg-score/options.h b/scripts/training/phrase-extract/pcfg-score/options.h new file mode 100644 index 000000000..e54b2a0b9 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/options.h @@ -0,0 +1,36 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SCORE_OPTIONS_H_ +#define PCFG_SCORE_OPTIONS_H_ + +#include + +namespace Moses { +namespace PCFG { + +struct Options { + std::string pcfg_file; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc new file mode 100644 index 000000000..d780200ad --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc @@ -0,0 +1,152 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_score.h" + +#include "options.h" +#include "tree_scorer.h" + +#include "pcfg-common/exception.h" +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/syntax_tree.h" +#include "pcfg-common/typedef.h" +#include "pcfg-common/xml_tree_parser.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +int PcfgScore::Main(int argc, char *argv[]) { + // Process command-line options. + Options options; + ProcessOptions(argc, argv, options); + + // Open PCFG stream. + std::ifstream pcfg_stream; + OpenNamedInputOrDie(options.pcfg_file, pcfg_stream); + + // Read PCFG. + Pcfg pcfg; + Vocabulary non_term_vocab; + pcfg.Read(pcfg_stream, non_term_vocab); + + // Score corpus according to PCFG. + TreeScorer scorer(pcfg, non_term_vocab); + XmlTreeParser parser; + XmlTreeWriter writer; + std::string line; + size_t line_num = 0; + std::auto_ptr tree; + while (std::getline(std::cin, line)) { + ++line_num; + try { + tree = parser.Parse(line); + } catch (Exception &e) { + std::ostringstream msg; + msg << "line " << line_num << ": " << e.msg(); + Error(msg.str()); + } + if (!tree.get()) { + std::ostringstream msg; + msg << "no tree at line " << line_num; + Warn(msg.str()); + std::cout << std::endl; + continue; + } + if (!scorer.Score(*tree)) { + std::ostringstream msg; + msg << "failed to score tree at line " << line_num; + Warn(msg.str()); + std::cout << std::endl; + continue; + } + writer.Write(*tree, std::cout); + } + + return 0; +} + +void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const { + namespace po = boost::program_options; + + std::ostringstream usage_top; + usage_top << "Usage: " << name() << " PCFG\n\n" + << "Options"; + + // Declare the command line options that are visible to the user. + po::options_description visible(usage_top.str()); + visible.add_options() + ("help", "print help message and exit") + ; + + // Declare the command line options that are hidden from the user + // (these are used as positional options). + po::options_description hidden("Hidden options"); + hidden.add_options() + ("pcfg-file", po::value(&options.pcfg_file), "pcfg file") + ; + + // Compose the full set of command-line options. + po::options_description cmd_line_options; + cmd_line_options.add(visible).add(hidden); + + // Register the positional options. + po::positional_options_description p; + p.add("pcfg-file", 1); + + // Process the command-line. + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). + options(cmd_line_options).positional(p).run(), vm); + po::notify(vm); + } catch (const std::exception &e) { + std::ostringstream msg; + msg << e.what() << "\n\n" << visible; + Error(msg.str()); + } + + if (vm.count("help")) { + std::cout << visible << std::endl; + std::exit(0); + } + + // Check positional options were given. + + if (!vm.count("pcfg-file")) { + std::ostringstream msg; + msg << "missing required argument\n\n" << visible << std::endl; + Error(msg.str()); + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.h b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h new file mode 100644 index 000000000..5e506c39d --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h @@ -0,0 +1,42 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SCORE_PCFG_SCORE_H_ +#define PCFG_SCORE_PCFG_SCORE_H_ + +#include "pcfg-common/tool.h" + +namespace Moses { +namespace PCFG { + +class Options; + +class PcfgScore : public Tool { + public: + PcfgScore() : Tool("pcfg-score") {} + virtual int Main(int, char *[]); + private: + void ProcessOptions(int, char *[], Options &) const; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc new file mode 100644 index 000000000..5f695e4fc --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc @@ -0,0 +1,68 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "tree_scorer.h" + +#include + +namespace Moses { +namespace PCFG { + +TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab) + : pcfg_(pcfg) + , non_term_vocab_(non_term_vocab) { +} + +bool TreeScorer::Score(PcfgTree &root) const { + if (root.IsPreterminal() || root.IsLeaf()) { + return true; + } + + const std::vector &children = root.children(); + + double log_prob = 0.0; + + std::vector key; + key.reserve(children.size()+1); + key.push_back(non_term_vocab_.Lookup(root.label())); + + for (std::vector::const_iterator p(children.begin()); + p != children.end(); ++p) { + PcfgTree *child = *p; + assert(!child->IsLeaf()); + key.push_back(non_term_vocab_.Lookup(child->label())); + if (!Score(*child)) { + return false; + } + if (!child->IsPreterminal()) { + log_prob += child->score(); + } + } + double rule_score; + bool found = pcfg_.Lookup(key, rule_score); + if (!found) { + return false; + } + log_prob += rule_score; + root.set_score(log_prob); + return true; +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.h b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h new file mode 100644 index 000000000..36f4e1e99 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h @@ -0,0 +1,47 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SCORE_TREE_SCORER_H_ +#define PCFG_SCORE_TREE_SCORER_H_ + +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/typedef.h" + +namespace Moses { +namespace PCFG { + +class TreeScorer { + public: + TreeScorer(const Pcfg &, const Vocabulary &); + + // Score tree according to PCFG. Returns false if unsuccessful (due to + // missing rule). + bool Score(PcfgTree &) const; + + private: + const Pcfg &pcfg_; + const Vocabulary &non_term_vocab_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp index 8bcc9be3b..c5fb0b99f 100644 --- a/scripts/training/phrase-extract/score.cpp +++ b/scripts/training/phrase-extract/score.cpp @@ -72,6 +72,7 @@ void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs LexicalTable lexTable; bool inverseFlag = false; bool hierarchicalFlag = false; +bool pcfgFlag = false; bool wordAlignmentFlag = false; bool goodTuringFlag = false; bool kneserNeyFlag = false; @@ -108,6 +109,9 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; + } else if (strcmp(argv[i],"--PCFG") == 0) { + pcfgFlag = true; + cerr << "including PCFG scores\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; @@ -193,6 +197,7 @@ int main(int argc, char* argv[]) // loop through all extracted phrase translations float lastCount = 0.0f; + float lastPcfgSum = 0.0f; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; @@ -207,6 +212,7 @@ int main(int argc, char* argv[]) // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->count += lastCount; + lastPhrasePair->pcfgSum += lastPcfgSum; continue; } strcpy( lastLine, line ); @@ -215,10 +221,12 @@ int main(int argc, char* argv[]) PhraseAlignment phrasePair; phrasePair.create( line, i ); lastCount = phrasePair.count; + lastPcfgSum = phrasePair.pcfgSum; // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; + lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } @@ -438,6 +446,16 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo countOfCounts[ countInt ]++; } + // compute PCFG score + float pcfgScore; + if (pcfgFlag && !inverseFlag) { + float pcfgSum = 0; + for(size_t i=0; ipcfgSum; + } + pcfgScore = pcfgSum / count; + } + // output phrases const PHRASE &phraseS = phrasePair[0]->GetSource(); const PHRASE &phraseT = phrasePair[0]->GetTarget(); @@ -493,6 +511,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty ); } + // target-side PCFG score + if (pcfgFlag && !inverseFlag) { + phraseTableFile << " " << pcfgScore; + } + phraseTableFile << " ||| "; // alignment info for non-terminals diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 1a7cb3a39..41ea2d682 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -105,6 +105,7 @@ $_HELP = 1 'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE, 'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE, 'ghkm' => \$_GHKM, + 'pcfg' => \$_PCFG, 'extract-options=s' => \$_EXTRACT_OPTIONS, 'score-options=s' => \$_SCORE_OPTIONS, 'source-syntax' => \$_SOURCE_SYNTAX, @@ -1373,6 +1374,7 @@ sub extract_phrase { $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file"; $cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR; $cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE); + $cmd .= " --PCFG" if $_PCFG; if (!defined($_GHKM)) { $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX; $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX; @@ -1503,6 +1505,7 @@ sub score_phrase_phrase_extract { $cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT; $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT; $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL; + $cmd .= " --PCFG" if $_PCFG; $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); # sorting @@ -1801,6 +1804,7 @@ sub create_ini { $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/; $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature $basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature + $basic_weight_count++ if $_PCFG; foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) { $num_of_ttables++; my $ff = $f; From a72744c49b7821bf0355e7fe4638c392a74b0d60 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 25 May 2012 17:39:21 +0100 Subject: [PATCH 20/38] Fix bug in previous commit. --- scripts/training/train-model.perl.missing_bin_dir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 41ea2d682..869f979fc 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -30,7 +30,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_ $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE, @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS, $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, - $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS, + $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS, $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS, From 180dd773f6507829c551c5512aaad7128f958385 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Sat, 26 May 2012 00:06:34 +0100 Subject: [PATCH 21/38] bolt specific settings --- scripts/ems/experiment.meta | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 51ac0f67a..aed6049ea 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -738,6 +738,20 @@ ibm-bleu-c ignore-unless: ibm-bleu-c rerun-on-change: ibm-bleu-c template: $ibm-bleu-c -s $input-sgm -r IN1 -t IN > OUT +bolt-bleu + in: detokenized-output + out: bolt-bleu-score + default-name: evaluation/bolt-bleu + ignore-unless: bolt-bleu + rerun-on-change: bolt-bleu + template: $bolt-bleu IN > OUT +bolt-bleu-c + in: detokenized-output + out: bolt-bleu-c-score + default-name: evaluation/bolt-bleu-c + ignore-unless: bolt-bleu-c + rerun-on-change: bolt-bleu-c + template: $bolt-bleu-c IN > OUT multi-bleu in: cleaned-output reference out: multi-bleu-score @@ -793,6 +807,6 @@ analysis-precision [REPORTING] single report - in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model + in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model out: report default-name: evaluation/report From 561b9ac9567d3e5b0bbc56fdae3b29961b8bc728 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Sat, 26 May 2012 00:09:50 +0100 Subject: [PATCH 22/38] minor fixes --- scripts/ems/experiment.machines | 3 ++- scripts/ems/experiment.perl | 2 +- .../generic-multicore-parallelizer.perl | 15 +++++++++++--- .../ems/support/report-experiment-scores.perl | 20 +++++++++++++++++++ scripts/tokenizer/deescape-special-chars.perl | 2 ++ scripts/tokenizer/detokenizer.perl | 7 +++++-- scripts/tokenizer/escape-special-chars.perl | 8 +++++--- scripts/tokenizer/tokenizer.perl | 4 ++-- scripts/training/mert-moses.pl | 4 ++-- scripts/training/phrase-extract/XmlTree.cpp | 10 ++++++++++ 10 files changed, 61 insertions(+), 14 deletions(-) diff --git a/scripts/ems/experiment.machines b/scripts/ems/experiment.machines index 9e0294d60..7fdecd9cd 100644 --- a/scripts/ems/experiment.machines +++ b/scripts/ems/experiment.machines @@ -1,3 +1,4 @@ cluster: townhill seville hermes lion seville sannox lutzow frontend -multicore-8: tyr thor odin crom saxnot vali vili freyja bragi hoenir +multicore-8: tyr thor odin crom +multicore-16: saxnot vali vili freyja bragi hoenir multicore-24: syn hel skaol saga diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 59bd2788f..45537681a 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1020,7 +1020,7 @@ sub execute_steps { } } - print "number of steps doable or running: ".(scalar keys %DO)."\n"; + print "number of steps doable or running: ".(scalar keys %DO)." at ".`date`; foreach my $step (keys %DO) { print "\t".($DO{$step}==2?"running: ":"doable: ").$DO_STEP[$step]."\n"; } return unless scalar keys %DO; diff --git a/scripts/ems/support/generic-multicore-parallelizer.perl b/scripts/ems/support/generic-multicore-parallelizer.perl index 862536137..d7e030ad2 100755 --- a/scripts/ems/support/generic-multicore-parallelizer.perl +++ b/scripts/ems/support/generic-multicore-parallelizer.perl @@ -3,6 +3,7 @@ use strict; my $cores = 8; +my $serial = 1; my ($infile,$outfile,$cmd,$tmpdir); my $parent = $$; @@ -12,6 +13,7 @@ GetOptions('cores=i' => \$cores, 'in=s' => \$infile, 'out=s' => \$outfile, 'cmd=s' => \$cmd, + 'serial=i' => \$serial ) or exit(1); die("ERROR: specify command with -cmd") unless $cmd; @@ -24,8 +26,9 @@ die("ERROR: you need to specify a tempdir with -tmpdir") unless $tmpdir; # create split input files my $sentenceN = `cat $infile | wc -l`; -my $splitN = int(($sentenceN+$cores-0.5) / $cores); -`split -a 2 -l $splitN $infile $tmpdir/in-$parent-`; +my $splitN = int(($sentenceN+($cores*$serial)-0.5) / ($cores*$serial)); +print STDERR "split -a 3 -l $splitN $infile $tmpdir/in-$parent-\n"; +`split -a 4 -l $splitN $infile $tmpdir/in-$parent-`; # find out the names of the processes my @CORE=`ls $tmpdir/in-$parent-*`; @@ -33,17 +36,23 @@ chomp(@CORE); grep(s/.+in\-\d+\-([a-z]+)$/$1/e,@CORE); # create core scripts -foreach my $core (@CORE){ +for(my $i=0;$i$tmpdir/core-$parent-$core.bash") or die "Cannot open: $!"; print BASH "#bash\n\n"; # print BASH "export PATH=$ENV{PATH}\n\n"; printf BASH $cmd."\n", "$tmpdir/in-$parent-$core", "$tmpdir/out-$parent-$core"; + for(my $j=2;$j<=$serial;$j++) { + $core = $CORE[++$i]; + printf BASH $cmd."\n", "$tmpdir/in-$parent-$core", "$tmpdir/out-$parent-$core"; + } close(BASH); } # fork processes my (@CHILDREN); foreach my $core (@CORE){ + next unless -e "$tmpdir/core-$parent-$core.bash"; my $child = fork(); if (! $child) { # I am child print STDERR "running child $core\n"; diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl index 2efd86517..e881ec17a 100755 --- a/scripts/ems/support/report-experiment-scores.perl +++ b/scripts/ems/support/report-experiment-scores.perl @@ -14,6 +14,10 @@ $TYPE{"multi-bleu-c"}= "BLEU-c"; $TYPE{"ibm-bleu"} = "IBM"; $TYPE{"ibm-bleu-c"} = "IBM-c"; $TYPE{"meteor"} = "METEOR"; +$TYPE{"bolt-bleu"} = "BLEU"; +$TYPE{"bolt-bleu-c"} = "BLEU-c"; +$TYPE{"bolt-ter"} = "TER"; +$TYPE{"bolt-ter-c"} = "TER-c"; my %SCORE; my %AVERAGE; @@ -60,6 +64,9 @@ sub process { elsif ($type eq 'meteor') { $SCORE{$set} .= &extract_meteor($file,$type)." "; } + elsif ($type =~ /^bolt-(.+)$/) { + $SCORE{$set} .= &extract_bolt($file,$1)." "; + } } sub extract_nist_bleu { @@ -115,6 +122,19 @@ sub extract_multi_bleu { return $output.$TYPE{$type}; } +sub extract_bolt { + my ($file,$type) = @_; + my $score; + foreach (`cat $file`) { + $score = $1 if $type eq 'bleu' && /Lowercase BLEU\s+([\d\.]+)/; + $score = $1 if $type eq 'bleu-c' && /Cased BLEU\s+([\d\.]+)/; + $score = $1 if $type eq 'ter' && /Lowercase TER\s+([\d\.]+)/; + $score = $1 if $type eq 'ter-c' && /Cased TER\s+([\d\.]+)/; + } + my $output = sprintf("%.02f ",$score*100); + $AVERAGE{"bolt-".$type} += $score*100; + return $output.$TYPE{"bolt-".$type}; +} sub extract_meteor { my ($file,$type) = @_; my ($meteor, $precision); diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl index c98e01ccc..55035ae6d 100755 --- a/scripts/tokenizer/deescape-special-chars.perl +++ b/scripts/tokenizer/deescape-special-chars.perl @@ -8,6 +8,8 @@ while() { s/\>/\>/g; s/\&bra;/\[/g; s/\&ket;/\]/g; + s/\[/\[/g; + s/\]/\]/g; s/\&/\&/g; print $_; } diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl index e2d7ea0bb..e55a1a26e 100755 --- a/scripts/tokenizer/detokenizer.perl +++ b/scripts/tokenizer/detokenizer.perl @@ -33,8 +33,9 @@ if ($HELP) { exit; } -die "No built-in rules for language $language, claim en for default behaviour." - if $language !~ /^(cs|en|fr|it)$/; +if ($language !~ /^(cs|en|fr|it)$/) { + print STDERR "Warning: No built-in rules for language $language.\n" +} if (!$QUIET) { print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n"; @@ -70,6 +71,8 @@ sub detokenize { $text =~ s/\>/\>/g; $text =~ s/\&bra;/\[/g; $text =~ s/\&ket;/\]/g; + $text =~ s/\[/\[/g; + $text =~ s/\]/\]/g; $text =~ s/\&/\&/g; my $word; diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl index 5c4dc9bb3..f4c1b4dd5 100755 --- a/scripts/tokenizer/escape-special-chars.perl +++ b/scripts/tokenizer/escape-special-chars.perl @@ -6,18 +6,20 @@ while() { chop; # avoid general madness + s/[\000-\037]//g; s/\s+/ /g; s/^ //g; s/ $//g; - s/[\000-\037]//g; # special characters in moses s/\&/\&/g; s/\|/\&bar;/g; s/\/\>/g; - s/\[/\&bra;/g; - s/\]/\&ket;/g; + s/\[/\[/g; + s/\]/\]/g; + # restore xml instructions + s/\<(\S+) translation="([^\"]+)"> (.+?) <\/(\S+)>/\<$1 translation=\"$2\"> $3 <\/$4>/g; print $_."\n"; } diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index a97d5e160..70bb318f7 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -153,8 +153,8 @@ sub tokenize { $text =~ s/\|/\&bar;/g; $text =~ s/\/\>/g; - $text =~ s/\[/\&bra;/g; - $text =~ s/\]/\&ket;/g; + $text =~ s/\[/\[/g; + $text =~ s/\]/\]/g; #ensure final line break $text .= "\n" unless $text =~ /\n$/; diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 6ce8341c0..2abd5ef7c 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -1089,7 +1089,7 @@ sub get_order_of_scores_from_nbestlist { # return the score labels in order my $fname_or_source = shift; # print STDERR "Peeking at the beginning of nbestlist to get order of scores: $fname_or_source\n"; - open my $fh, '<', $fname_or_source or die "Failed to get order of scores from nbestlist '$fname_or_source': $!"; + open my $fh, $fname_or_source or die "Failed to get order of scores from nbestlist '$fname_or_source': $!"; my $line = <$fh>; close $fh; die "Line empty in nbestlist '$fname_or_source'" if !defined $line; @@ -1169,7 +1169,7 @@ sub create_config { } if (defined($sparse_weights_file)) { - push @{$P{"weights-file"}}, File::Spec->catfile($___WORKING_DIR, $sparse_weights_file); + push @{$P{"weight-file"}}, File::Spec->catfile($___WORKING_DIR, $sparse_weights_file); } # create new moses.ini decoder config file by cloning and overriding the original one diff --git a/scripts/training/phrase-extract/XmlTree.cpp b/scripts/training/phrase-extract/XmlTree.cpp index 716414f86..19825c02c 100644 --- a/scripts/training/phrase-extract/XmlTree.cpp +++ b/scripts/training/phrase-extract/XmlTree.cpp @@ -128,6 +128,16 @@ string unescape(const string& str) s += string("<"); } else if (name == "gt") { s += string(">"); + } else if (name == "#91") { + s += string("["); + } else if (name == "#93") { + s += string("]"); + } else if (name == "bra") { + s += string("["); + } else if (name == "ket") { + s += string("]"); + } else if (name == "bar") { + s += string("|"); } else if (name == "amp") { s += string("&"); } else if (name == "apos") { From 145df588c2e03624156a38276b8bb339d8067c35 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Sat, 26 May 2012 12:15:34 +0100 Subject: [PATCH 23/38] Fix some input handling bugs in pcfg-extract and pcfg-score. --- .../training/phrase-extract/pcfg-common/xml_tree_parser.cc | 5 ++++- .../training/phrase-extract/pcfg-common/xml_tree_writer.h | 6 ++++++ scripts/training/phrase-extract/pcfg-score/pcfg_score.cc | 4 ++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc index 5c596a0fb..fd9d11334 100644 --- a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc @@ -47,7 +47,10 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) } m_tree.ConnectNodes(); SyntaxNode *root = m_tree.GetTop(); - assert(root); + if (!root) { + // There is no XML tree. + return std::auto_ptr(); + } m_words = tokenize(m_line.c_str()); return ConvertTree(*root, m_words); } diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h index 347c352bb..c5171a905 100644 --- a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h @@ -108,6 +108,12 @@ std::string XmlTreeWriter::Escape(const std::string &s) const { t += "<"; } else if (s[i] == '>') { t += ">"; + } else if (s[i] == '[') { + t += "["; + } else if (s[i] == ']') { + t += "]"; + } else if (s[i] == '|') { + t += "&bar;"; } else if (s[i] == '&') { t += "&"; } else if (s[i] == '\'') { diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc index d780200ad..16691707b 100644 --- a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc +++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc @@ -78,14 +78,14 @@ int PcfgScore::Main(int argc, char *argv[]) { std::ostringstream msg; msg << "no tree at line " << line_num; Warn(msg.str()); - std::cout << std::endl; + std::cout << line << std::endl; continue; } if (!scorer.Score(*tree)) { std::ostringstream msg; msg << "failed to score tree at line " << line_num; Warn(msg.str()); - std::cout << std::endl; + std::cout << line << std::endl; continue; } writer.Write(*tree, std::cout); From 4c90c88733ed2bfa5a131394f1e75eb2adf0a863 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sat, 26 May 2012 12:21:32 +0100 Subject: [PATCH 24/38] make phrase-table.half naming consistent. Requested by Phil Williams --- scripts/generic/score-parallel.perl | 11 ++--------- scripts/training/train-model.perl.missing_bin_dir | 10 +++++----- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index fbb4d4d02..b399a83ba 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -152,7 +152,7 @@ $cmd = "\n\nOH SHIT. This should have been filled in \n\n"; if ($fileCount == 1 && !$doSort) { my $numStr = NumStr(0); - $cmd = "mv $TMPDIR/phrase-table.half.$numStr.gz $ptHalf.gz \n"; + $cmd = "mv $TMPDIR/phrase-table.half.$numStr.gz $ptHalf"; } else { @@ -162,14 +162,7 @@ else $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR "; } - $cmd .= " | gzip -c >"; - - if ($doSort) { - $cmd .= " $ptHalf.sorted.gz \n"; - } - else { - $cmd .= " $ptHalf.gz \n"; - } + $cmd .= " | gzip -c > $ptHalf"; } print STDERR $cmd; systemCheck($cmd); diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 869f979fc..61e49970f 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1486,7 +1486,7 @@ sub score_phrase_phrase_extract { if ($pid == 0) { next if $___CONTINUE && -e "$ttable_file.half.$direction"; - next if $___CONTINUE && $direction eq "e2f" && -e "$ttable_file.half.e2f.sorted"; + next if $___CONTINUE && $direction eq "e2f" && -e "$ttable_file.half.e2f.gz"; my $inverse = ""; my $extract_filename = $extract_file; if ($direction eq "e2f") { @@ -1497,7 +1497,7 @@ sub score_phrase_phrase_extract { print STDERR "(6.".($substep++).") creating table half $ttable_file.half.$direction @ ".`date`; - my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse"; + my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction.gz $inverse"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT; $cmd .= " --KneserNey" if $KNESER_NEY; @@ -1543,15 +1543,15 @@ sub score_phrase_phrase_extract { # merging the two halves print STDERR "(6.6) consolidating the two halves @ ".`date`; return if $___CONTINUE && -e "$ttable_file.gz"; - my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.sorted.gz $ttable_file.gz"; + my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.gz $ttable_file.gz"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --LogProb" if $LOG_PROB; $cmd .= " --NegLogProb" if $NEG_LOG_PROB; $cmd .= " --OnlyDirect" if $ONLY_DIRECT; $cmd .= " --NoPhraseCount" unless $PHRASE_COUNT; $cmd .= " --LowCountFeature" if $LOW_COUNT; - $cmd .= " --GoodTuring $ttable_file.half.f2e.coc" if $GOOD_TURING; - $cmd .= " --KneserNey $ttable_file.half.f2e.coc" if $KNESER_NEY; + $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING; + $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY; safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed"; if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); } } From cae4f803c3186d3f0c0143bb2f93be1929f7119a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sat, 26 May 2012 12:27:50 +0100 Subject: [PATCH 25/38] faster consolidating - redirect to stdout then zip --- scripts/training/train-model.perl.missing_bin_dir | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 61e49970f..aac6cef96 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -1543,7 +1543,7 @@ sub score_phrase_phrase_extract { # merging the two halves print STDERR "(6.6) consolidating the two halves @ ".`date`; return if $___CONTINUE && -e "$ttable_file.gz"; - my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.gz $ttable_file.gz"; + my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.gz /dev/stdout"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --LogProb" if $LOG_PROB; $cmd .= " --NegLogProb" if $NEG_LOG_PROB; @@ -1552,6 +1552,9 @@ sub score_phrase_phrase_extract { $cmd .= " --LowCountFeature" if $LOW_COUNT; $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING; $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY; + + $cmd .= " | gzip -c > $ttable_file.gz"; + safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed"; if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); } } From 82580280bc0b30607b00a55ffe0f22d5665269a3 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Sat, 26 May 2012 13:13:23 +0100 Subject: [PATCH 26/38] Fix compile error by using std::size_t instead of size_t. Thanks to Tomas Hudik for reporting that. --- .../phrase-extract/pcfg-common/numbered_set.h | 4 ++-- scripts/training/phrase-extract/pcfg-common/pcfg.cc | 8 ++++---- scripts/training/phrase-extract/pcfg-common/pcfg.h | 2 +- .../phrase-extract/pcfg-common/syntax_tree.h | 2 +- .../phrase-extract/pcfg-common/xml_tree_writer.h | 4 ++-- .../phrase-extract/pcfg-extract/pcfg_extract.cc | 2 +- .../phrase-extract/pcfg-extract/rule_collection.cc | 12 ++++++------ .../phrase-extract/pcfg-extract/rule_collection.h | 6 +++--- .../phrase-extract/pcfg-extract/rule_extractor.cc | 4 ++-- .../training/phrase-extract/pcfg-score/pcfg_score.cc | 2 +- .../phrase-extract/pcfg-score/tree_scorer.cc | 2 +- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/scripts/training/phrase-extract/pcfg-common/numbered_set.h b/scripts/training/phrase-extract/pcfg-common/numbered_set.h index f88d710ed..15e768b4c 100644 --- a/scripts/training/phrase-extract/pcfg-common/numbered_set.h +++ b/scripts/training/phrase-extract/pcfg-common/numbered_set.h @@ -35,7 +35,7 @@ namespace PCFG { // Stores a set of elements of type T, each of which is allocated an integral // ID of type I. IDs are contiguous starting at 0. Individual elements cannot // be removed once inserted (but the whole set can be cleared). -template +template class NumberedSet { private: typedef boost::unordered_map ElementToIdMap; @@ -54,7 +54,7 @@ class NumberedSet { static I NullId() { return std::numeric_limits::max(); } bool Empty() const { return id_to_element_.empty(); } - size_t Size() const { return id_to_element_.size(); } + std::size_t Size() const { return id_to_element_.size(); } // Insert the given object and return its ID. I Insert(const T &); diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.cc b/scripts/training/phrase-extract/pcfg-common/pcfg.cc index d045b820b..054e20a48 100644 --- a/scripts/training/phrase-extract/pcfg-common/pcfg.cc +++ b/scripts/training/phrase-extract/pcfg-common/pcfg.cc @@ -50,7 +50,7 @@ void Pcfg::Read(std::istream &input, Vocabulary &vocab) { Key key; while (std::getline(input, line)) { // Read LHS. - size_t pos = line.find("|||"); + std::size_t pos = line.find("|||"); if (pos == std::string::npos) { throw Exception("missing first delimiter"); } @@ -58,7 +58,7 @@ void Pcfg::Read(std::istream &input, Vocabulary &vocab) { boost::trim(lhs_string); // Read RHS. - size_t begin = pos+3; + std::size_t begin = pos+3; pos = line.find("|||", begin); if (pos == std::string::npos) { throw Exception("missing second delimiter"); @@ -92,8 +92,8 @@ void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const { for (const_iterator p = begin(); p != end(); ++p) { const Key &key = p->first; double score = p->second; - std::vector::const_iterator q = key.begin(); - std::vector::const_iterator end = key.end(); + std::vector::const_iterator q = key.begin(); + std::vector::const_iterator end = key.end(); output << vocab.Lookup(*q++) << " |||"; while (q != end) { output << " " << vocab.Lookup(*q++); diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.h b/scripts/training/phrase-extract/pcfg-common/pcfg.h index 757eea449..b87336584 100644 --- a/scripts/training/phrase-extract/pcfg-common/pcfg.h +++ b/scripts/training/phrase-extract/pcfg-common/pcfg.h @@ -33,7 +33,7 @@ namespace PCFG { class Pcfg { public: - typedef std::vector Key; + typedef std::vector Key; typedef std::map Map; typedef Map::iterator iterator; typedef Map::const_iterator const_iterator; diff --git a/scripts/training/phrase-extract/pcfg-common/syntax_tree.h b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h index 37f72dd58..89c6ec0c3 100644 --- a/scripts/training/phrase-extract/pcfg-common/syntax_tree.h +++ b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h @@ -80,7 +80,7 @@ class SyntaxTree : public SyntaxTreeBase > { template SyntaxTreeBase::~SyntaxTreeBase() { - for (size_t i = 0; i < children_.size(); ++i) { + for (std::size_t i = 0; i < children_.size(); ++i) { delete children_[i]; } } diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h index c5171a905..6a9a3de05 100644 --- a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h @@ -101,9 +101,9 @@ void XmlTreeWriter::Write(const InputTree &tree, template std::string XmlTreeWriter::Escape(const std::string &s) const { std::string t; - size_t len = s.size(); + std::size_t len = s.size(); t.reserve(len); - for (size_t i = 0; i < len; ++i) { + for (std::size_t i = 0; i < len; ++i) { if (s[i] == '<') { t += "<"; } else if (s[i] == '>') { diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc index 151c9959c..71c2e31c3 100644 --- a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -56,7 +56,7 @@ int PcfgExtract::Main(int argc, char *argv[]) { RuleCollection rule_collection; XmlTreeParser parser; std::string line; - size_t line_num = 0; + std::size_t line_num = 0; std::auto_ptr tree; while (std::getline(std::cin, line)) { ++line_num; diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc index 503b1a9e6..32b63e0ef 100644 --- a/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc +++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc @@ -26,24 +26,24 @@ namespace Moses { namespace PCFG { -void RuleCollection::Add(size_t lhs, const std::vector &rhs) { +void RuleCollection::Add(std::size_t lhs, const std::vector &rhs) { ++collection_[lhs][rhs]; } void RuleCollection::CreatePcfg(Pcfg &pcfg) { - std::vector key; + std::vector key; for (const_iterator p = begin(); p != end(); ++p) { - size_t lhs = p->first; + std::size_t lhs = p->first; const RhsCountMap &rhs_counts = p->second; - size_t total = 0; + std::size_t total = 0; for (RhsCountMap::const_iterator q = rhs_counts.begin(); q != rhs_counts.end(); ++q) { total += q->second; } for (RhsCountMap::const_iterator q = rhs_counts.begin(); q != rhs_counts.end(); ++q) { - const std::vector &rhs = q->first; - size_t count = q->second; + const std::vector &rhs = q->first; + std::size_t count = q->second; double score = std::log(static_cast(count) / static_cast(total)); key.clear(); diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.h b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h index 1b768dd21..452fa0e97 100644 --- a/scripts/training/phrase-extract/pcfg-extract/rule_collection.h +++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h @@ -33,8 +33,8 @@ namespace PCFG { // Contains PCFG rules and their counts. class RuleCollection { public: - typedef boost::unordered_map, size_t> RhsCountMap; - typedef boost::unordered_map Map; + typedef boost::unordered_map, std::size_t> RhsCountMap; + typedef boost::unordered_map Map; typedef Map::iterator iterator; typedef Map::const_iterator const_iterator; @@ -46,7 +46,7 @@ class RuleCollection { iterator end() { return collection_.end(); } const_iterator end() const { return collection_.end(); } - void Add(size_t, const std::vector &); + void Add(std::size_t, const std::vector &); void CreatePcfg(Pcfg &); private: diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc index 48a82a6d0..217574e7d 100644 --- a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc +++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc @@ -33,8 +33,8 @@ void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const { return; } - size_t lhs = non_term_vocab_.Insert(tree.label()); - std::vector rhs; + std::size_t lhs = non_term_vocab_.Insert(tree.label()); + std::vector rhs; const std::vector &children = tree.children(); rhs.reserve(children.size()); diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc index 16691707b..345d7fc60 100644 --- a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc +++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc @@ -63,7 +63,7 @@ int PcfgScore::Main(int argc, char *argv[]) { XmlTreeParser parser; XmlTreeWriter writer; std::string line; - size_t line_num = 0; + std::size_t line_num = 0; std::auto_ptr tree; while (std::getline(std::cin, line)) { ++line_num; diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc index 5f695e4fc..f9ce97ae0 100644 --- a/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc +++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc @@ -38,7 +38,7 @@ bool TreeScorer::Score(PcfgTree &root) const { double log_prob = 0.0; - std::vector key; + std::vector key; key.reserve(children.size()+1); key.push_back(non_term_vocab_.Lookup(root.label())); From e3e62846bfe84d9a7edd78affd23f020d8ae2468 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Sun, 27 May 2012 12:43:16 +0100 Subject: [PATCH 27/38] train-model.perl: add -alt-direct-rule-score-1 and -alt-direct-rule-score-2 options, which use either p(RHS_t|RHS_s,LHS) or p(LHS,RHS_t|RHS_s), respectively, as a grammar rule's direct translation score. --- .../phrase-extract/RuleExtractionOptions.h | 4 + .../extract-ghkm/ExtractGHKM.cpp | 3 + .../phrase-extract/extract-ghkm/Options.h | 2 + .../extract-ghkm/ScfgRuleWriter.cpp | 12 ++- .../training/phrase-extract/extract-rules.cpp | 43 ++++++--- scripts/training/phrase-extract/score.cpp | 94 +++++++++++++++---- scripts/training/phrase-extract/score.h | 8 +- .../training/train-model.perl.missing_bin_dir | 9 +- 8 files changed, 139 insertions(+), 36 deletions(-) diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h index f9123de86..272af2c76 100644 --- a/scripts/training/phrase-extract/RuleExtractionOptions.h +++ b/scripts/training/phrase-extract/RuleExtractionOptions.h @@ -48,6 +48,8 @@ public: bool pcfgScore; bool outputNTLengths; bool gzOutput; + bool unpairedExtractFormat; + bool conditionOnTargetLhs; RuleExtractionOptions() : maxSpan(10) @@ -78,6 +80,8 @@ public: , pcfgScore(false) , outputNTLengths(false) , gzOutput(false) + , unpairedExtractFormat(false) + , conditionOnTargetLhs(false) {} }; diff --git a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 397ce1e3c..6b6fbb7eb 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -357,6 +357,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], if (vm.count("AllowUnary")) { options.allowUnary = true; } + if (vm.count("ConditionOnTargetLHS")) { + options.conditionOnTargetLhs = true; + } if (vm.count("GZOutput")) { options.gzOutput = true; } diff --git a/scripts/training/phrase-extract/extract-ghkm/Options.h b/scripts/training/phrase-extract/extract-ghkm/Options.h index c4b57f311..362fc95d2 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Options.h +++ b/scripts/training/phrase-extract/extract-ghkm/Options.h @@ -30,6 +30,7 @@ struct Options { public: Options() : allowUnary(false) + , conditionOnTargetLhs(false) , gzOutput(false) , maxNodes(15) , maxRuleDepth(3) @@ -47,6 +48,7 @@ struct Options { // All other options bool allowUnary; + bool conditionOnTargetLhs; std::string glueGrammarFile; bool gzOutput; int maxNodes; diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp index d5d16b790..cd993d6e8 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp @@ -101,7 +101,11 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule, } sourceSS << " "; } - WriteSymbol(rule.GetSourceLHS(), sourceSS); + if (m_options.conditionOnTargetLhs) { + WriteSymbol(rule.GetTargetLHS(), sourceSS); + } else { + WriteSymbol(rule.GetSourceLHS(), sourceSS); + } // Write the target side of the rule to targetSS. i = 0; @@ -131,7 +135,11 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule, WriteSymbol(*p, sourceSS); sourceSS << " "; } - WriteSymbol(rule.GetSourceLHS(), sourceSS); + if (m_options.conditionOnTargetLhs) { + WriteSymbol(rule.GetTargetLHS(), sourceSS); + } else { + WriteSymbol(rule.GetSourceLHS(), sourceSS); + } // Write the target side of the rule to targetSS. i = 0; diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp index a00667b82..997038224 100644 --- a/scripts/training/phrase-extract/extract-rules.cpp +++ b/scripts/training/phrase-extract/extract-rules.cpp @@ -140,7 +140,9 @@ int main(int argc, char* argv[]) << " | --MaxNonTerm[" << options.maxNonTerm << "]" << " | --MaxScope[" << options.maxScope << "]" << " | --SourceSyntax | --TargetSyntax" - << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting ]\n"; + << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting" + << " | --UnpairedExtractFormat" + << " | --ConditionOnTargetLHS ]\n"; exit(1); } char* &fileNameT = argv[1]; @@ -261,6 +263,10 @@ int main(int argc, char* argv[]) options.pcfgScore = true; } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { options.outputNTLengths = true; + } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) { + options.unpairedExtractFormat = true; + } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) { + options.conditionOnTargetLhs = true; #ifdef WITH_THREADS } else if (strcmp(argv[i],"-threads") == 0 || strcmp(argv[i],"--threads") == 0 || @@ -545,7 +551,11 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X"; hole.SetLabel(targetLabel, 1); - out += "[" + sourceLabel + "][" + targetLabel + "] "; + if (m_options.unpairedExtractFormat) { + out += "[" + targetLabel + "] "; + } else { + out += "[" + sourceLabel + "][" + targetLabel + "] "; + } if (m_options.pcfgScore) { double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore(); @@ -591,7 +601,11 @@ string ExtractTask::printSourceHieroPhrase( int startT, int endT, int startS, in assert(targetLabel != ""); const string &sourceLabel = hole.GetLabel(0); - out += "[" + sourceLabel + "][" + targetLabel + "] "; + if (m_options.unpairedExtractFormat) { + out += "[" + sourceLabel + "] "; + } else { + out += "[" + sourceLabel + "][" + targetLabel + "] "; + } currPos = hole.GetEnd(0); hole.SetPos(outPos, 0); @@ -659,7 +673,6 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS m_sentence->targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X"; string sourceLabel = m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X"; - //string sourceLabel = "X"; // create non-terms on the source side preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex); @@ -677,9 +690,12 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS } // source - // holeColl.SortSourceHoles(); - rule.source = printSourceHieroPhrase(startT, endT, startS, endS, holeColl, labelIndex) - + " [" + sourceLabel + "]"; + rule.source = printSourceHieroPhrase(startT, endT, startS, endS, holeColl, labelIndex); + if (m_options.conditionOnTargetLhs) { + rule.source += " [" + targetLabel + "]"; + } else { + rule.source += " [" + sourceLabel + "]"; + } // alignment printHieroAlignment(startT, endT, startS, endS, indexS, indexT, holeColl, rule); @@ -875,10 +891,15 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist // phrase labels string targetLabel,sourceLabel; - sourceLabel = m_options.sourceSyntax ? - m_sentence->sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X"; - targetLabel = m_options.targetSyntax ? - m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X"; + if (m_options.targetSyntax && m_options.conditionOnTargetLhs) { + sourceLabel = targetLabel = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel(); + } + else { + sourceLabel = m_options.sourceSyntax ? + m_sentence->sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X"; + targetLabel = m_options.targetSyntax ? + m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X"; + } // source rule.source = ""; diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp index c5fb0b99f..5e0ade627 100644 --- a/scripts/training/phrase-extract/score.cpp +++ b/scripts/training/phrase-extract/score.cpp @@ -69,10 +69,15 @@ double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignmen void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs , map > &sourceProb , map > &targetProb); +void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &); +void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &); + LexicalTable lexTable; bool inverseFlag = false; bool hierarchicalFlag = false; bool pcfgFlag = false; +bool unpairedExtractFormatFlag = false; +bool conditionOnTargetLhsFlag = false; bool wordAlignmentFlag = false; bool goodTuringFlag = false; bool kneserNeyFlag = false; @@ -93,7 +98,7 @@ int main(int argc, char* argv[]) << "scoring methods for extracted rules\n"; if (argc < 4) { - cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n"; + cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS]\n"; exit(1); } char* fileNameExtract = argv[1]; @@ -112,6 +117,12 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--PCFG") == 0) { pcfgFlag = true; cerr << "including PCFG scores\n"; + } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) { + unpairedExtractFormatFlag = true; + cerr << "processing unpaired extract format\n"; + } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) { + conditionOnTargetLhsFlag = true; + cerr << "processing unpaired extract format\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; @@ -470,27 +481,18 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo // source phrase (unless inverse) if (! inverseFlag) { - for(size_t j=0; j alignmentPoints = bestAlignment.alignedToS[i]; + assert(alignmentPoints.size() == 1); + int j = *(alignmentPoints.begin()); + if (inverseFlag) { + out << vcbT.getWord(phraseT[j]) << word << " "; + } else { + out << word << vcbT.getWord(phraseT[j]) << " "; + } + } + // output source root symbol + if (conditionOnTargetLhsFlag && !inverseFlag) { + out << "[X]"; + } else { + out << vcbS.getWord(phraseS.back()); + } +} + +void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT, + const PhraseAlignment &bestAlignment, ostream &out) +{ + // output target symbols, except root, in rule table format + for (std::size_t i = 0; i < phraseT.size()-1; ++i) { + const std::string &word = vcbT.getWord(phraseT[i]); + if (!unpairedExtractFormatFlag || !isNonTerminal(word)) { + out << word << " "; + continue; + } + // get corresponding source non-terminal and output pair + std::set alignmentPoints = bestAlignment.alignedToT[i]; + assert(alignmentPoints.size() == 1); + int j = *(alignmentPoints.begin()); + if (inverseFlag) { + out << word << vcbS.getWord(phraseS[j]) << " "; + } else { + out << vcbS.getWord(phraseS[j]) << word << " "; + } + } + // output target root symbol + if (conditionOnTargetLhsFlag) { + if (inverseFlag) { + out << "[X]"; + } else { + out << vcbS.getWord(phraseS.back()); + } + } else { + out << vcbT.getWord(phraseT.back()); + } +} + std::pair PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj ) { std::pair ret = m_coll.insert(obj); diff --git a/scripts/training/phrase-extract/score.h b/scripts/training/phrase-extract/score.h index dc94ecfde..9faa144c5 100644 --- a/scripts/training/phrase-extract/score.h +++ b/scripts/training/phrase-extract/score.h @@ -59,11 +59,7 @@ private: }; // other functions ********************************************* -inline bool isNonTerminal( std::string &word ) +inline bool isNonTerminal( const std::string &word ) { - return (word.length()>=3 && - word.substr(0,1).compare("[") == 0 && - word.substr(word.length()-1,1).compare("]") == 0); + return (word.length()>=3 && word[0] == '[' && word[word.length()-1] == ']'); } - - diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index aac6cef96..0db2ee437 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -31,6 +31,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_ @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS, $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS, + $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS, @@ -106,6 +107,8 @@ $_HELP = 1 'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE, 'ghkm' => \$_GHKM, 'pcfg' => \$_PCFG, + 'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1, + 'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2, 'extract-options=s' => \$_EXTRACT_OPTIONS, 'score-options=s' => \$_SCORE_OPTIONS, 'source-syntax' => \$_SOURCE_SYNTAX, @@ -1375,6 +1378,8 @@ sub extract_phrase { $cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR; $cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE); $cmd .= " --PCFG" if $_PCFG; + $cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2; + $cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1; if (!defined($_GHKM)) { $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX; $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX; @@ -1506,10 +1511,12 @@ sub score_phrase_phrase_extract { $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT; $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL; $cmd .= " --PCFG" if $_PCFG; + $cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2; + $cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1; $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); # sorting - if ($direction eq "e2f") { + if ($direction eq "e2f" || $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2) { $cmd .= " 1 "; } else { From db1e6040b241c74ed01b9da0e4a8bd2f4c15f176 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Sun, 27 May 2012 17:58:13 +0100 Subject: [PATCH 28/38] Fix bug in previous commit. --- scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 6b6fbb7eb..dae876116 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -266,6 +266,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], //("help", "print this help message and exit") ("AllowUnary", "allow fully non-lexical unary rules") + ("ConditionOnTargetLHS", + "write target LHS instead of \"X\" as source LHS") ("GlueGrammar", po::value(&options.glueGrammarFile), "write glue grammar to named file") From ef26388aff03e95882091c96eb3764c872f6c81f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 28 May 2012 17:29:46 +0100 Subject: [PATCH 29/38] eclipse project --- contrib/other-builds/OnDiskPt/.cproject | 131 + contrib/other-builds/OnDiskPt/.project | 185 + contrib/other-builds/lm/.cproject | 125 + contrib/other-builds/lm/.project | 360 ++ contrib/other-builds/moses-cmd/.cproject | 140 + contrib/other-builds/moses-cmd/.project | 199 ++ contrib/other-builds/moses/.cproject | 164 + contrib/other-builds/moses/.project | 3055 +++++++++++++++++ contrib/other-builds/util/.cproject | 133 + contrib/other-builds/util/.project | 90 + lm/bhiksha.cc | 1 + moses-cmd/src/IOWrapper.h | 9 +- moses/src/LM/ORLM.h | 2 +- .../training/train-model.perl.missing_bin_dir | 2 +- util/bit_packing.cc | 2 +- 15 files changed, 4591 insertions(+), 7 deletions(-) create mode 100644 contrib/other-builds/OnDiskPt/.cproject create mode 100644 contrib/other-builds/OnDiskPt/.project create mode 100644 contrib/other-builds/lm/.cproject create mode 100644 contrib/other-builds/lm/.project create mode 100644 contrib/other-builds/moses-cmd/.cproject create mode 100644 contrib/other-builds/moses-cmd/.project create mode 100644 contrib/other-builds/moses/.cproject create mode 100644 contrib/other-builds/moses/.project create mode 100644 contrib/other-builds/util/.cproject create mode 100644 contrib/other-builds/util/.project diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject new file mode 100644 index 000000000..41f2a5141 --- /dev/null +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -0,0 +1,131 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/other-builds/OnDiskPt/.project b/contrib/other-builds/OnDiskPt/.project new file mode 100644 index 000000000..73a7ac0a7 --- /dev/null +++ b/contrib/other-builds/OnDiskPt/.project @@ -0,0 +1,185 @@ + + + OnDiskPt + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + ?name? + + + + org.eclipse.cdt.make.core.append_environment + true + + + org.eclipse.cdt.make.core.autoBuildTarget + all + + + org.eclipse.cdt.make.core.buildArguments + + + + org.eclipse.cdt.make.core.buildCommand + make + + + org.eclipse.cdt.make.core.buildLocation + ${workspace_loc:/OnDiskPt/Debug} + + + org.eclipse.cdt.make.core.cleanBuildTarget + clean + + + org.eclipse.cdt.make.core.contents + org.eclipse.cdt.make.core.activeConfigSettings + + + org.eclipse.cdt.make.core.enableAutoBuild + false + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.enableFullBuild + true + + + org.eclipse.cdt.make.core.fullBuildTarget + all + + + org.eclipse.cdt.make.core.stopOnError + true + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + true + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + Jamfile + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/Jamfile + + + Main.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/Main.cpp + + + Main.h + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/Main.h + + + OnDiskWrapper.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/OnDiskWrapper.cpp + + + OnDiskWrapper.h + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/OnDiskWrapper.h + + + Phrase.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/Phrase.cpp + + + Phrase.h + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/Phrase.h + + + PhraseNode.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/PhraseNode.cpp + + + PhraseNode.h + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/PhraseNode.h + + + SourcePhrase.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/SourcePhrase.cpp + + + SourcePhrase.h + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/SourcePhrase.h + + + TargetPhrase.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/TargetPhrase.cpp + + + TargetPhrase.h + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/TargetPhrase.h + + + TargetPhraseCollection.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/TargetPhraseCollection.cpp + + + TargetPhraseCollection.h + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/TargetPhraseCollection.h + + + Vocab.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/Vocab.cpp + + + Vocab.h + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/Vocab.h + + + Word.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/Word.cpp + + + Word.h + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/Word.h + + + queryOnDiskPt.cpp + 1 + PARENT-3-PROJECT_LOC/OnDiskPt/queryOnDiskPt.cpp + + + diff --git a/contrib/other-builds/lm/.cproject b/contrib/other-builds/lm/.cproject new file mode 100644 index 000000000..f89e80f49 --- /dev/null +++ b/contrib/other-builds/lm/.cproject @@ -0,0 +1,125 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/other-builds/lm/.project b/contrib/other-builds/lm/.project new file mode 100644 index 000000000..0d30e24cb --- /dev/null +++ b/contrib/other-builds/lm/.project @@ -0,0 +1,360 @@ + + + lm + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + ?name? + + + + org.eclipse.cdt.make.core.append_environment + true + + + org.eclipse.cdt.make.core.autoBuildTarget + all + + + org.eclipse.cdt.make.core.buildArguments + + + + org.eclipse.cdt.make.core.buildCommand + make + + + org.eclipse.cdt.make.core.buildLocation + ${workspace_loc:/lm/Debug} + + + org.eclipse.cdt.make.core.cleanBuildTarget + clean + + + org.eclipse.cdt.make.core.contents + org.eclipse.cdt.make.core.activeConfigSettings + + + org.eclipse.cdt.make.core.enableAutoBuild + false + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.enableFullBuild + true + + + org.eclipse.cdt.make.core.fullBuildTarget + all + + + org.eclipse.cdt.make.core.stopOnError + true + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + true + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + .DS_Store + 1 + PARENT-3-PROJECT_LOC/lm/.DS_Store + + + COPYING + 1 + PARENT-3-PROJECT_LOC/lm/COPYING + + + COPYING.LESSER + 1 + PARENT-3-PROJECT_LOC/lm/COPYING.LESSER + + + Jamfile + 1 + PARENT-3-PROJECT_LOC/lm/Jamfile + + + LICENSE + 1 + PARENT-3-PROJECT_LOC/lm/LICENSE + + + README + 1 + PARENT-3-PROJECT_LOC/lm/README + + + bhiksha.cc + 1 + PARENT-3-PROJECT_LOC/lm/bhiksha.cc + + + bhiksha.hh + 1 + PARENT-3-PROJECT_LOC/lm/bhiksha.hh + + + binary_format.cc + 1 + PARENT-3-PROJECT_LOC/lm/binary_format.cc + + + binary_format.hh + 1 + PARENT-3-PROJECT_LOC/lm/binary_format.hh + + + blank.hh + 1 + PARENT-3-PROJECT_LOC/lm/blank.hh + + + build_binary + 1 + PARENT-3-PROJECT_LOC/lm/build_binary + + + build_binary.cc + 1 + PARENT-3-PROJECT_LOC/lm/build_binary.cc + + + clean.sh + 1 + PARENT-3-PROJECT_LOC/lm/clean.sh + + + compile.sh + 1 + PARENT-3-PROJECT_LOC/lm/compile.sh + + + config.cc + 1 + PARENT-3-PROJECT_LOC/lm/config.cc + + + config.hh + 1 + PARENT-3-PROJECT_LOC/lm/config.hh + + + enumerate_vocab.hh + 1 + PARENT-3-PROJECT_LOC/lm/enumerate_vocab.hh + + + facade.hh + 1 + PARENT-3-PROJECT_LOC/lm/facade.hh + + + left.hh + 1 + PARENT-3-PROJECT_LOC/lm/left.hh + + + left_test.cc + 1 + PARENT-3-PROJECT_LOC/lm/left_test.cc + + + libkenlm.dylib + 1 + PARENT-3-PROJECT_LOC/lm/libkenlm.dylib + + + libkenutil.dylib + 1 + PARENT-3-PROJECT_LOC/lm/libkenutil.dylib + + + lm_exception.cc + 1 + PARENT-3-PROJECT_LOC/lm/lm_exception.cc + + + lm_exception.hh + 1 + PARENT-3-PROJECT_LOC/lm/lm_exception.hh + + + max_order.hh + 1 + PARENT-3-PROJECT_LOC/lm/max_order.hh + + + model.cc + 1 + PARENT-3-PROJECT_LOC/lm/model.cc + + + model.hh + 1 + PARENT-3-PROJECT_LOC/lm/model.hh + + + model_test.cc + 1 + PARENT-3-PROJECT_LOC/lm/model_test.cc + + + model_type.hh + 1 + PARENT-3-PROJECT_LOC/lm/model_type.hh + + + ngram_query.cc + 1 + PARENT-3-PROJECT_LOC/lm/ngram_query.cc + + + ngram_query.hh + 1 + PARENT-3-PROJECT_LOC/lm/ngram_query.hh + + + quantize.cc + 1 + PARENT-3-PROJECT_LOC/lm/quantize.cc + + + quantize.hh + 1 + PARENT-3-PROJECT_LOC/lm/quantize.hh + + + query + 1 + PARENT-3-PROJECT_LOC/lm/query + + + read_arpa.cc + 1 + PARENT-3-PROJECT_LOC/lm/read_arpa.cc + + + read_arpa.hh + 1 + PARENT-3-PROJECT_LOC/lm/read_arpa.hh + + + return.hh + 1 + PARENT-3-PROJECT_LOC/lm/return.hh + + + search_hashed.cc + 1 + PARENT-3-PROJECT_LOC/lm/search_hashed.cc + + + search_hashed.hh + 1 + PARENT-3-PROJECT_LOC/lm/search_hashed.hh + + + search_trie.cc + 1 + PARENT-3-PROJECT_LOC/lm/search_trie.cc + + + search_trie.hh + 1 + PARENT-3-PROJECT_LOC/lm/search_trie.hh + + + test.arpa + 1 + PARENT-3-PROJECT_LOC/lm/test.arpa + + + test.sh + 1 + PARENT-3-PROJECT_LOC/lm/test.sh + + + test_nounk.arpa + 1 + PARENT-3-PROJECT_LOC/lm/test_nounk.arpa + + + trie.cc + 1 + PARENT-3-PROJECT_LOC/lm/trie.cc + + + trie.hh + 1 + PARENT-3-PROJECT_LOC/lm/trie.hh + + + trie_sort.cc + 1 + PARENT-3-PROJECT_LOC/lm/trie_sort.cc + + + trie_sort.hh + 1 + PARENT-3-PROJECT_LOC/lm/trie_sort.hh + + + virtual_interface.cc + 1 + PARENT-3-PROJECT_LOC/lm/virtual_interface.cc + + + virtual_interface.hh + 1 + PARENT-3-PROJECT_LOC/lm/virtual_interface.hh + + + vocab.cc + 1 + PARENT-3-PROJECT_LOC/lm/vocab.cc + + + vocab.hh + 1 + PARENT-3-PROJECT_LOC/lm/vocab.hh + + + weights.hh + 1 + PARENT-3-PROJECT_LOC/lm/weights.hh + + + word_index.hh + 1 + PARENT-3-PROJECT_LOC/lm/word_index.hh + + + diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject new file mode 100644 index 000000000..53c112cb8 --- /dev/null +++ b/contrib/other-builds/moses-cmd/.cproject @@ -0,0 +1,140 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/other-builds/moses-cmd/.project b/contrib/other-builds/moses-cmd/.project new file mode 100644 index 000000000..c71651563 --- /dev/null +++ b/contrib/other-builds/moses-cmd/.project @@ -0,0 +1,199 @@ + + + moses-cmd + + + lm + moses + OnDiskPt + util + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + ?name? + + + + org.eclipse.cdt.make.core.append_environment + true + + + org.eclipse.cdt.make.core.autoBuildTarget + all + + + org.eclipse.cdt.make.core.buildArguments + + + + org.eclipse.cdt.make.core.buildCommand + make + + + org.eclipse.cdt.make.core.buildLocation + ${workspace_loc:/moses-cmd/Debug} + + + org.eclipse.cdt.make.core.cleanBuildTarget + clean + + + org.eclipse.cdt.make.core.contents + org.eclipse.cdt.make.core.activeConfigSettings + + + org.eclipse.cdt.make.core.enableAutoBuild + false + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.enableFullBuild + true + + + org.eclipse.cdt.make.core.fullBuildTarget + all + + + org.eclipse.cdt.make.core.stopOnError + true + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + true + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + IOWrapper.cpp + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/IOWrapper.cpp + + + IOWrapper.h + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/IOWrapper.h + + + IOWrapper.o + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/IOWrapper.o + + + Jamfile + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/Jamfile + + + LatticeMBR.cpp + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/LatticeMBR.cpp + + + LatticeMBR.h + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/LatticeMBR.h + + + LatticeMBR.o + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/LatticeMBR.o + + + LatticeMBRGrid.cpp + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/LatticeMBRGrid.cpp + + + LatticeMBRGrid.o + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/LatticeMBRGrid.o + + + Main.cpp + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/Main.cpp + + + Main.h + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/Main.h + + + Main.o + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/Main.o + + + TranslationAnalysis.cpp + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/TranslationAnalysis.cpp + + + TranslationAnalysis.h + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/TranslationAnalysis.h + + + TranslationAnalysis.o + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/TranslationAnalysis.o + + + libkenlm.dylib + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/libkenlm.dylib + + + libkenutil.dylib + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/libkenutil.dylib + + + lmbrgrid + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/lmbrgrid + + + mbr.cpp + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/mbr.cpp + + + mbr.h + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/mbr.h + + + mbr.o + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/mbr.o + + + moses + 1 + PARENT-3-PROJECT_LOC/moses-cmd/src/moses + + + diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject new file mode 100644 index 000000000..2995d5eae --- /dev/null +++ b/contrib/other-builds/moses/.cproject @@ -0,0 +1,164 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project new file mode 100644 index 000000000..8d534dbd4 --- /dev/null +++ b/contrib/other-builds/moses/.project @@ -0,0 +1,3055 @@ + + + moses + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + ?name? + + + + org.eclipse.cdt.make.core.append_environment + true + + + org.eclipse.cdt.make.core.autoBuildTarget + all + + + org.eclipse.cdt.make.core.buildArguments + + + + org.eclipse.cdt.make.core.buildCommand + make + + + org.eclipse.cdt.make.core.buildLocation + ${workspace_loc:/moses/Debug} + + + org.eclipse.cdt.make.core.cleanBuildTarget + clean + + + org.eclipse.cdt.make.core.contents + org.eclipse.cdt.make.core.activeConfigSettings + + + org.eclipse.cdt.make.core.enableAutoBuild + false + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.enableFullBuild + true + + + org.eclipse.cdt.make.core.fullBuildTarget + all + + + org.eclipse.cdt.make.core.stopOnError + true + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + true + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + AlignmentInfo.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/AlignmentInfo.cpp + + + AlignmentInfo.h + 1 + PARENT-3-PROJECT_LOC/moses/src/AlignmentInfo.h + + + AlignmentInfoCollection.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/AlignmentInfoCollection.cpp + + + AlignmentInfoCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/AlignmentInfoCollection.h + + + BilingualDynSuffixArray.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/BilingualDynSuffixArray.cpp + + + BilingualDynSuffixArray.h + 1 + PARENT-3-PROJECT_LOC/moses/src/BilingualDynSuffixArray.h + + + BitmapContainer.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/BitmapContainer.cpp + + + BitmapContainer.h + 1 + PARENT-3-PROJECT_LOC/moses/src/BitmapContainer.h + + + CYKPlusParser + 2 + virtual:/virtual + + + CellCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/CellCollection.h + + + ChartCell.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartCell.cpp + + + ChartCell.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartCell.h + + + ChartCellCollection.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartCellCollection.cpp + + + ChartCellCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartCellCollection.h + + + ChartCellLabel.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartCellLabel.h + + + ChartCellLabelSet.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartCellLabelSet.h + + + ChartHypothesis.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartHypothesis.cpp + + + ChartHypothesis.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartHypothesis.h + + + ChartHypothesisCollection.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartHypothesisCollection.cpp + + + ChartHypothesisCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartHypothesisCollection.h + + + ChartManager.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartManager.cpp + + + ChartManager.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartManager.h + + + ChartRuleLookupManager.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartRuleLookupManager.h + + + ChartTranslationOption.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTranslationOption.cpp + + + ChartTranslationOption.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTranslationOption.h + + + ChartTranslationOptionCollection.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTranslationOptionCollection.cpp + + + ChartTranslationOptionCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTranslationOptionCollection.h + + + ChartTranslationOptionList.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTranslationOptionList.cpp + + + ChartTranslationOptionList.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTranslationOptionList.h + + + ChartTrellisDetour.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTrellisDetour.cpp + + + ChartTrellisDetour.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTrellisDetour.h + + + ChartTrellisDetourQueue.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTrellisDetourQueue.cpp + + + ChartTrellisDetourQueue.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTrellisDetourQueue.h + + + ChartTrellisNode.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTrellisNode.cpp + + + ChartTrellisNode.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTrellisNode.h + + + ChartTrellisPath.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTrellisPath.cpp + + + ChartTrellisPath.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTrellisPath.h + + + ChartTrellisPathList.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ChartTrellisPathList.h + + + ConfusionNet.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ConfusionNet.cpp + + + ConfusionNet.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ConfusionNet.h + + + DecodeFeature.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeFeature.cpp + + + DecodeFeature.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeFeature.h + + + DecodeGraph.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeGraph.cpp + + + DecodeGraph.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeGraph.h + + + DecodeStep.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeStep.cpp + + + DecodeStep.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeStep.h + + + DecodeStepGeneration.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeStepGeneration.cpp + + + DecodeStepGeneration.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeStepGeneration.h + + + DecodeStepTranslation.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeStepTranslation.cpp + + + DecodeStepTranslation.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DecodeStepTranslation.h + + + Dictionary.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Dictionary.cpp + + + Dictionary.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Dictionary.h + + + DummyScoreProducers.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DummyScoreProducers.cpp + + + DummyScoreProducers.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DummyScoreProducers.h + + + DynSAInclude + 2 + virtual:/virtual + + + DynSuffixArray.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSuffixArray.cpp + + + DynSuffixArray.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSuffixArray.h + + + FFState.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/FFState.cpp + + + FFState.h + 1 + PARENT-3-PROJECT_LOC/moses/src/FFState.h + + + Factor.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Factor.cpp + + + Factor.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Factor.h + + + FactorCollection.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/FactorCollection.cpp + + + FactorCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/FactorCollection.h + + + FactorTypeSet.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/FactorTypeSet.cpp + + + FactorTypeSet.h + 1 + PARENT-3-PROJECT_LOC/moses/src/FactorTypeSet.h + + + FeatureFunction.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/FeatureFunction.cpp + + + FeatureFunction.h + 1 + PARENT-3-PROJECT_LOC/moses/src/FeatureFunction.h + + + File.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/File.cpp + + + File.h + 1 + PARENT-3-PROJECT_LOC/moses/src/File.h + + + FilePtr.h + 1 + PARENT-3-PROJECT_LOC/moses/src/FilePtr.h + + + FloydWarshall.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/FloydWarshall.cpp + + + FloydWarshall.h + 1 + PARENT-3-PROJECT_LOC/moses/src/FloydWarshall.h + + + GenerationDictionary.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/GenerationDictionary.cpp + + + GenerationDictionary.h + 1 + PARENT-3-PROJECT_LOC/moses/src/GenerationDictionary.h + + + GlobalLexicalModel.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/GlobalLexicalModel.cpp + + + GlobalLexicalModel.h + 1 + PARENT-3-PROJECT_LOC/moses/src/GlobalLexicalModel.h + + + HypoList.h + 1 + PARENT-3-PROJECT_LOC/moses/src/HypoList.h + + + Hypothesis.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Hypothesis.cpp + + + Hypothesis.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Hypothesis.h + + + HypothesisStack.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/HypothesisStack.cpp + + + HypothesisStack.h + 1 + PARENT-3-PROJECT_LOC/moses/src/HypothesisStack.h + + + HypothesisStackCubePruning.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/HypothesisStackCubePruning.cpp + + + HypothesisStackCubePruning.h + 1 + PARENT-3-PROJECT_LOC/moses/src/HypothesisStackCubePruning.h + + + HypothesisStackNormal.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/HypothesisStackNormal.cpp + + + HypothesisStackNormal.h + 1 + PARENT-3-PROJECT_LOC/moses/src/HypothesisStackNormal.h + + + IRST.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/IRST.lo + + + IRST.o + 1 + PARENT-3-PROJECT_LOC/moses/src/IRST.o + + + InputFileStream.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/InputFileStream.cpp + + + InputFileStream.h + 1 + PARENT-3-PROJECT_LOC/moses/src/InputFileStream.h + + + InputType.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/InputType.cpp + + + InputType.h + 1 + PARENT-3-PROJECT_LOC/moses/src/InputType.h + + + Jamfile + 1 + PARENT-3-PROJECT_LOC/moses/src/Jamfile + + + LM + 2 + virtual:/virtual + + + LMList.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LMList.cpp + + + LMList.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LMList.h + + + LVoc.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LVoc.cpp + + + LVoc.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LVoc.h + + + LexicalReordering.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LexicalReordering.cpp + + + LexicalReordering.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LexicalReordering.h + + + LexicalReorderingState.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LexicalReorderingState.cpp + + + LexicalReorderingState.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LexicalReorderingState.h + + + LexicalReorderingTable.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LexicalReorderingTable.cpp + + + LexicalReorderingTable.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LexicalReorderingTable.h + + + Manager.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Manager.cpp + + + Manager.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Manager.h + + + NonTerminal.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/NonTerminal.cpp + + + NonTerminal.h + 1 + PARENT-3-PROJECT_LOC/moses/src/NonTerminal.h + + + ObjectPool.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ObjectPool.h + + + OutputCollector.h + 1 + PARENT-3-PROJECT_LOC/moses/src/OutputCollector.h + + + PCNTools.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/PCNTools.cpp + + + PCNTools.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PCNTools.h + + + PDTAimp.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PDTAimp.h + + + ParallelBackoff.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/ParallelBackoff.lo + + + ParallelBackoff.o + 1 + PARENT-3-PROJECT_LOC/moses/src/ParallelBackoff.o + + + Parameter.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Parameter.cpp + + + Parameter.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Parameter.h + + + PartialTranslOptColl.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/PartialTranslOptColl.cpp + + + PartialTranslOptColl.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PartialTranslOptColl.h + + + Phrase.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Phrase.cpp + + + Phrase.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Phrase.h + + + PhraseDictionary.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionary.cpp + + + PhraseDictionary.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionary.h + + + PhraseDictionaryDynSuffixArray.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryDynSuffixArray.cpp + + + PhraseDictionaryDynSuffixArray.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryDynSuffixArray.h + + + PhraseDictionaryMemory.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryMemory.cpp + + + PhraseDictionaryMemory.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryMemory.h + + + PhraseDictionaryNode.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryNode.cpp + + + PhraseDictionaryNode.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryNode.h + + + PhraseDictionaryTree.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryTree.cpp + + + PhraseDictionaryTree.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryTree.h + + + PhraseDictionaryTreeAdaptor.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryTreeAdaptor.cpp + + + PhraseDictionaryTreeAdaptor.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PhraseDictionaryTreeAdaptor.h + + + PrefixTree.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PrefixTree.h + + + PrefixTreeMap.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/PrefixTreeMap.cpp + + + PrefixTreeMap.h + 1 + PARENT-3-PROJECT_LOC/moses/src/PrefixTreeMap.h + + + ReorderingConstraint.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ReorderingConstraint.cpp + + + ReorderingConstraint.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ReorderingConstraint.h + + + ReorderingStack.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ReorderingStack.cpp + + + ReorderingStack.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ReorderingStack.h + + + RuleCube.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleCube.cpp + + + RuleCube.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleCube.h + + + RuleCubeItem.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleCubeItem.cpp + + + RuleCubeItem.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleCubeItem.h + + + RuleCubeQueue.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleCubeQueue.cpp + + + RuleCubeQueue.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleCubeQueue.h + + + RuleTable + 2 + virtual:/virtual + + + SRI.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/SRI.lo + + + SRI.o + 1 + PARENT-3-PROJECT_LOC/moses/src/SRI.o + + + Scope3Parser + 2 + virtual:/virtual + + + ScoreComponentCollection.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ScoreComponentCollection.cpp + + + ScoreComponentCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ScoreComponentCollection.h + + + ScoreIndexManager.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ScoreIndexManager.cpp + + + ScoreIndexManager.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ScoreIndexManager.h + + + ScoreProducer.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ScoreProducer.cpp + + + ScoreProducer.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ScoreProducer.h + + + Search.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Search.cpp + + + Search.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Search.h + + + SearchCubePruning.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/SearchCubePruning.cpp + + + SearchCubePruning.h + 1 + PARENT-3-PROJECT_LOC/moses/src/SearchCubePruning.h + + + SearchNormal.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/SearchNormal.cpp + + + SearchNormal.h + 1 + PARENT-3-PROJECT_LOC/moses/src/SearchNormal.h + + + Sentence.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Sentence.cpp + + + Sentence.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Sentence.h + + + SentenceStats.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/SentenceStats.cpp + + + SentenceStats.h + 1 + PARENT-3-PROJECT_LOC/moses/src/SentenceStats.h + + + SquareMatrix.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/SquareMatrix.cpp + + + SquareMatrix.h + 1 + PARENT-3-PROJECT_LOC/moses/src/SquareMatrix.h + + + StackVec.h + 1 + PARENT-3-PROJECT_LOC/moses/src/StackVec.h + + + StaticData.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/StaticData.cpp + + + StaticData.h + 1 + PARENT-3-PROJECT_LOC/moses/src/StaticData.h + + + StaticData.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/StaticData.lo + + + StaticData.o + 1 + PARENT-3-PROJECT_LOC/moses/src/StaticData.o + + + SyntacticLanguageModel.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/SyntacticLanguageModel.cpp + + + SyntacticLanguageModel.h + 1 + PARENT-3-PROJECT_LOC/moses/src/SyntacticLanguageModel.h + + + SyntacticLanguageModelFiles.h + 1 + PARENT-3-PROJECT_LOC/moses/src/SyntacticLanguageModelFiles.h + + + SyntacticLanguageModelState.h + 1 + PARENT-3-PROJECT_LOC/moses/src/SyntacticLanguageModelState.h + + + TargetPhrase.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TargetPhrase.cpp + + + TargetPhrase.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TargetPhrase.h + + + TargetPhraseCollection.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TargetPhraseCollection.cpp + + + TargetPhraseCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TargetPhraseCollection.h + + + Terminal.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Terminal.h + + + ThreadPool.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/ThreadPool.cpp + + + ThreadPool.h + 1 + PARENT-3-PROJECT_LOC/moses/src/ThreadPool.h + + + Timer.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Timer.cpp + + + Timer.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Timer.h + + + TranslationOption.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOption.cpp + + + TranslationOption.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOption.h + + + TranslationOptionCollection.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOptionCollection.cpp + + + TranslationOptionCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOptionCollection.h + + + TranslationOptionCollectionConfusionNet.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOptionCollectionConfusionNet.cpp + + + TranslationOptionCollectionConfusionNet.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOptionCollectionConfusionNet.h + + + TranslationOptionCollectionText.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOptionCollectionText.cpp + + + TranslationOptionCollectionText.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOptionCollectionText.h + + + TranslationOptionList.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOptionList.cpp + + + TranslationOptionList.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationOptionList.h + + + TranslationSystem.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationSystem.cpp + + + TranslationSystem.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TranslationSystem.h + + + TreeInput.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TreeInput.cpp + + + TreeInput.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TreeInput.h + + + TreeInput.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/TreeInput.lo + + + TreeInput.o + 1 + PARENT-3-PROJECT_LOC/moses/src/TreeInput.o + + + TrellisPath.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TrellisPath.cpp + + + TrellisPath.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TrellisPath.h + + + TrellisPath.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/TrellisPath.lo + + + TrellisPath.o + 1 + PARENT-3-PROJECT_LOC/moses/src/TrellisPath.o + + + TrellisPathCollection.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/TrellisPathCollection.cpp + + + TrellisPathCollection.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TrellisPathCollection.h + + + TrellisPathCollection.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/TrellisPathCollection.lo + + + TrellisPathCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/TrellisPathCollection.o + + + TrellisPathList.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TrellisPathList.h + + + TypeDef.h + 1 + PARENT-3-PROJECT_LOC/moses/src/TypeDef.h + + + UniqueObject.h + 1 + PARENT-3-PROJECT_LOC/moses/src/UniqueObject.h + + + UserMessage.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/UserMessage.cpp + + + UserMessage.h + 1 + PARENT-3-PROJECT_LOC/moses/src/UserMessage.h + + + UserMessage.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/UserMessage.lo + + + UserMessage.o + 1 + PARENT-3-PROJECT_LOC/moses/src/UserMessage.o + + + Util.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Util.cpp + + + Util.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Util.h + + + Util.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/Util.lo + + + Util.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Util.o + + + Word.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Word.cpp + + + Word.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Word.h + + + Word.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/Word.lo + + + Word.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Word.o + + + WordLattice.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/WordLattice.cpp + + + WordLattice.h + 1 + PARENT-3-PROJECT_LOC/moses/src/WordLattice.h + + + WordLattice.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/WordLattice.lo + + + WordLattice.o + 1 + PARENT-3-PROJECT_LOC/moses/src/WordLattice.o + + + WordsBitmap.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/WordsBitmap.cpp + + + WordsBitmap.h + 1 + PARENT-3-PROJECT_LOC/moses/src/WordsBitmap.h + + + WordsBitmap.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/WordsBitmap.lo + + + WordsBitmap.o + 1 + PARENT-3-PROJECT_LOC/moses/src/WordsBitmap.o + + + WordsRange.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/WordsRange.cpp + + + WordsRange.h + 1 + PARENT-3-PROJECT_LOC/moses/src/WordsRange.h + + + WordsRange.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/WordsRange.lo + + + WordsRange.o + 1 + PARENT-3-PROJECT_LOC/moses/src/WordsRange.o + + + XmlOption.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/XmlOption.cpp + + + XmlOption.h + 1 + PARENT-3-PROJECT_LOC/moses/src/XmlOption.h + + + XmlOption.lo + 1 + PARENT-3-PROJECT_LOC/moses/src/XmlOption.lo + + + XmlOption.o + 1 + PARENT-3-PROJECT_LOC/moses/src/XmlOption.o + + + bin + 2 + virtual:/virtual + + + gzfilebuf.h + 1 + PARENT-3-PROJECT_LOC/moses/src/gzfilebuf.h + + + hash.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/hash.cpp + + + hash.h + 1 + PARENT-3-PROJECT_LOC/moses/src/hash.h + + + hypergraph.proto + 1 + PARENT-3-PROJECT_LOC/moses/src/hypergraph.proto + + + libmoses.la + 1 + PARENT-3-PROJECT_LOC/moses/src/libmoses.la + + + rule.proto + 1 + PARENT-3-PROJECT_LOC/moses/src/rule.proto + + + CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp + + + CYKPlusParser/ChartRuleLookupManagerCYKPlus.h + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h + + + CYKPlusParser/ChartRuleLookupManagerMemory.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/ChartRuleLookupManagerMemory.cpp + + + CYKPlusParser/ChartRuleLookupManagerMemory.h + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/ChartRuleLookupManagerMemory.h + + + CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp + + + CYKPlusParser/ChartRuleLookupManagerOnDisk.h + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/ChartRuleLookupManagerOnDisk.h + + + CYKPlusParser/DotChart.h + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/DotChart.h + + + CYKPlusParser/DotChartInMemory.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/DotChartInMemory.cpp + + + CYKPlusParser/DotChartInMemory.h + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/DotChartInMemory.h + + + CYKPlusParser/DotChartOnDisk.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/DotChartOnDisk.cpp + + + CYKPlusParser/DotChartOnDisk.h + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/DotChartOnDisk.h + + + CYKPlusParser/Jamfile + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/Jamfile + + + CYKPlusParser/bin + 2 + virtual:/virtual + + + DynSAInclude/Jamfile + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/Jamfile + + + DynSAInclude/RandLMCache.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/RandLMCache.h + + + DynSAInclude/RandLMFilter.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/RandLMFilter.h + + + DynSAInclude/bin + 2 + virtual:/virtual + + + DynSAInclude/fdstream.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/fdstream.h + + + DynSAInclude/file.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/file.cpp + + + DynSAInclude/file.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/file.h + + + DynSAInclude/hash.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/hash.h + + + DynSAInclude/onlineRLM.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/onlineRLM.h + + + DynSAInclude/params.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/params.cpp + + + DynSAInclude/params.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/params.h + + + DynSAInclude/perfectHash.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/perfectHash.h + + + DynSAInclude/quantizer.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/quantizer.h + + + DynSAInclude/types.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/types.h + + + DynSAInclude/utils.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/utils.h + + + DynSAInclude/vocab.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/vocab.cpp + + + DynSAInclude/vocab.h + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/vocab.h + + + LM/Base.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Base.cpp + + + LM/Base.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Base.h + + + LM/Factory.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Factory.cpp + + + LM/Factory.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Factory.h + + + LM/IRST.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/IRST.cpp + + + LM/IRST.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/IRST.h + + + LM/Implementation.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Implementation.cpp + + + LM/Implementation.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Implementation.h + + + LM/Jamfile + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Jamfile + + + LM/Joint.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Joint.cpp + + + LM/Joint.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Joint.h + + + LM/Ken.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Ken.cpp + + + LM/Ken.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Ken.h + + + LM/LDHT.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/LDHT.cpp + + + LM/LDHT.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/LDHT.h + + + LM/MultiFactor.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/MultiFactor.cpp + + + LM/MultiFactor.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/MultiFactor.h + + + LM/ORLM.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/ORLM.cpp + + + LM/ORLM.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/ORLM.h + + + LM/ParallelBackoff.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/ParallelBackoff.cpp + + + LM/ParallelBackoff.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/ParallelBackoff.h + + + LM/Rand.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Rand.cpp + + + LM/Rand.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Rand.h + + + LM/Remote.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Remote.cpp + + + LM/Remote.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/Remote.h + + + LM/SRI.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/SRI.cpp + + + LM/SRI.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/SRI.h + + + LM/SingleFactor.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/SingleFactor.cpp + + + LM/SingleFactor.h + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/SingleFactor.h + + + LM/bin + 2 + virtual:/virtual + + + RuleTable/Jamfile + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/Jamfile + + + RuleTable/Loader.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/Loader.h + + + RuleTable/LoaderCompact.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderCompact.cpp + + + RuleTable/LoaderCompact.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderCompact.h + + + RuleTable/LoaderFactory.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderFactory.cpp + + + RuleTable/LoaderFactory.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderFactory.h + + + RuleTable/LoaderHiero.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderHiero.cpp + + + RuleTable/LoaderHiero.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderHiero.h + + + RuleTable/LoaderStandard.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderStandard.cpp + + + RuleTable/LoaderStandard.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderStandard.h + + + RuleTable/PhraseDictionaryALSuffixArray.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp + + + RuleTable/PhraseDictionaryALSuffixArray.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryALSuffixArray.h + + + RuleTable/PhraseDictionaryNodeSCFG.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryNodeSCFG.cpp + + + RuleTable/PhraseDictionaryNodeSCFG.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryNodeSCFG.h + + + RuleTable/PhraseDictionaryOnDisk.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryOnDisk.cpp + + + RuleTable/PhraseDictionaryOnDisk.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryOnDisk.h + + + RuleTable/PhraseDictionarySCFG.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionarySCFG.cpp + + + RuleTable/PhraseDictionarySCFG.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionarySCFG.h + + + RuleTable/Trie.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/Trie.cpp + + + RuleTable/Trie.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/Trie.h + + + RuleTable/UTrie.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/UTrie.cpp + + + RuleTable/UTrie.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/UTrie.h + + + RuleTable/UTrieNode.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/UTrieNode.cpp + + + RuleTable/UTrieNode.h + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/UTrieNode.h + + + RuleTable/bin + 2 + virtual:/virtual + + + Scope3Parser/ApplicableRuleTrie.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/ApplicableRuleTrie.cpp + + + Scope3Parser/ApplicableRuleTrie.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/ApplicableRuleTrie.h + + + Scope3Parser/IntermediateVarSpanNode.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/IntermediateVarSpanNode.h + + + Scope3Parser/Jamfile + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/Jamfile + + + Scope3Parser/Parser.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/Parser.cpp + + + Scope3Parser/Parser.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/Parser.h + + + Scope3Parser/SentenceMap.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/SentenceMap.h + + + Scope3Parser/StackLattice.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLattice.h + + + Scope3Parser/StackLatticeBuilder.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLatticeBuilder.cpp + + + Scope3Parser/StackLatticeBuilder.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLatticeBuilder.h + + + Scope3Parser/StackLatticeSearcher.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLatticeSearcher.h + + + Scope3Parser/VarSpanNode.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/VarSpanNode.h + + + Scope3Parser/VarSpanTrieBuilder.cpp + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/VarSpanTrieBuilder.cpp + + + Scope3Parser/VarSpanTrieBuilder.h + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/VarSpanTrieBuilder.h + + + Scope3Parser/bin + 2 + virtual:/virtual + + + bin/darwin-4.2.1 + 2 + virtual:/virtual + + + CYKPlusParser/bin/clang-darwin-4.2.1 + 2 + virtual:/virtual + + + CYKPlusParser/bin/darwin-4.2.1 + 2 + virtual:/virtual + + + DynSAInclude/bin/clang-darwin-4.2.1 + 2 + virtual:/virtual + + + DynSAInclude/bin/darwin-4.2.1 + 2 + virtual:/virtual + + + LM/bin/darwin-4.2.1 + 2 + virtual:/virtual + + + LM/bin/gcc-4.2.1 + 2 + virtual:/virtual + + + LM/bin/lm.log + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/lm.log + + + RuleTable/bin/darwin-4.2.1 + 2 + virtual:/virtual + + + Scope3Parser/bin/darwin-4.2.1 + 2 + virtual:/virtual + + + bin/darwin-4.2.1/release + 2 + virtual:/virtual + + + CYKPlusParser/bin/clang-darwin-4.2.1/release + 2 + virtual:/virtual + + + CYKPlusParser/bin/darwin-4.2.1/release + 2 + virtual:/virtual + + + DynSAInclude/bin/clang-darwin-4.2.1/release + 2 + virtual:/virtual + + + DynSAInclude/bin/darwin-4.2.1/release + 2 + virtual:/virtual + + + LM/bin/darwin-4.2.1/release + 2 + virtual:/virtual + + + LM/bin/gcc-4.2.1/release + 2 + virtual:/virtual + + + RuleTable/bin/darwin-4.2.1/release + 2 + virtual:/virtual + + + Scope3Parser/bin/darwin-4.2.1/release + 2 + virtual:/virtual + + + bin/darwin-4.2.1/release/debug-symbols-on + 2 + virtual:/virtual + + + CYKPlusParser/bin/clang-darwin-4.2.1/release/debug-symbols-on + 2 + virtual:/virtual + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on + 2 + virtual:/virtual + + + DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on + 2 + virtual:/virtual + + + DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on + 2 + virtual:/virtual + + + LM/bin/darwin-4.2.1/release/debug-symbols-on + 2 + virtual:/virtual + + + LM/bin/darwin-4.2.1/release/link-static + 2 + virtual:/virtual + + + LM/bin/gcc-4.2.1/release/debug-symbols-on + 2 + virtual:/virtual + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on + 2 + virtual:/virtual + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on + 2 + virtual:/virtual + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + CYKPlusParser/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi + 2 + virtual:/virtual + + + DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi + 2 + virtual:/virtual + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi + 2 + virtual:/virtual + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi + 2 + virtual:/virtual + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi + 2 + virtual:/virtual + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + CYKPlusParser/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ChartRuleLookupManagerCYKPlus.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ChartRuleLookupManagerCYKPlus.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ChartRuleLookupManagerMemory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ChartRuleLookupManagerMemory.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ChartRuleLookupManagerOnDisk.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ChartRuleLookupManagerOnDisk.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/DotChartInMemory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/DotChartInMemory.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/DotChartOnDisk.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/DotChartOnDisk.o + + + DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Base.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Base.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Factory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Factory.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/IRST.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/IRST.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Implementation.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Implementation.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Joint.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Joint.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Ken.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Ken.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/MultiFactor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/MultiFactor.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ParallelBackoff.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ParallelBackoff.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Remote.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Remote.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/SRI.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/SRI.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/SingleFactor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/SingleFactor.o + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi/Base.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/link-static/threading-multi/Base.o + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi/Factory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/link-static/threading-multi/Factory.o + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi/Implementation.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/link-static/threading-multi/Implementation.o + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi/Joint.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/link-static/threading-multi/Joint.o + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi/Ken.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/link-static/threading-multi/Ken.o + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi/MultiFactor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/link-static/threading-multi/MultiFactor.o + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi/Remote.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/link-static/threading-multi/Remote.o + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi/SingleFactor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/link-static/threading-multi/SingleFactor.o + + + LM/bin/darwin-4.2.1/release/link-static/threading-multi/libLM.a + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/link-static/threading-multi/libLM.a + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderCompact.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderCompact.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderFactory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderFactory.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderHiero.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderHiero.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderStandard.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderStandard.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryALSuffixArray.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryALSuffixArray.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryNodeSCFG.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryNodeSCFG.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryOnDisk.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryOnDisk.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionarySCFG.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionarySCFG.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Trie.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Trie.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/UTrie.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/UTrie.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/UTrieNode.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/UTrieNode.o + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ApplicableRuleTrie.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ApplicableRuleTrie.o + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Parser.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Parser.o + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/StackLatticeBuilder.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/StackLatticeBuilder.o + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/VarSpanTrieBuilder.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/VarSpanTrieBuilder.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/AlignmentInfo.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/AlignmentInfo.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/AlignmentInfoCollection.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/BilingualDynSuffixArray.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/BilingualDynSuffixArray.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/BitmapContainer.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/BitmapContainer.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartCell.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartCell.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartCellCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartCellCollection.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartHypothesis.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartHypothesis.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartHypothesisCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartHypothesisCollection.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartManager.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartManager.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOption.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOption.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptionCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptionCollection.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptionList.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTranslationOptionList.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTrellisDetour.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTrellisDetour.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTrellisDetourQueue.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTrellisDetourQueue.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTrellisNode.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTrellisNode.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTrellisPath.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartTrellisPath.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ConfusionNet.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ConfusionNet.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeFeature.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeFeature.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeGraph.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeGraph.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeStep.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeStep.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeStepGeneration.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeStepGeneration.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeStepTranslation.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DecodeStepTranslation.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Dictionary.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Dictionary.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DummyScoreProducers.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DummyScoreProducers.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude + 2 + virtual:/virtual + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSuffixArray.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSuffixArray.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FFState.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FFState.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Factor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Factor.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FactorCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FactorCollection.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FactorTypeSet.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FactorTypeSet.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FeatureFunction.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FeatureFunction.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/File.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/File.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FloydWarshall.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/FloydWarshall.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/GenerationDictionary.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/GenerationDictionary.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/GlobalLexicalModel.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/GlobalLexicalModel.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Hypothesis.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Hypothesis.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/HypothesisStack.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/HypothesisStack.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/HypothesisStackCubePruning.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/HypothesisStackCubePruning.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/HypothesisStackNormal.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/HypothesisStackNormal.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/InputFileStream.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/InputFileStream.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/InputType.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/InputType.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LMList.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LMList.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LVoc.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LVoc.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LexicalReordering.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LexicalReordering.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingState.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingState.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTable.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LexicalReorderingTable.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Manager.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Manager.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/NonTerminal.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/NonTerminal.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PCNTools.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PCNTools.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Parameter.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Parameter.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PartialTranslOptColl.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PartialTranslOptColl.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Phrase.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Phrase.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryDynSuffixArray.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryDynSuffixArray.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryMemory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryMemory.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryNode.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryNode.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryTree.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryTree.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryTreeAdaptor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryTreeAdaptor.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PrefixTreeMap.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PrefixTreeMap.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ReorderingConstraint.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ReorderingConstraint.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ReorderingStack.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ReorderingStack.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/RuleCube.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/RuleCube.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/RuleCubeItem.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/RuleCubeItem.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/RuleCubeQueue.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/RuleCubeQueue.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ScoreComponentCollection.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ScoreIndexManager.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ScoreIndexManager.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ScoreProducer.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ScoreProducer.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Search.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Search.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SearchCubePruning.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SearchCubePruning.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SearchNormal.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SearchNormal.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Sentence.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Sentence.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SentenceStats.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SentenceStats.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SquareMatrix.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SquareMatrix.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/StaticData.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/StaticData.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TargetPhrase.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TargetPhrase.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TargetPhraseCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TargetPhraseCollection.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ThreadPool.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ThreadPool.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Timer.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Timer.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOption.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOption.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollection.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionConfusionNet.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionConfusionNet.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionText.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOptionCollectionText.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOptionList.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationOptionList.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationSystem.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TranslationSystem.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TreeInput.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TreeInput.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TrellisPath.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TrellisPath.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TrellisPathCollection.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/TrellisPathCollection.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UserMessage.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UserMessage.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Util.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Util.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Word.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Word.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/WordLattice.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/WordLattice.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/WordsBitmap.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/WordsBitmap.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/WordsRange.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/WordsRange.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/XmlOption.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/XmlOption.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/hash.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/hash.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libmoses_internal.a + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libmoses_internal.a + + + CYKPlusParser/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DotChartOnDisk.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DotChartOnDisk.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerCYKPlus.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerCYKPlus.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerMemory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerMemory.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerOnDisk.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerOnDisk.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DotChartInMemory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DotChartInMemory.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DotChartOnDisk.o + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DotChartOnDisk.o + + + CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libCYKPlusParser.a + 1 + PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libCYKPlusParser.a + + + DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude + 2 + virtual:/virtual + + + DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libdynsa.a + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libdynsa.a + + + DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude + 2 + virtual:/virtual + + + DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libdynsa.a + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libdynsa.a + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Base.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Base.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Factory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Factory.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/IRST.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/IRST.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Implementation.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Implementation.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Joint.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Joint.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Ken.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Ken.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ORLM.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ORLM.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ParallelBackoff.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ParallelBackoff.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Rand.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Rand.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Remote.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Remote.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SRI.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SRI.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o + + + LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libLM.a + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libLM.a + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Base.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Base.o + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Factory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Factory.o + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Implementation.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Implementation.o + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Joint.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Joint.o + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Ken.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Ken.o + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/MultiFactor.o + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Remote.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/Remote.o + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/SingleFactor.o + + + LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/libLM.a + 1 + PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/libLM.a + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderCompact.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderCompact.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderFactory.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderFactory.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderHiero.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderHiero.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderStandard.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderStandard.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryALSuffixArray.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryALSuffixArray.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryNodeSCFG.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryNodeSCFG.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryOnDisk.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryOnDisk.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionarySCFG.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionarySCFG.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Trie.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Trie.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UTrie.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UTrie.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UTrieNode.o + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UTrieNode.o + + + RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libRuleTable.a + 1 + PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libRuleTable.a + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ApplicableRuleTrie.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ApplicableRuleTrie.o + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Parser.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Parser.o + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/StackLatticeBuilder.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/StackLatticeBuilder.o + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/VarSpanTrieBuilder.o + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/VarSpanTrieBuilder.o + + + Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libScope3Parser.a + 1 + PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libScope3Parser.a + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o + + + bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o + 1 + PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o + + + DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o + + + DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o + + + DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o + + + DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o + + + DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o + + + DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o + 1 + PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o + + + diff --git a/contrib/other-builds/util/.cproject b/contrib/other-builds/util/.cproject new file mode 100644 index 000000000..46e9a02b6 --- /dev/null +++ b/contrib/other-builds/util/.cproject @@ -0,0 +1,133 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/other-builds/util/.project b/contrib/other-builds/util/.project new file mode 100644 index 000000000..537def437 --- /dev/null +++ b/contrib/other-builds/util/.project @@ -0,0 +1,90 @@ + + + util + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + ?name? + + + + org.eclipse.cdt.make.core.append_environment + true + + + org.eclipse.cdt.make.core.autoBuildTarget + all + + + org.eclipse.cdt.make.core.buildArguments + + + + org.eclipse.cdt.make.core.buildCommand + make + + + org.eclipse.cdt.make.core.buildLocation + ${workspace_loc:/util/Debug} + + + org.eclipse.cdt.make.core.cleanBuildTarget + clean + + + org.eclipse.cdt.make.core.contents + org.eclipse.cdt.make.core.activeConfigSettings + + + org.eclipse.cdt.make.core.enableAutoBuild + false + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.enableFullBuild + true + + + org.eclipse.cdt.make.core.fullBuildTarget + all + + + org.eclipse.cdt.make.core.stopOnError + true + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + true + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + util + 2 + PARENT-3-PROJECT_LOC/util + + + diff --git a/lm/bhiksha.cc b/lm/bhiksha.cc index cdeafb478..870a4eee5 100644 --- a/lm/bhiksha.cc +++ b/lm/bhiksha.cc @@ -1,6 +1,7 @@ #include "lm/bhiksha.hh" #include "lm/config.hh" #include "util/file.hh" +#include "util/exception.hh" #include diff --git a/moses-cmd/src/IOWrapper.h b/moses-cmd/src/IOWrapper.h index e7936f33c..83c428d47 100644 --- a/moses-cmd/src/IOWrapper.h +++ b/moses-cmd/src/IOWrapper.h @@ -35,6 +35,7 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef moses_cmd_IOWrapper_h #define moses_cmd_IOWrapper_h +#include #include #include #include @@ -121,13 +122,13 @@ IOWrapper *GetIODevice(const Moses::StaticData &staticData); bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source); void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector &outputFactorOrder, bool reportSegmentation, bool reportAllFactors); void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector&, - const TranslationSystem* system, long translationId, bool reportSegmentation); + const Moses::TranslationSystem* system, long translationId, bool reportSegmentation); void OutputLatticeMBRNBest(std::ostream& out, const std::vector& solutions,long translationId); void OutputBestHypo(const std::vector& mbrBestHypo, long /*translationId*/, bool reportSegmentation, bool reportAllFactors, std::ostream& out); void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool reportSegmentation, bool reportAllFactors, std::ostream &out); -void OutputInput(std::ostream& os, const Hypothesis* hypo); -void OutputAlignment(OutputCollector* collector, size_t lineNo, const Hypothesis *hypo); -void OutputAlignment(OutputCollector* collector, size_t lineNo, const TrellisPath &path); +void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo); +void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo); +void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path); #endif diff --git a/moses/src/LM/ORLM.h b/moses/src/LM/ORLM.h index c1ccb4387..55adb9d82 100644 --- a/moses/src/LM/ORLM.h +++ b/moses/src/LM/ORLM.h @@ -5,7 +5,7 @@ #include "Factor.h" #include "Util.h" #include "LM/SingleFactor.h" -#include "onlineRLM.h" +#include "DynSAInclude/onlineRLM.h" //#include "multiOnlineRLM.h" #include "DynSAInclude/file.h" #include "DynSAInclude/vocab.h" diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 0db2ee437..f33067dc0 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -42,7 +42,7 @@ my $_CORES = 1; my $debug = 0; # debug this script, do not delete any files in debug mode # the following line is set installation time by 'make release'. BEWARE! -my $BINDIR="/home/hieu/workspace/bin/training-tools/"; +my $BINDIR="/Users/hieuhoang/workspace/bin/training-tools/"; $_HELP = 1 unless &GetOptions('root-dir=s' => \$_ROOT_DIR, diff --git a/util/bit_packing.cc b/util/bit_packing.cc index 41999b726..b5a14008b 100644 --- a/util/bit_packing.cc +++ b/util/bit_packing.cc @@ -10,7 +10,7 @@ template struct StaticCheck {}; template <> struct StaticCheck { typedef bool StaticAssertionPassed; }; // If your float isn't 4 bytes, we're hosed. -typedef StaticCheck::StaticAssertionPassed FloatSize; +//typedef StaticCheck::StaticAssertionPassed FloatSize; } // namespace From 6d1165654caf8edc995a41a4c6c9666e65ebce96 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Mon, 28 May 2012 20:15:58 +0100 Subject: [PATCH 30/38] script updates and added ems config help --- scripts/ems/example/config.basic | 3 +- scripts/ems/example/config.factored | 3 +- scripts/ems/example/config.hierarchical | 3 +- scripts/ems/example/config.syntax | 3 +- scripts/ems/example/config.toy | 3 +- scripts/generic/compound-splitter.perl | 174 ++++++++++++++++++++---- 6 files changed, 161 insertions(+), 28 deletions(-) diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic index c08f51764..939e13aad 100644 --- a/scripts/ems/example/config.basic +++ b/scripts/ems/example/config.basic @@ -260,7 +260,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored index 4bc198a6b..df9f28f33 100644 --- a/scripts/ems/example/config.factored +++ b/scripts/ems/example/config.factored @@ -280,7 +280,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical index b9858f393..6161f6ac4 100644 --- a/scripts/ems/example/config.hierarchical +++ b/scripts/ems/example/config.hierarchical @@ -260,7 +260,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax index 7c97b9ac4..635585844 100644 --- a/scripts/ems/example/config.syntax +++ b/scripts/ems/example/config.syntax @@ -264,7 +264,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy index 140a45229..7b8c95faa 100644 --- a/scripts/ems/example/config.toy +++ b/scripts/ems/example/config.toy @@ -244,7 +244,8 @@ script = $moses-script-dir/training/train-model.perl ### general options # these are options that are passed on to train-model.perl, for instance # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza -# * "-sort-buffer-size 8G" to reduce on-disk sorting +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building # #training-options = "" diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index ced661e3f..9948c648e 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -8,15 +8,23 @@ my $FILLER = ":s:es"; my $MIN_SIZE = 3; my $MIN_COUNT = 5; my $MAX_COUNT = 5; +my $FACTORED = 0; +my $SYNTAX = 0; +my $MARK_SPLIT = 0; +my $BINARIZE = 0; $HELP = 1 unless &GetOptions('corpus=s' => \$CORPUS, 'model=s' => \$MODEL, 'filler=s' => \$FILLER, + 'factored' => \$FACTORED, 'min-size=i' => \$MIN_SIZE, 'min-count=i' => \$MIN_COUNT, 'max-count=i' => \$MAX_COUNT, 'help' => \$HELP, 'verbose' => \$VERBOSE, + 'syntax' => \$SYNTAX, + 'binarize' => \$BINARIZE, + 'mark-split' => \$MARK_SPLIT, 'train' => \$TRAIN); if ($HELP || @@ -29,59 +37,152 @@ if ($HELP || print "options: -min-size: minimum word size (default $MIN_SIZE)\n"; print " -min-count: minimum word count (default $MIN_COUNT)\n"; print " -filler: filler letters between words (default $FILLER)\n"; + print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n"; + print " -syntax: syntactically parsed data (default $SYNTAX)\n"; + print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n"; + print " -binarize: binarize subtree for split word (default $BINARIZE)\n"; exit; } if ($TRAIN) { - &train; + if ($SYNTAX) { &train_syntax(); } + elsif ($FACTORED) { &train_factored(); } + else { &train(); } } else { - &apply; + &apply(); } sub train { - my %WORD; + my %COUNT; open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); while() { chop; s/\s+/ /g; s/^ //; s/ $//; foreach (split) { - $WORD{$_}++; + $COUNT{$_}++; } } - close($CORPUS); + close(CORPUS); + &save_trained_model(\%COUNT); +} + +sub save_trained_model { + my ($COUNT) = @_; my $id = 0; open(MODEL,">".$MODEL); - foreach my $word (keys %WORD) { - print MODEL "".(++$id)."\t".$word."\t".$WORD{$word}."\n"; + foreach my $word (keys %$COUNT) { + print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n"; } close(MODEL); - print STDERR "written model file with ".(scalar keys %WORD)." words.\n"; + print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n"; +} + +sub train_factored { + my (%COUNT,%FACTORED_COUNT); + # collect counts for interpretations for each surface word + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while() { + chop; s/\s+/ /g; s/^ //; s/ $//; + foreach my $factored_word (split) { + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + $FACTORED_COUNT{$word}{$factored_word}++; + } + } + close(CORPUS); + # only preserve most frequent interpretation, assign sum of counts + foreach my $word (keys %FACTORED_COUNT) { + my ($max,$best,$total) = (0,"",0); + foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) { + my $count = $FACTORED_COUNT{$word}{$factored_word}; + $total += $count; + if ($count > $max) { + $max = $count; + $best = $factored_word; + } + } + $COUNT{$best} = $total; + } + &save_trained_model(\%COUNT); +} + +sub train_syntax { + my (%COUNT,%LABELED_COUNT); + # collect counts for interpretations for each surface word + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while() { + chop; s/\s+/ /g; s/^ //; s/ $//; + my $label; + foreach (split) { + if (/^label="([^\"]+)"/) { + $label = $1; + } + elsif (! /^ $max) { + $max = $count; + $best = "$word $label"; + } + } + $COUNT{$best} = $total; + } + &save_trained_model(\%COUNT); } sub apply { - my (%WORD,%TRUECASE); + my (%COUNT,%TRUECASE,%LABEL); open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'"); while() { chomp; - my ($id,$word,$count) = split(/\t/); + my ($id,$factored_word,$count) = split(/\t/); + my $label; + ($factored_word,$label) = split(/ /,$factored_word); + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor my $lc = lc($word); # if word exists with multipe casings, only record most frequent - next if defined($WORD{$lc}) && $WORD{$lc} > $count; - $WORD{$lc} = $count; - $TRUECASE{$lc} = $word; + next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; + $COUNT{$lc} = $count; + $TRUECASE{$lc} = $factored_word; + $LABEL{$lc} = $label if $SYNTAX; } close(MODEL); while() { my $first = 1; chop; s/\s+/ /g; s/^ //; s/ $//; - foreach my $word (split) { + my @BUFFER; # for xml tags + foreach my $factored_word (split) { print " " unless $first; $first = 0; + # syntax: don't split xml + if ($SYNTAX && ($factored_word =~ /^$/)) { + push @BUFFER,$factored_word; + $first = 1; + next; + } + + # get case class + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + # don't split frequent words - if (defined($WORD{$word}) && $WORD{$word}>=$MAX_COUNT) { - print $word; + if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) { + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; next; } @@ -100,17 +201,18 @@ sub apply { my $subword = lc(substr($word, $start+length($filler), $end-$start+1-length($filler))); - next unless defined($WORD{$subword}); - next unless $WORD{$subword} >= $MIN_COUNT; - print STDERR "\tmatching word $start .. $end ($filler)$subword $WORD{$subword}\n" if $VERBOSE; - push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $WORD{$subword}"; + next unless defined($COUNT{$subword}); + next unless $COUNT{$subword} >= $MIN_COUNT; + print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE; + push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}"; } } } # no matches at all? if (!defined($REACHABLE{$final})) { - print $word; + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; next; } @@ -152,9 +254,35 @@ sub apply { last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final}; for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; } } - $best_split = $word unless $best_split =~ / /; # do not change case for unsplit words - print $best_split; + if ($best_split !~ / /) { + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $word; # do not change case for unsplit words + next; + } + if (!$SYNTAX) { + print $best_split; + } + else { + $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT; + $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n"); + my $pos = $1; + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + + my @SPLIT = split(/ /,$best_split); + my @OUT = (); + if ($BINARIZE) { + for(my $w=0;$w"; + } + } + for(my $w=0;$w=2) { push @OUT, ""; } + push @OUT," $SPLIT[$w] "; + } + print join(" ",@OUT); + } } + print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer print "\n"; } } From fd577d7a65cab923b9102d61873a032654d573a1 Mon Sep 17 00:00:00 2001 From: Colin Cherry Date: Tue, 29 May 2012 13:38:57 -0400 Subject: [PATCH 31/38] Batch k-best MIRA is written and integrated into mert-moses.pl Regression tests all check out, and kbmira seems to work fine on a Hansard French->English task. HypPackEnumerator class may be of interest to pro.cpp and future optimizers, as it abstracts a lot of the boilerplate involved in enumerating multiple k-best lists. MiraWeightVector is not really mira-specific - just a weight vector that enables efficient averaging. Could be useful to a perceptron as well. Same goes for MiraFeatureVector. Interaction with sparse features is written, but untested. --- .gitignore | 1 + mert/BleuScorer.cpp | 41 +++++ mert/BleuScorer.h | 10 ++ mert/BleuScorerTest.cpp | 6 +- mert/DataTest.cpp | 4 +- mert/FeatureDataIterator.cpp | 11 ++ mert/FeatureDataIterator.h | 3 + mert/FeatureDataTest.cpp | 4 +- mert/FeatureStats.cpp | 39 +++++ mert/FeatureStats.h | 10 +- mert/HypPackEnumerator.cpp | 187 +++++++++++++++++++++ mert/HypPackEnumerator.h | 101 +++++++++++ mert/Jamfile | 7 +- mert/MiraFeatureVector.cpp | 144 ++++++++++++++++ mert/MiraFeatureVector.h | 51 ++++++ mert/MiraWeightVector.cpp | 143 ++++++++++++++++ mert/MiraWeightVector.h | 106 ++++++++++++ mert/kbmira.cpp | 298 +++++++++++++++++++++++++++++++++ scripts/training/mert-moses.pl | 25 ++- 19 files changed, 1181 insertions(+), 10 deletions(-) create mode 100644 mert/HypPackEnumerator.cpp create mode 100644 mert/HypPackEnumerator.h create mode 100644 mert/MiraFeatureVector.cpp create mode 100644 mert/MiraFeatureVector.h create mode 100644 mert/MiraWeightVector.cpp create mode 100644 mert/MiraWeightVector.h create mode 100644 mert/kbmira.cpp diff --git a/.gitignore b/.gitignore index d4493bce1..0d6997e8d 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ mert/extractor mert/mert mert/megam_i686.opt mert/pro +mert/kbmira misc/processLexicalTable misc/processPhraseTable misc/queryLexicalTable diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp index 22ce81798..a8a0256f2 100644 --- a/mert/BleuScorer.cpp +++ b/mert/BleuScorer.cpp @@ -232,3 +232,44 @@ float sentenceLevelBleuPlusOne(const vector& stats) { } return exp(logbleu); } + +float sentenceLevelBackgroundBleu(const std::vector& sent, const std::vector& bg) +{ + // Sum sent and background + std::vector stats; + CHECK(sent.size()==bg.size()); + CHECK(sent.size()==kBleuNgramOrder*2+1); + for(size_t i=0;i& stats) { + CHECK(stats.size() == kBleuNgramOrder * 2 + 1); + + float logbleu = 0.0; + for (int j = 0; j < kBleuNgramOrder; j++) { + logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]); + } + logbleu /= kBleuNgramOrder; + const float brevity = 1.0 - stats[(kBleuNgramOrder * 2)] / stats[1]; + + if (brevity < 0.0) { + logbleu += brevity; + } + return exp(logbleu); +} diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 1f568f744..8f1384f5a 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -70,4 +70,14 @@ private: */ float sentenceLevelBleuPlusOne(const std::vector& stats); +/** Computes sentence-level BLEU score given a background corpus. + * This function is used in batch MIRA. + */ +float sentenceLevelBackgroundBleu(const std::vector& sent, const std::vector& bg); + +/** + * Computes plain old BLEU from a vector of stats + */ +float unsmoothedBleu(const std::vector& stats); + #endif // MERT_BLEU_SCORER_H_ diff --git a/mert/BleuScorerTest.cpp b/mert/BleuScorerTest.cpp index 5a7de9654..5960507e8 100644 --- a/mert/BleuScorerTest.cpp +++ b/mert/BleuScorerTest.cpp @@ -152,10 +152,10 @@ BOOST_AUTO_TEST_CASE(bleu_count_ngrams) { // "girl with a telescope", "with a telescope ." NgramCounts counts; BOOST_REQUIRE(scorer.CountNgrams(line, counts, kBleuNgramOrder) == 8); - BOOST_CHECK_EQUAL(25, counts.size()); + BOOST_CHECK_EQUAL((std::size_t)25, counts.size()); mert::Vocabulary* vocab = scorer.GetVocab(); - BOOST_CHECK_EQUAL(7, vocab->size()); + BOOST_CHECK_EQUAL((std::size_t)7, vocab->size()); std::vector res; Tokenize(line.c_str(), ' ', &res); @@ -203,7 +203,7 @@ BOOST_AUTO_TEST_CASE(bleu_clipped_counts) { ScoreStats entry; scorer.prepareStats(0, line, entry); - BOOST_CHECK_EQUAL(entry.size(), 2 * kBleuNgramOrder + 1); + BOOST_CHECK_EQUAL(entry.size(), (std::size_t)(2 * kBleuNgramOrder + 1)); // Test hypothesis ngram counts BOOST_CHECK_EQUAL(entry.get(0), 5); // unigram diff --git a/mert/DataTest.cpp b/mert/DataTest.cpp index b538c99cb..e94d4ffe9 100644 --- a/mert/DataTest.cpp +++ b/mert/DataTest.cpp @@ -33,8 +33,8 @@ BOOST_AUTO_TEST_CASE(shard_basic) { std::vector shards; data.createShards(2,0,"",shards); - BOOST_CHECK_EQUAL(shards.size(),2); - BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),2); + BOOST_CHECK_EQUAL(shards.size(),(std::size_t)2); + BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),(std::size_t)2); } BOOST_AUTO_TEST_CASE(init_feature_map_test) { diff --git a/mert/FeatureDataIterator.cpp b/mert/FeatureDataIterator.cpp index 00b59bc38..c0ace87e6 100644 --- a/mert/FeatureDataIterator.cpp +++ b/mert/FeatureDataIterator.cpp @@ -18,6 +18,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include #include +#include #include "util/tokenize_piece.hh" @@ -47,6 +48,16 @@ float ParseFloat(const StringPiece& str) { return value; } +bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2) { + return item1.dense==item1.dense && item1.sparse==item1.sparse; +} + +size_t hash_value(FeatureDataItem const& item) { + size_t seed = 0; + boost::hash_combine(seed,item.dense); + boost::hash_combine(seed,item.sparse); + return seed; +} FeatureDataIterator::FeatureDataIterator() {} diff --git a/mert/FeatureDataIterator.h b/mert/FeatureDataIterator.h index 58345829c..9bc5f03f7 100644 --- a/mert/FeatureDataIterator.h +++ b/mert/FeatureDataIterator.h @@ -61,6 +61,9 @@ class FeatureDataItem SparseVector sparse; }; +bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2); +std::size_t hash_value(FeatureDataItem const& item); + class FeatureDataIterator : public boost::iterator_facade, diff --git a/mert/FeatureDataTest.cpp b/mert/FeatureDataTest.cpp index 49c9d0fd5..ed70f7971 100644 --- a/mert/FeatureDataTest.cpp +++ b/mert/FeatureDataTest.cpp @@ -13,7 +13,7 @@ void CheckFeatureMap(const FeatureData* feature_data, std::stringstream ss; ss << str << "_" << i; const std::string& s = ss.str(); - BOOST_CHECK_EQUAL(feature_data->getFeatureIndex(s), *cnt); + BOOST_CHECK_EQUAL(feature_data->getFeatureIndex(s), (std::size_t)(*cnt)); BOOST_CHECK_EQUAL(feature_data->getFeatureName(*cnt).c_str(), s); ++(*cnt); } @@ -35,6 +35,6 @@ BOOST_AUTO_TEST_CASE(set_feature_map) { CheckFeatureMap(&feature_data, "lm", 2, &cnt); CheckFeatureMap(&feature_data, "tm", 5, &cnt); - BOOST_CHECK_EQUAL(feature_data.getFeatureIndex("w_0"), cnt); + BOOST_CHECK_EQUAL(feature_data.getFeatureIndex("w_0"), (std::size_t)cnt); BOOST_CHECK_EQUAL(feature_data.getFeatureName(cnt).c_str(), "w_0"); } diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp index 5d7c5c7b4..2c6cdb88f 100644 --- a/mert/FeatureStats.cpp +++ b/mert/FeatureStats.cpp @@ -10,6 +10,8 @@ #include #include +#include + #include "Util.h" using namespace std; @@ -81,6 +83,43 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) { return res; } +std::vector SparseVector::feats() const { + std::vector toRet; + for(fvector_t::const_iterator iter = m_fvector.begin(); + iter!=m_fvector.end(); + iter++) { + toRet.push_back(iter->first); + } + return toRet; +} + +std::size_t SparseVector::encode(const std::string& name) { + name2id_t::const_iterator name2id_iter = m_name_to_id.find(name); + size_t id = 0; + if (name2id_iter == m_name_to_id.end()) { + id = m_id_to_name.size(); + m_id_to_name.push_back(name); + m_name_to_id[name] = id; + } else { + id = name2id_iter->second; + } + return id; +} + +std::string SparseVector::decode(std::size_t id) { + return m_id_to_name[id]; +} + +bool operator==(SparseVector const& item1, SparseVector const& item2) { + return item1.m_fvector==item2.m_fvector; +} + +std::size_t hash_value(SparseVector const& item) { + boost::hash hasher; + return hasher(item.m_fvector); +} + + FeatureStats::FeatureStats() : m_available_size(kAvailableSize), m_entries(0), m_array(new FeatureStatsType[m_available_size]) {} diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h index 69591001b..2a4e9882c 100644 --- a/mert/FeatureStats.h +++ b/mert/FeatureStats.h @@ -28,11 +28,19 @@ public: void set(const std::string& name, FeatureStatsType value); void clear(); std::size_t size() const { return m_fvector.size(); } - + void write(std::ostream& out, const std::string& sep = " ") const; SparseVector& operator-=(const SparseVector& rhs); + // Added by cherryc + std::vector feats() const; + friend bool operator==(SparseVector const& item1, SparseVector const& item2); + friend std::size_t hash_value(SparseVector const& item); + static std::size_t encode(const std::string& feat); + static std::string decode(std::size_t feat); + // End added by cherryc + private: static name2id_t m_name_to_id; static id2name_t m_id_to_name; diff --git a/mert/HypPackEnumerator.cpp b/mert/HypPackEnumerator.cpp new file mode 100644 index 000000000..9da627212 --- /dev/null +++ b/mert/HypPackEnumerator.cpp @@ -0,0 +1,187 @@ +#include "HypPackEnumerator.h" + +#include +#include +#include + +StreamingHypPackEnumerator::StreamingHypPackEnumerator +( + vector const& featureFiles, + vector const& scoreFiles + ) + : m_featureFiles(featureFiles), + m_scoreFiles(scoreFiles) +{ + if (scoreFiles.size() == 0 || featureFiles.size() == 0) { + cerr << "No data to process" << endl; + exit(0); + } + + if (featureFiles.size() != scoreFiles.size()) { + cerr << "Error: Number of feature files (" << featureFiles.size() << + ") does not match number of score files (" << scoreFiles.size() << ")" << endl; + exit(1); + } + + m_num_lists = scoreFiles.size(); + m_primed = false; + m_iNumDense = -1; +} + +size_t StreamingHypPackEnumerator::num_dense() const { + if(m_iNumDense<0) { + cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl; + exit(1); + } + return (size_t) m_iNumDense; +} + +void StreamingHypPackEnumerator::prime(){ + m_current_indexes.clear(); + boost::unordered_set seen; + m_primed = true; + + for (size_t i = 0; i < m_num_lists; ++i) { + if (m_featureDataIters[i] == FeatureDataIterator::end()) { + cerr << "Error: Feature file " << i << " ended prematurely" << endl; + exit(1); + } + if (m_scoreDataIters[i] == ScoreDataIterator::end()) { + cerr << "Error: Score file " << i << " ended prematurely" << endl; + exit(1); + } + if (m_featureDataIters[i]->size() != m_scoreDataIters[i]->size()) { + cerr << "Error: For sentence " << m_sentenceId << " features and scores have different size" << endl; + exit(1); + } + for (size_t j = 0; j < m_featureDataIters[i]->size(); ++j) { + FeatureDataItem item = m_featureDataIters[i]->operator[](j); + // Dedup + if(seen.find(item)==seen.end()) { + seen.insert(item); + // Confirm dense features are always the same + int iDense = item.dense.size(); + if(m_iNumDense != iDense) { + if(m_iNumDense==-1) m_iNumDense = iDense; + else { + cerr << "Error: expecting constant number of dense features: " + << m_iNumDense << " != " << iDense << endl; + exit(1); + } + } + // Store item for retrieval + m_current_indexes.push_back(pair(i,j)); + } + } + } +} + +void StreamingHypPackEnumerator::reset(){ + m_featureDataIters.clear(); + m_scoreDataIters.clear(); + for (size_t i = 0; i < m_num_lists; ++i) { + m_featureDataIters.push_back(FeatureDataIterator(m_featureFiles[i])); + m_scoreDataIters.push_back(ScoreDataIterator(m_scoreFiles[i])); + } + m_sentenceId=0; + prime(); +} + +bool StreamingHypPackEnumerator::finished(){ + return m_featureDataIters[0]==FeatureDataIterator::end(); +} + +void StreamingHypPackEnumerator::next(){ + if(!m_primed) { + cerr << "Enumerating an unprimed HypPackEnumerator" << endl; + exit(1); + } + for (size_t i = 0; i < m_num_lists; ++i) { + ++m_featureDataIters[i]; + ++m_scoreDataIters[i]; + } + m_sentenceId++; + if(!finished()) prime(); +} + +size_t StreamingHypPackEnumerator::cur_size(){ + if(!m_primed) { + cerr << "Querying size from an unprimed HypPackEnumerator" << endl; + exit(1); + } + return m_current_indexes.size(); +} + +const FeatureDataItem& StreamingHypPackEnumerator::featuresAt(size_t index){ + if(!m_primed) { + cerr << "Querying features from an unprimed HypPackEnumerator" << endl; + exit(1); + } + const pair& pij = m_current_indexes[index]; + return m_featureDataIters[pij.first]->operator[](pij.second); +} + +const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) { + if(!m_primed) { + cerr << "Querying scores from an unprimed HypPackEnumerator" << endl; + exit(1); + } + const pair& pij = m_current_indexes[index]; + return m_scoreDataIters[pij.first]->operator[](pij.second); +} + +/* --------- RandomAccessHypPackEnumerator ------------- */ + +RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector const& featureFiles, + vector const& scoreFiles, + bool no_shuffle) +{ + StreamingHypPackEnumerator train(featureFiles,scoreFiles); + size_t index=0; + for(train.reset(); !train.finished(); train.next()) { + m_features.push_back(vector()); + m_scores.push_back(vector()); + for(size_t j=0;j= m_indexes.size(); +} +void RandomAccessHypPackEnumerator::next() { + m_cur_index++; +} + +size_t RandomAccessHypPackEnumerator::cur_size() { + assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size()); + return m_features[m_indexes[m_cur_index]].size(); +} +const FeatureDataItem& RandomAccessHypPackEnumerator::featuresAt(size_t i) { + return m_features[m_indexes[m_cur_index]][i]; +} +const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i) { + return m_scores[m_indexes[m_cur_index]][i]; +} + + +// --Emacs trickery-- +// Local Variables: +// mode:c++ +// c-basic-offset:2 +// End: diff --git a/mert/HypPackEnumerator.h b/mert/HypPackEnumerator.h new file mode 100644 index 000000000..9f44c3372 --- /dev/null +++ b/mert/HypPackEnumerator.h @@ -0,0 +1,101 @@ +/* + * HypPackCollection.h + * kbmira - k-best Batch MIRA + * + * Abstracts away the mess of iterating through multiple + * collections of k-best lists, as well as deduping + */ + +#ifndef MERT_HYP_PACK_COLLECTION_H +#define MERT_HYP_PACK_COLLECTION_H + +#include +#include +#include + +#include "FeatureDataIterator.h" +#include "ScoreDataIterator.h" + +using namespace std; + +// Start with these abstract classes + +class HypPackEnumerator { +public: + virtual void reset() = 0; + virtual bool finished() = 0; + virtual void next() = 0; + + virtual size_t cur_size() = 0; + virtual size_t num_dense() const = 0; + virtual const FeatureDataItem& featuresAt(size_t i) = 0; + virtual const ScoreDataItem& scoresAt(size_t i) = 0; +}; + +// Instantiation that streams from disk +// Low-memory, low-speed, sequential access +class StreamingHypPackEnumerator : public HypPackEnumerator { +public: + StreamingHypPackEnumerator(vector const& featureFiles, + vector const& scoreFiles + ); + + virtual size_t num_dense() const; + + virtual void reset(); + virtual bool finished(); + virtual void next(); + + virtual size_t cur_size(); + virtual const FeatureDataItem& featuresAt(size_t i); + virtual const ScoreDataItem& scoresAt(size_t i); + +private: + void prime(); + size_t m_num_lists; + size_t m_sentenceId; + vector m_featureFiles; + vector m_scoreFiles; + + bool m_primed; + int m_iNumDense; + vector m_featureDataIters; + vector m_scoreDataIters; + vector > m_current_indexes; +}; + +// Instantiation that reads into memory +// High-memory, high-speed, random access +// (Actually randomizes with each call to reset) +class RandomAccessHypPackEnumerator : public HypPackEnumerator { +public: + RandomAccessHypPackEnumerator(vector const& featureFiles, + vector const& scoreFiles, + bool no_shuffle); + + virtual size_t num_dense() const; + + virtual void reset(); + virtual bool finished(); + virtual void next(); + + virtual size_t cur_size(); + virtual const FeatureDataItem& featuresAt(size_t i); + virtual const ScoreDataItem& scoresAt(size_t i); + +private: + bool m_no_shuffle; + size_t m_cur_index; + size_t m_num_dense; + vector m_indexes; + vector > m_features; + vector > m_scores; +}; + +#endif // MERT_HYP_PACK_COLLECTION_H + +// --Emacs trickery-- +// Local Variables: +// mode:c++ +// c-basic-offset:2 +// End: diff --git a/mert/Jamfile b/mert/Jamfile index 2eaa7143c..00219f878 100644 --- a/mert/Jamfile +++ b/mert/Jamfile @@ -15,6 +15,9 @@ FeatureStats.cpp FeatureArray.cpp FeatureData.cpp FeatureDataIterator.cpp +MiraFeatureVector.cpp +MiraWeightVector.cpp +HypPackEnumerator.cpp Data.cpp BleuScorer.cpp SemposScorer.cpp @@ -52,7 +55,9 @@ exe evaluator : evaluator.cpp mert_lib ; exe pro : pro.cpp mert_lib ..//boost_program_options ; -alias programs : mert extractor evaluator pro ; +exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ; + +alias programs : mert extractor evaluator pro kbmira ; unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ; unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ; diff --git a/mert/MiraFeatureVector.cpp b/mert/MiraFeatureVector.cpp new file mode 100644 index 000000000..9636b2fcd --- /dev/null +++ b/mert/MiraFeatureVector.cpp @@ -0,0 +1,144 @@ +#include + +#include "MiraFeatureVector.h" + +MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec) + : m_dense(vec.dense) +{ + vector sparseFeats = vec.sparse.feats(); + bool bFirst = true; + size_t lastFeat = 0; + for(size_t i=0;i=feat) { + cerr << "Error: Feature indeces must be strictly ascending coming out of SparseVector" << endl; + exit(1); + } + } + lastFeat = feat; + } +} + +MiraFeatureVector::MiraFeatureVector(const MiraFeatureVector& other) + : m_dense(other.m_dense), + m_sparseFeats(other.m_sparseFeats), + m_sparseVals(other.m_sparseVals) +{ + if(m_sparseVals.size()!=m_sparseFeats.size()) { + cerr << "Error: mismatching sparse feat and val sizes" << endl; + exit(1); + } +} + +MiraFeatureVector::MiraFeatureVector(const vector& dense, + const vector& sparseFeats, + const vector& sparseVals) + : m_dense(dense), + m_sparseFeats(sparseFeats), + m_sparseVals(sparseVals) +{ + if(m_sparseVals.size()!=m_sparseFeats.size()) { + cerr << "Error: mismatching sparse feat and val sizes" << endl; + exit(1); + } +} + +ValType MiraFeatureVector::val(size_t index) const { + if(index < m_dense.size()) + return m_dense[index]; + else + return m_sparseVals[index]; +} + +size_t MiraFeatureVector::feat(size_t index) const { + if(index < m_dense.size()) + return index; + else + return m_sparseFeats[index]; +} + +size_t MiraFeatureVector::size() const { + return m_dense.size() + m_sparseVals.size(); +} + +ValType MiraFeatureVector::sqrNorm() const { + ValType toRet = 0.0; + for(size_t i=0;i dense; + if(a.m_dense.size()!=b.m_dense.size()) { + cerr << "Mismatching dense vectors passed to MiraFeatureVector subtraction" << endl; + exit(1); + } + for(size_t i=0;i sparseVals; + vector sparseFeats; + while(i < a.m_sparseFeats.size() && j < b.m_sparseFeats.size()) { + + if(a.m_sparseFeats[i] < b.m_sparseFeats[j]) { + sparseFeats.push_back(a.m_sparseFeats[i]); + sparseVals.push_back(a.m_sparseVals[i]); + i++; + } + + else if(b.m_sparseFeats[j] < a.m_sparseFeats[i]) { + sparseFeats.push_back(b.m_sparseFeats[j]); + sparseVals.push_back(-b.m_sparseVals[j]); + j++; + } + + else { + ValType newVal = a.m_sparseVals[i] - b.m_sparseVals[j]; + if(abs(newVal)>1e-6) { + sparseFeats.push_back(a.m_sparseFeats[i]); + sparseVals.push_back(newVal); + } + i++; + j++; + } + } + + while(i + +#include "FeatureDataIterator.h" + +using namespace std; + +typedef FeatureStatsType ValType; + +class MiraFeatureVector { +public: + MiraFeatureVector(const FeatureDataItem& vec); + MiraFeatureVector(const MiraFeatureVector& other); + MiraFeatureVector(const vector& dense, + const vector& sparseFeats, + const vector& sparseVals); + + ValType val(size_t index) const; + size_t feat(size_t index) const; + size_t size() const; + ValType sqrNorm() const; + + friend MiraFeatureVector operator-(const MiraFeatureVector& a, + const MiraFeatureVector& b); + +private: + vector m_dense; + vector m_sparseFeats; + vector m_sparseVals; +}; + +#endif // MERT_FEATURE_VECTOR_H + +// --Emacs trickery-- +// Local Variables: +// mode:c++ +// c-basic-offset:2 +// End: diff --git a/mert/MiraWeightVector.cpp b/mert/MiraWeightVector.cpp new file mode 100644 index 000000000..8b46044fa --- /dev/null +++ b/mert/MiraWeightVector.cpp @@ -0,0 +1,143 @@ +#include "MiraWeightVector.h" + +/** + * Constructor, initializes to the zero vector + */ +MiraWeightVector::MiraWeightVector() + : m_weights(), + m_totals(), + m_lastUpdated() +{ + m_numUpdates = 0; +} + +/** + * Constructor with provided initial vector + * \param init Initial feature values + */ +MiraWeightVector::MiraWeightVector(const vector& init) + : m_weights(init), + m_totals(init), + m_lastUpdated(init.size(), 0) +{ + m_numUpdates = 0; +} + +/** + * Update a the model + * \param fv Feature vector to be added to the weights + * \param tau FV will be scaled by this value before update + */ +void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) { + m_numUpdates++; + for(size_t i=0;ifixTotals(); + return AvgWeightVector(*this); +} + +/** + * Updates a weight and lazily updates its total + */ +void MiraWeightVector::update(size_t index, ValType delta) { + + // Handle previously unseen weights + while(index>=m_weights.size()) { + m_weights.push_back(0.0); + m_totals.push_back(0.0); + m_lastUpdated.push_back(0); + } + + // Book keeping for w = w + delta + m_totals[index] += (m_numUpdates - m_lastUpdated[index]) * m_weights[index] + delta; + m_weights[index] += delta; + m_lastUpdated[index] = m_numUpdates; +} + +/** + * Make sure everyone's total is up-to-date + */ +void MiraWeightVector::fixTotals() { + for(size_t i=0; i + +#include "MiraFeatureVector.h" + +using namespace std; + +class AvgWeightVector; + +class MiraWeightVector { +public: + /** + * Constructor, initializes to the zero vector + */ + MiraWeightVector(); + + /** + * Constructor with provided initial vector + * \param init Initial feature values + */ + MiraWeightVector(const vector& init); + + /** + * Update a the model + * \param fv Feature vector to be added to the weights + * \param tau FV will be scaled by this value before update + */ + void update(const MiraFeatureVector& fv, float tau); + + /** + * Perform an empty update (affects averaging) + */ + void tick(); + + /** + * Score a feature vector according to the model + * \param fv Feature vector to be scored + */ + ValType score(const MiraFeatureVector& fv) const; + + /** + * Squared norm of the weight vector + */ + ValType sqrNorm() const; + + /** + * Return an averaged view of this weight vector + */ + AvgWeightVector avg(); + + friend class AvgWeightVector; + +private: + /** + * Updates a weight and lazily updates its total + */ + void update(size_t index, ValType delta); + + /** + * Make sure everyone's total is up-to-date + */ + void fixTotals(); + + /** + * Helper to handle out-of-range weights + */ + ValType weight(size_t index) const; + + vector m_weights; + vector m_totals; + vector m_lastUpdated; + size_t m_numUpdates; +}; + +/** + * Averaged view of a weight vector + */ +class AvgWeightVector { +public: + AvgWeightVector(const MiraWeightVector& wv); + ValType score(const MiraFeatureVector& fv) const; + ValType weight(size_t index) const; + size_t size() const; +private: + const MiraWeightVector& m_wv; +}; + + +#endif // MERT_WEIGHT_VECTOR_H + +// --Emacs trickery-- +// Local Variables: +// mode:c++ +// c-basic-offset:2 +// End: diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp new file mode 100644 index 000000000..fa01b41a2 --- /dev/null +++ b/mert/kbmira.cpp @@ -0,0 +1,298 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** + +***********************************************************************/ + +/** + * k-best Batch Mira, as described in: + * + * Colin Cherry and George Foster + * Batch Tuning Strategies for Statistical Machine Translation + * NAACL 2012 + * + * Implemented by colin.cherry@nrc-cnrc.gc.ca + * + * To license implementations of any of the other tuners in that paper, + * please get in touch with any member of NRC Canada's Portage project + * + * Input is a set of n-best lists, encoded as feature and score files. + * + * Output is a weight file that results from running MIRA on these + * n-btest lists for J iterations. Will return the set that maximizes + * training BLEU. + **/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "BleuScorer.h" +#include "HypPackEnumerator.h" +#include "MiraFeatureVector.h" +#include "MiraWeightVector.h" + +using namespace std; + +namespace po = boost::program_options; + +ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv) { + vector stats(kBleuNgramOrder*2+1,0); + for(train->reset(); !train->finished(); train->next()) { + // Find max model + size_t max_index=0; + ValType max_score=0; + for(size_t i=0;icur_size();i++) { + MiraFeatureVector vec(train->featuresAt(i)); + ValType score = wv.score(vec); + if(i==0 || score > max_score) { + max_index = i; + max_score = score; + } + } + // Update stats + const vector& sent = train->scoresAt(max_index); + for(size_t i=0;i scoreFiles; + vector featureFiles; + int seed; + string outputFile; + float c = 0.01; // Step-size cap C + float decay = 0.999; // Pseudo-corpus decay \gamma + int n_iters = 60; // Max epochs J + bool streaming = false; // Stream all k-best lists? + bool no_shuffle = false; // Don't shuffle, even for in memory version + bool model_bg = false; // Use model for background corpus + + // Command-line processing follows pro.cpp + po::options_description desc("Allowed options"); + desc.add_options() + ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") + ("scfile,S", po::value >(&scoreFiles), "Scorer data files") + ("ffile,F", po::value > (&featureFiles), "Feature data files") + ("random-seed,r", po::value(&seed), "Seed for random number generation") + ("output-file,o", po::value(&outputFile), "Output file") + ("cparam,C", po::value(&c), "MIRA C-parameter, lower for more regularization (default 0.01)") + ("decay,D", po::value(&decay), "BLEU background corpus decay rate (default 0.999)") + ("iters,J", po::value(&n_iters), "Number of MIRA iterations to run (default 60)") + ("dense-init,d", po::value(&denseInitFile), "Weight file for dense features") + ("sparse-init,s", po::value(&sparseInitFile), "Weight file for sparse features") + ("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle") + ("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch") + ("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background"); + ; + + po::options_description cmdline_options; + cmdline_options.add(desc); + po::variables_map vm; + po::store(po::command_line_parser(argc,argv). + options(cmdline_options).run(), vm); + po::notify(vm); + if (help) { + cout << "Usage: " + string(argv[0]) + " [options]" << endl; + cout << desc << endl; + exit(0); + } + + if (vm.count("random-seed")) { + cerr << "Initialising random seed to " << seed << endl; + srand(seed); + } else { + cerr << "Initialising random seed from system clock" << endl; + srand(time(NULL)); + } + + // Initialize weights + /// + // Dense + vector initParams; + if(!denseInitFile.empty()) { + ifstream opt(denseInitFile.c_str()); + string buffer; istringstream strstrm(buffer); + if (opt.fail()) { + cerr << "could not open dense initfile: " << denseInitFile << endl; + exit(3); + } + parameter_t val; + getline(opt,buffer); + while(strstrm >> val) initParams.push_back(val); + opt.close(); + } + size_t initDenseSize = initParams.size(); + // Sparse + if(!sparseInitFile.empty()) { + if(initDenseSize==0) { + cerr << "sparse initialization requires dense initialization" << endl; + exit(3); + } + ifstream opt(sparseInitFile.c_str()); + if(opt.fail()) { + cerr << "could not open sparse initfile: " << sparseInitFile << endl; + exit(3); + } + int sparseCount=0; + parameter_t val; std::string name; + while(opt >> name >> val) { + size_t id = SparseVector::encode(name) + initDenseSize; + while(initParams.size()<=id) initParams.push_back(0.0); + initParams[id] = val; + sparseCount++; + } + cerr << "Found " << sparseCount << " initial sparse features" << endl; + opt.close(); + } + + MiraWeightVector wv(initParams); + + // Initialize background corpus + vector bg; + for(int j=0;j train; + if(streaming) + train.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles)); + else + train.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle)); + cerr << "Initial BLEU = " << evaluate(train.get(), wv.avg()) << endl; + ValType bestBleu = 0; + for(int j=0;jreset(); !train->finished(); train->next()) { + + // Hope / fear decode + size_t hope_index=0, fear_index=0, model_index=0; + ValType hope_score=0, fear_score=0, model_score=0; + for(size_t i=0; i< train->cur_size(); i++) { + MiraFeatureVector vec(train->featuresAt(i)); + ValType score = wv.score(vec); + ValType bleu = sentenceLevelBackgroundBleu(train->scoresAt(i),bg); + // Hope + if(i==0 || (score + bleu) > hope_score) { + hope_score = score + bleu; + hope_index = i; + } + // Fear + if(i==0 || (score - bleu) > fear_score) { + fear_score = score - bleu; + fear_index = i; + } + // Model + if(i==0 || score > model_score) { + model_score = score; + model_index = i; + } + iNumHyps++; + } + // Update weights + if(hope_index!=fear_index) { + // Vector difference + MiraFeatureVector hope(train->featuresAt(hope_index)); + MiraFeatureVector fear(train->featuresAt(fear_index)); + MiraFeatureVector diff = hope - fear; + // Bleu difference + const vector& hope_stats = train->scoresAt(hope_index); + ValType hopeBleu = sentenceLevelBackgroundBleu(hope_stats, bg); + const vector& fear_stats = train->scoresAt(fear_index); + ValType fearBleu = sentenceLevelBackgroundBleu(fear_stats, bg); + assert(hopeBleu > fearBleu); + ValType delta = hopeBleu - fearBleu; + // Loss and update + ValType diff_score = wv.score(diff); + ValType loss = delta - diff_score; + if(loss > 0) { + ValType eta = min(c, loss / diff.sqrNorm()); + wv.update(diff,eta); + totalLoss+=loss; + iNumUpdates++; + } + // Update BLEU statistics + const vector& model_stats = train->scoresAt(model_index); + for(size_t k=0;k1e-8) + *out << SparseVector::decode(i-num_dense) << " " << avg.weight(i) << endl; + } + } + outFile.close(); + bestBleu = bleu; + } + } + cerr << "Best BLEU = " << bestBleu << endl; +} +// --Emacs trickery-- +// Local Variables: +// mode:c++ +// c-basic-offset:2 +// End: diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 2abd5ef7c..a430aa520 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -117,6 +117,9 @@ my $___HISTORIC_INTERPOLATION = 0; # interpolate optimize weights with previous # TODO: Should we also add these values to options of this script? my $megam_default_options = "-fvals -maxi 30 -nobias binary"; +# Flags related to Batch MIRA (Cherry & Foster, 2012) +my $___BATCH_MIRA = 0; # flg to enable batch MIRA + my $__THREADS = 0; # Parameter for effective reference length when computing BLEU score @@ -206,6 +209,7 @@ GetOptions( "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER, "pro-starting-point" => \$___PRO_STARTING_POINT, "historic-interpolation=f" => \$___HISTORIC_INTERPOLATION, + "batch-mira" => \$___BATCH_MIRA, "threads=i" => \$__THREADS ) or exit(1); @@ -324,10 +328,12 @@ if (!defined $mertdir) { my $mert_extract_cmd = File::Spec->catfile($mertdir, "extractor"); my $mert_mert_cmd = File::Spec->catfile($mertdir, "mert"); my $mert_pro_cmd = File::Spec->catfile($mertdir, "pro"); +my $mert_mira_cmd = File::Spec->catfile($mertdir, "kbmira"); die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd; die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd; die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd; +die "Not executable: $mert_mira_cmd" if ! -x $mert_mira_cmd; my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt"); # or set to your installation @@ -727,6 +733,11 @@ while (1) { $scfiles = "$score_file"; } + my $mira_settings = ""; + $mira_settings .= " --dense-init run$run.$weights_in_file"; + if (-e "run$run.sparse-weights") { + $mira_settings .= " --sparse-init run$run.sparse-weights"; + } my $file_settings = " --ffile $ffiles --scfile $scfiles"; my $pro_file_settings = "--ffile " . join(" --ffile ", split(/,/, $ffiles)) . " --scfile " . join(" --scfile ", split(/,/, $scfiles)); @@ -759,6 +770,10 @@ while (1) { # ... and run mert $cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/; &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile); + } elsif ($___BATCH_MIRA) { # batch MIRA optimization + safesystem("echo 'not used' > $weights_out_file") or die; + $cmd = "$mert_mira_cmd $mira_settings $seed_settings $pro_file_settings -o $mert_outfile"; + &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile); } else { # just mert &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile); } @@ -906,7 +921,7 @@ chdir($cwd); sub get_weights_from_mert { my ($outfile, $logfile, $weight_count, $sparse_weights) = @_; my ($bestpoint, $devbleu); - if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)) { + if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/) || $___BATCH_MIRA) { open my $fh, '<', $outfile or die "Can't open $outfile: $!"; my (@WEIGHT, $sum); for (my $i = 0; $i < $weight_count; $i++) { push @WEIGHT, 0; } @@ -923,6 +938,14 @@ sub get_weights_from_mert { foreach (keys %{$sparse_weights}) { $$sparse_weights{$_} /= $sum; } $bestpoint = join(" ", @WEIGHT); close $fh; + if($___BATCH_MIRA) { + open my $fh2, '<', $logfile or die "Can't open $logfile: $!"; + while(<$fh2>) { + if(/Best BLEU = ([\-\d\.]+)/) { + $devbleu = $1; + } + } + } } else { open my $fh, '<', $logfile or die "Can't open $logfile: $!"; while (<$fh>) { From 2e370ed11b0cd8989118891dc4385619837dd39f Mon Sep 17 00:00:00 2001 From: phikoehn Date: Wed, 30 May 2012 00:58:18 +0100 Subject: [PATCH 32/38] more escaping in tokenizer; wrapper for berkeley parser (german) --- scripts/tokenizer/deescape-special-chars.perl | 18 +++---- scripts/tokenizer/detokenizer.perl | 18 +++---- scripts/tokenizer/escape-special-chars.perl | 14 +++--- scripts/tokenizer/tokenizer.perl | 14 +++--- .../wrappers/berkeleyparsed2mosesxml.perl | 36 ++++++++++++++ .../wrappers/mosesxml2berkeleyparsed.perl | 44 +++++++++++++++++ .../training/wrappers/parse-de-berkeley.perl | 48 +++++++++++++++++++ .../wrappers/syntax-hyphen-splitting.perl | 43 +++++++++++++++++ 8 files changed, 207 insertions(+), 28 deletions(-) create mode 100755 scripts/training/wrappers/berkeleyparsed2mosesxml.perl create mode 100755 scripts/training/wrappers/mosesxml2berkeleyparsed.perl create mode 100755 scripts/training/wrappers/parse-de-berkeley.perl create mode 100755 scripts/training/wrappers/syntax-hyphen-splitting.perl diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl index 55035ae6d..345555990 100755 --- a/scripts/tokenizer/deescape-special-chars.perl +++ b/scripts/tokenizer/deescape-special-chars.perl @@ -3,13 +3,15 @@ use strict; while() { - s/\&bar;/\|/g; - s/\</\/g; - s/\&bra;/\[/g; - s/\&ket;/\]/g; - s/\[/\[/g; - s/\]/\]/g; - s/\&/\&/g; + s/\&bar;/\|/g; # factor separator + s/\</\/g; # xml + s/\&bra;/\[/g; # syntax non-terminal (legacy) + s/\&ket;/\]/g; # syntax non-terminal (legacy) + s/\"/\"/g; # xml + s/\'/\'/g; # xml + s/\[/\[/g; # syntax non-terminal + s/\]/\]/g; # syntax non-terminal + s/\&/\&/g; # escape escape print $_; } diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl index e55a1a26e..8233b419c 100755 --- a/scripts/tokenizer/detokenizer.perl +++ b/scripts/tokenizer/detokenizer.perl @@ -66,14 +66,16 @@ sub detokenize { $text = " $text "; $text =~ s/ \@\-\@ /-/g; # de-escape special chars - $text =~ s/\&bar;/\|/g; - $text =~ s/\</\/g; - $text =~ s/\&bra;/\[/g; - $text =~ s/\&ket;/\]/g; - $text =~ s/\[/\[/g; - $text =~ s/\]/\]/g; - $text =~ s/\&/\&/g; + $text =~ s/\&bar;/\|/g; # factor separator + $text =~ s/\</\/g; # xml + $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy) + $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy) + $text =~ s/\"/\"/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal + $text =~ s/\&/\&/g; # escape escape my $word; my $i; diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl index f4c1b4dd5..5d9690c04 100755 --- a/scripts/tokenizer/escape-special-chars.perl +++ b/scripts/tokenizer/escape-special-chars.perl @@ -12,12 +12,14 @@ while() { s/ $//g; # special characters in moses - s/\&/\&/g; - s/\|/\&bar;/g; - s/\/\>/g; - s/\[/\[/g; - s/\]/\]/g; + s/\&/\&/g; # escape escape + s/\|/\&bar;/g; # factor separator + s/\/\>/g; # xml + s/\'/\'/g; # xml + s/\"/\"/g; # xml + s/\[/\[/g; # syntax non-terminal + s/\]/\]/g; # syntax non-terminal # restore xml instructions s/\<(\S+) translation="([^\"]+)"> (.+?) <\/(\S+)>/\<$1 translation=\"$2\"> $3 <\/$4>/g; diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index 70bb318f7..0cb713740 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -149,12 +149,14 @@ sub tokenize { $text =~ s/DOTMULTI/./g; #escape special chars - $text =~ s/\&/\&/g; - $text =~ s/\|/\&bar;/g; - $text =~ s/\/\>/g; - $text =~ s/\[/\[/g; - $text =~ s/\]/\]/g; + $text =~ s/\&/\&/g; # escape escape + $text =~ s/\|/\&bar;/g; # factor separator + $text =~ s/\/\>/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\"/\"/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal #ensure final line break $text .= "\n" unless $text =~ /\n$/; diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl new file mode 100755 index 000000000..6a4ed731e --- /dev/null +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl @@ -0,0 +1,36 @@ +#!/usr/bin/perl -w + +use strict; + +while() { + if (/^\(\(\)\)/) { + print "\n"; # parse failures + next; + } + + # prep + s/^\( /\(TOP /; + + # escape words + s/\&/\&/g; # escape escape + s/\|/\&bar;/g; # factor separator + s/\/\>/g; # xml + s/\'/\'/g; # xml + s/\"/\"/g; # xml + s/\[/\[/g; # syntax non-terminal + s/\]/\]/g; # syntax non-terminal + + # convert into tree + s/\((\S+) / /g; + s/\)/ <\/tree> /g; + s/\"\-LRB\-\"/\"LRB\"/g; # labels + s/\"\-RRB\-\"/\"RRB\"/g; + s/\-LRB\-/\(/g; # tokens + s/\-RRB\-/\)/g; + s/ +/ /g; + s/ $//g; + + # output, replace words with original + print $_; +} diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl new file mode 100755 index 000000000..ef6e66024 --- /dev/null +++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl @@ -0,0 +1,44 @@ +#!/usr/bin/perl -w + +use strict; + +#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) ) +#( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) ) + +while() { + if (/^$/) { + print "\n"; # parse failures + next; + } + + # parenheses + s/\(/\-LRB\-/g; # tokens + s/\)/\-RRB\-/g; + s/\"LRB\"/\"\-LRB\-\"/g; # labels + s/\"RRB\"/\"\-RRB\-\"/g; + + # main + s//\($1/g; + s/ *<\/tree>/\)/g; + s/^\(TOP/\(/; + + # de-escape + s/\&bar;/\|/g; # factor separator + s/\</\/g; # xml + s/\&bra;/\[/g; # syntax non-terminal (legacy) + s/\&ket;/\]/g; # syntax non-terminal (legacy) + s/\"/\"/g; # xml + s/\'/\'/g; # xml + s/\[/\[/g; # syntax non-terminal + s/\]/\]/g; # syntax non-terminal + s/\&/\&/g; # escape escape + + # cleanup + s/ +/ /g; + s/ $//g; + s/\)$/ \)/g; + + # output + print $_; +} diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl new file mode 100755 index 000000000..6482d11f3 --- /dev/null +++ b/scripts/training/wrappers/parse-de-berkeley.perl @@ -0,0 +1,48 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long "GetOptions"; +use FindBin qw($Bin); + +my ($JAR,$GRAMMAR,$SPLIT_HYPHEN,$MARK_SPLIT,$BINARIZE); + +die("ERROR: syntax is: parse-de-berkeley.perl [-split-hyphen] [-mark-split] [-binarize] -jar jar-file -gr grammar < in > out\n") + unless &GetOptions + ('jar=s' => \$JAR, + 'gr=s' => \$GRAMMAR, + 'split-hyphen' => \$SPLIT_HYPHEN, + 'mark-split' => \$MARK_SPLIT, + 'binarize' => \$BINARIZE) + && defined($JAR) && defined($GRAMMAR); + +die("ERROR: could not find jar file '$JAR'\n") unless -e $JAR; +die("ERROR: could not find grammar file '$GRAMMAR'\n") unless -e $GRAMMAR; + +$BINARIZE = $BINARIZE ? "-binarize" : ""; +$SPLIT_HYPHEN = $SPLIT_HYPHEN ? "| $Bin/syntax-hyphen-splitting.perl $BINARIZE" : ""; +$SPLIT_HYPHEN .= " -mark-split" if $SPLIT_HYPHEN && $MARK_SPLIT; + +my $tmp = "/tmp/parse-de-berkeley.$$"; + +open(TMP,"| $Bin/../../tokenizer/deescape-special-chars.perl > $tmp"); +while() { + # unsplit hyphens + s/ \@-\@ /-/g if $SPLIT_HYPHEN; + + # handle parentheses + s/\(/*LRB*/g; + s/\)/*RRB*/g; + + print TMP $_; +} +close(TMP); + +my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $Bin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN"; +print STDERR $cmd."\n"; + +open(PARSE,"$cmd|"); +while() { + print $_; +} +close(PARSE); +`rm $tmp`; diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl new file mode 100755 index 000000000..69290e51d --- /dev/null +++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl @@ -0,0 +1,43 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long "GetOptions"; + +my $MARK_HYP = 0; +my $BINARIZE = 0; + +die unless &GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP); + +while() { + chop; + my @OUT = (); + foreach (split) { + if (/^$/) { + push @OUT, $_; + } + elsif(/([\p{IsAlnum}])\-([\p{IsAlnum}])/) { + s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g; + my @WORD = split; + $OUT[$#OUT] =~ /label=\"([^\"]+)\"/; + my $pos = $1; + if ($MARK_HYP) { + $OUT[$#OUT] =~ s/label=\"/label=\"HYP-/; + } + if ($BINARIZE) { + for(my $i=0;$i"; + } + } + for(my $i=0;$i=2) { + push @OUT, ""; + } + push @OUT," $WORD[$i] "; + } + } + else { + push @OUT, $_; + } + } + print join(" ",@OUT)."\n"; +} From 2b20de8ea944f122e3e367db69f88d95cc83e393 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 30 May 2012 09:49:43 +0100 Subject: [PATCH 33/38] xcode build supports threads. Abort when using Adam's suffix arrays with threads --- .../moses-chart-cmd.xcodeproj/project.pbxproj | 14 ++++++++++++-- .../other-builds/moses.xcodeproj/project.pbxproj | 2 ++ .../RuleTable/PhraseDictionaryALSuffixArray.cpp | 9 +++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj b/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj index 897a4881b..82fe6607c 100644 --- a/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj +++ b/contrib/other-builds/moses-chart-cmd.xcodeproj/project.pbxproj @@ -307,6 +307,7 @@ LIBRARY_SEARCH_PATHS = ( ../../irstlm/lib, ../../srilm/lib/macosx, + /opt/local/lib, ); OTHER_LDFLAGS = ( "-lz", @@ -316,6 +317,7 @@ "-loolm", "-lflm", "-llattice", + "-lboost_thread-mt", ); PRODUCT_NAME = "moses-chart-cmd"; USER_HEADER_SEARCH_PATHS = "../../ ../../moses/src"; @@ -338,6 +340,7 @@ LIBRARY_SEARCH_PATHS = ( ../../irstlm/lib, ../../srilm/lib/macosx, + /opt/local/lib, ); OTHER_LDFLAGS = ( "-lz", @@ -347,6 +350,7 @@ "-loolm", "-lflm", "-llattice", + "-lboost_thread-mt", ); PRODUCT_NAME = "moses-chart-cmd"; USER_HEADER_SEARCH_PATHS = "../../ ../../moses/src"; @@ -359,7 +363,10 @@ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)"; GCC_C_LANGUAGE_STANDARD = gnu99; GCC_OPTIMIZATION_LEVEL = 0; - GCC_PREPROCESSOR_DEFINITIONS = TRACE_ENABLE; + GCC_PREPROCESSOR_DEFINITIONS = ( + TRACE_ENABLE, + WITH_THREADS, + ); GCC_WARN_ABOUT_RETURN_TYPE = YES; GCC_WARN_UNUSED_VARIABLE = YES; HEADER_SEARCH_PATHS = ( @@ -378,7 +385,10 @@ buildSettings = { ARCHS = "$(ARCHS_STANDARD_32_64_BIT)"; GCC_C_LANGUAGE_STANDARD = gnu99; - GCC_PREPROCESSOR_DEFINITIONS = TRACE_ENABLE; + GCC_PREPROCESSOR_DEFINITIONS = ( + TRACE_ENABLE, + WITH_THREADS, + ); GCC_WARN_ABOUT_RETURN_TYPE = YES; GCC_WARN_UNUSED_VARIABLE = YES; HEADER_SEARCH_PATHS = ( diff --git a/contrib/other-builds/moses.xcodeproj/project.pbxproj b/contrib/other-builds/moses.xcodeproj/project.pbxproj index b12427138..b870a74c9 100644 --- a/contrib/other-builds/moses.xcodeproj/project.pbxproj +++ b/contrib/other-builds/moses.xcodeproj/project.pbxproj @@ -1357,6 +1357,7 @@ LM_IRST, "_FILE_OFFSET_BITS=64", _LARGE_FILES, + WITH_THREADS, ); HEADER_SEARCH_PATHS = ( ../.., @@ -1399,6 +1400,7 @@ LM_IRST, "_FILE_OFFSET_BITS=64", _LARGE_FILES, + WITH_THREADS, ); HEADER_SEARCH_PATHS = ( ../.., diff --git a/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp b/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp index 93fc083e5..5a886d32d 100644 --- a/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp +++ b/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp @@ -13,6 +13,8 @@ #include "RuleTable/Loader.h" #include "RuleTable/LoaderFactory.h" #include "TypeDef.h" +#include "StaticData.h" +#include "UserMessage.h" using namespace std; @@ -27,6 +29,13 @@ bool PhraseDictionaryALSuffixArray::Load(const std::vector &input , const LMList &languageModels , const WordPenaltyProducer* wpProducer) { + const StaticData &staticData = StaticData::Instance(); + if (staticData.ThreadCount() > 1) + { + UserMessage::Add("Suffix array implementation is not threadsafe"); + return false; + } + // file path is the directory of the rules for eacg, NOT the file of all the rules SetFilePath(filePath); m_tableLimit = tableLimit; From 9f03125418abb4749b5a46154f4d1c3e19276600 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 30 May 2012 11:50:01 +0100 Subject: [PATCH 34/38] Minor rollback --- util/bit_packing.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/bit_packing.cc b/util/bit_packing.cc index b5a14008b..41999b726 100644 --- a/util/bit_packing.cc +++ b/util/bit_packing.cc @@ -10,7 +10,7 @@ template struct StaticCheck {}; template <> struct StaticCheck { typedef bool StaticAssertionPassed; }; // If your float isn't 4 bytes, we're hosed. -//typedef StaticCheck::StaticAssertionPassed FloatSize; +typedef StaticCheck::StaticAssertionPassed FloatSize; } // namespace From 45870348ff4f8860b9a9b35a4d20952023ead4d7 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 30 May 2012 12:47:20 +0100 Subject: [PATCH 35/38] xcode build supports threads. move 'using namespace' out from .h file to stop namespace pollution --- .../moses-cmd.xcodeproj/project.pbxproj | 16 ++++- mert/HypPackEnumerator.h | 27 ++++---- mert/MiraFeatureVector.h | 12 ++-- moses-cmd/src/LatticeMBR.cpp | 1 + moses-cmd/src/LatticeMBR.h | 66 +++++++++---------- 5 files changed, 67 insertions(+), 55 deletions(-) diff --git a/contrib/other-builds/moses-cmd.xcodeproj/project.pbxproj b/contrib/other-builds/moses-cmd.xcodeproj/project.pbxproj index 927961b2f..619ecf76c 100644 --- a/contrib/other-builds/moses-cmd.xcodeproj/project.pbxproj +++ b/contrib/other-builds/moses-cmd.xcodeproj/project.pbxproj @@ -311,6 +311,7 @@ LM_SRI, LM_IRST, TRACE_ENABLE, + WITH_THREADS, ); GCC_WARN_ABOUT_RETURN_TYPE = YES; GCC_WARN_UNUSED_VARIABLE = YES; @@ -324,6 +325,7 @@ LIBRARY_SEARCH_PATHS = ( ../../irstlm/lib, ../../srilm/lib/macosx, + /opt/local/lib, ); OTHER_LDFLAGS = ( "-lflm", @@ -332,6 +334,7 @@ "-ldstruct", "-lz", "-lirstlm", + "-lboost_thread-mt", ); PREBINDING = NO; PRODUCT_NAME = "moses-cmd"; @@ -348,9 +351,10 @@ GCC_MODEL_TUNING = G5; GCC_OPTIMIZATION_LEVEL = 3; GCC_PREPROCESSOR_DEFINITIONS = ( - LM_IRST, LM_SRI, + LM_IRST, TRACE_ENABLE, + WITH_THREADS, ); GCC_WARN_ABOUT_RETURN_TYPE = YES; GCC_WARN_UNUSED_VARIABLE = YES; @@ -364,6 +368,7 @@ LIBRARY_SEARCH_PATHS = ( ../../irstlm/lib, ../../srilm/lib/macosx, + /opt/local/lib, ); OTHER_LDFLAGS = ( "-lflm", @@ -372,6 +377,7 @@ "-ldstruct", "-lz", "-lirstlm", + "-lboost_thread-mt", ); PREBINDING = NO; PRODUCT_NAME = "moses-cmd"; @@ -384,6 +390,12 @@ buildSettings = { GCC_GENERATE_DEBUGGING_SYMBOLS = NO; GCC_MODEL_TUNING = G5; + GCC_PREPROCESSOR_DEFINITIONS = ( + LM_SRI, + LM_IRST, + TRACE_ENABLE, + WITH_THREADS, + ); GCC_WARN_ABOUT_RETURN_TYPE = YES; GCC_WARN_UNUSED_VARIABLE = YES; HEADER_SEARCH_PATHS = ( @@ -396,6 +408,7 @@ LIBRARY_SEARCH_PATHS = ( ../../irstlm/lib, ../../srilm/lib/macosx, + /opt/local/lib, ); OTHER_LDFLAGS = ( "-lflm", @@ -404,6 +417,7 @@ "-ldstruct", "-lz", "-lirstlm", + "-lboost_thread-mt", ); PREBINDING = NO; PRODUCT_NAME = "moses-cmd"; diff --git a/mert/HypPackEnumerator.h b/mert/HypPackEnumerator.h index 9f44c3372..07f12e91b 100644 --- a/mert/HypPackEnumerator.h +++ b/mert/HypPackEnumerator.h @@ -12,12 +12,11 @@ #include #include #include +#include #include "FeatureDataIterator.h" #include "ScoreDataIterator.h" -using namespace std; - // Start with these abstract classes class HypPackEnumerator { @@ -36,8 +35,8 @@ public: // Low-memory, low-speed, sequential access class StreamingHypPackEnumerator : public HypPackEnumerator { public: - StreamingHypPackEnumerator(vector const& featureFiles, - vector const& scoreFiles + StreamingHypPackEnumerator(std::vector const& featureFiles, + std::vector const& scoreFiles ); virtual size_t num_dense() const; @@ -54,14 +53,14 @@ private: void prime(); size_t m_num_lists; size_t m_sentenceId; - vector m_featureFiles; - vector m_scoreFiles; + std::vector m_featureFiles; + std::vector m_scoreFiles; bool m_primed; int m_iNumDense; - vector m_featureDataIters; - vector m_scoreDataIters; - vector > m_current_indexes; + std::vector m_featureDataIters; + std::vector m_scoreDataIters; + std::vector > m_current_indexes; }; // Instantiation that reads into memory @@ -69,8 +68,8 @@ private: // (Actually randomizes with each call to reset) class RandomAccessHypPackEnumerator : public HypPackEnumerator { public: - RandomAccessHypPackEnumerator(vector const& featureFiles, - vector const& scoreFiles, + RandomAccessHypPackEnumerator(std::vector const& featureFiles, + std::vector const& scoreFiles, bool no_shuffle); virtual size_t num_dense() const; @@ -87,9 +86,9 @@ private: bool m_no_shuffle; size_t m_cur_index; size_t m_num_dense; - vector m_indexes; - vector > m_features; - vector > m_scores; + std::vector m_indexes; + std::vector > m_features; + std::vector > m_scores; }; #endif // MERT_HYP_PACK_COLLECTION_H diff --git a/mert/MiraFeatureVector.h b/mert/MiraFeatureVector.h index 14336c56f..27a4510ad 100644 --- a/mert/MiraFeatureVector.h +++ b/mert/MiraFeatureVector.h @@ -24,9 +24,9 @@ class MiraFeatureVector { public: MiraFeatureVector(const FeatureDataItem& vec); MiraFeatureVector(const MiraFeatureVector& other); - MiraFeatureVector(const vector& dense, - const vector& sparseFeats, - const vector& sparseVals); + MiraFeatureVector(const std::vector& dense, + const std::vector& sparseFeats, + const std::vector& sparseVals); ValType val(size_t index) const; size_t feat(size_t index) const; @@ -37,9 +37,9 @@ public: const MiraFeatureVector& b); private: - vector m_dense; - vector m_sparseFeats; - vector m_sparseVals; + std::vector m_dense; + std::vector m_sparseFeats; + std::vector m_sparseVals; }; #endif // MERT_FEATURE_VECTOR_H diff --git a/moses-cmd/src/LatticeMBR.cpp b/moses-cmd/src/LatticeMBR.cpp index b579fb592..1b1ec8284 100644 --- a/moses-cmd/src/LatticeMBR.cpp +++ b/moses-cmd/src/LatticeMBR.cpp @@ -13,6 +13,7 @@ #include using namespace std; +using namespace Moses; size_t bleu_order = 4; float UNKNGRAMLOGPROB = -20; diff --git a/moses-cmd/src/LatticeMBR.h b/moses-cmd/src/LatticeMBR.h index 8b54e6c51..fa0379aee 100644 --- a/moses-cmd/src/LatticeMBR.h +++ b/moses-cmd/src/LatticeMBR.h @@ -17,35 +17,33 @@ #include "Manager.h" #include "TrellisPathList.h" -using namespace Moses; - class Edge; -typedef std::vector< const Hypothesis *> Lattice; +typedef std::vector< const Moses::Hypothesis *> Lattice; typedef std::vector Path; typedef std::map PathCounts; -typedef std::map NgramHistory; +typedef std::map NgramHistory; class Edge { - const Hypothesis* m_tailNode; - const Hypothesis* m_headNode; + const Moses::Hypothesis* m_tailNode; + const Moses::Hypothesis* m_headNode; float m_score; - TargetPhrase m_targetPhrase; + Moses::TargetPhrase m_targetPhrase; NgramHistory m_ngrams; public: - Edge(const Hypothesis* from, const Hypothesis* to, float score, const TargetPhrase& targetPhrase) : m_tailNode(from), m_headNode(to), m_score(score), m_targetPhrase(targetPhrase) { + Edge(const Moses::Hypothesis* from, const Moses::Hypothesis* to, float score, const Moses::TargetPhrase& targetPhrase) : m_tailNode(from), m_headNode(to), m_score(score), m_targetPhrase(targetPhrase) { //cout << "Creating new edge from Node " << from->GetId() << ", to Node : " << to->GetId() << ", score: " << score << " phrase: " << targetPhrase << endl; } - const Hypothesis* GetHeadNode() const { + const Moses::Hypothesis* GetHeadNode() const { return m_headNode; } - const Hypothesis* GetTailNode() const { + const Moses::Hypothesis* GetTailNode() const { return m_tailNode; } @@ -57,19 +55,19 @@ public: return m_targetPhrase.GetSize(); } - const Phrase& GetWords() const { + const Moses::Phrase& GetWords() const { return m_targetPhrase; } friend std::ostream& operator<< (std::ostream& out, const Edge& edge); - const NgramHistory& GetNgrams( std::map > & incomingEdges) ; + const NgramHistory& GetNgrams( std::map > & incomingEdges) ; bool operator < (const Edge & compare) const; - void GetPhraseSuffix(const Phrase& origPhrase, size_t lastN, Phrase& targetPhrase) const; + void GetPhraseSuffix(const Moses::Phrase& origPhrase, size_t lastN, Moses::Phrase& targetPhrase) const; - void storeNgramHistory(const Phrase& phrase, Path & path, size_t count = 1) { + void storeNgramHistory(const Moses::Phrase& phrase, Path & path, size_t count = 1) { m_ngrams[phrase][path]+= count; } @@ -84,16 +82,16 @@ public: NgramScores() {} /** logsum this score to the existing score */ - void addScore(const Hypothesis* node, const Phrase& ngram, float score); + void addScore(const Moses::Hypothesis* node, const Moses::Phrase& ngram, float score); /** Iterate through ngrams for selected node */ - typedef std::map::const_iterator NodeScoreIterator; - NodeScoreIterator nodeBegin(const Hypothesis* node); - NodeScoreIterator nodeEnd(const Hypothesis* node); + typedef std::map::const_iterator NodeScoreIterator; + NodeScoreIterator nodeBegin(const Moses::Hypothesis* node); + NodeScoreIterator nodeEnd(const Moses::Hypothesis* node); private: - std::set m_ngrams; - std::map > m_scores; + std::set m_ngrams; + std::map > m_scores; }; @@ -102,11 +100,11 @@ class LatticeMBRSolution { public: /** Read the words from the path */ - LatticeMBRSolution(const TrellisPath& path, bool isMap); + LatticeMBRSolution(const Moses::TrellisPath& path, bool isMap); const std::vector& GetNgramScores() const { return m_ngramScores; } - const std::vector& GetWords() const { + const std::vector& GetWords() const { return m_words; } float GetMapScore() const { @@ -117,10 +115,10 @@ public: } /** Initialise ngram scores */ - void CalcScore(std::map& finalNgramScores, const std::vector& thetas, float mapWeight); + void CalcScore(std::map& finalNgramScores, const std::vector& thetas, float mapWeight); private: - std::vector m_words; + std::vector m_words; float m_mapScore; std::vector m_ngramScores; float m_score; @@ -132,18 +130,18 @@ struct LatticeMBRSolutionComparator { } }; -void pruneLatticeFB(Lattice & connectedHyp, std::map < const Hypothesis*, std::set > & outgoingHyps, std::map >& incomingEdges, - const std::vector< float> & estimatedScores, const Hypothesis*, size_t edgeDensity,float scale); +void pruneLatticeFB(Lattice & connectedHyp, std::map < const Moses::Hypothesis*, std::set > & outgoingHyps, std::map >& incomingEdges, + const std::vector< float> & estimatedScores, const Moses::Hypothesis*, size_t edgeDensity,float scale); //Use the ngram scores to rerank the nbest list, return at most n solutions -void getLatticeMBRNBest(Manager& manager, TrellisPathList& nBestList, std::vector& solutions, size_t n); +void getLatticeMBRNBest(Moses::Manager& manager, Moses::TrellisPathList& nBestList, std::vector& solutions, size_t n); //calculate expectated ngram counts, clipping at 1 (ie calculating posteriors) if posteriors==true. -void calcNgramExpectations(Lattice & connectedHyp, std::map >& incomingEdges, std::map >& incomingEdges, std::map& finalNgramScores, bool posteriors); -void GetOutputFactors(const TrellisPath &path, std::vector &translation); -void extract_ngrams(const std::vector& sentence, std::map < Phrase, int > & allngrams); -bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b); -std::vector doLatticeMBR(Manager& manager, TrellisPathList& nBestList); -const TrellisPath doConsensusDecoding(Manager& manager, TrellisPathList& nBestList); -//std::vector doConsensusDecoding(Manager& manager, TrellisPathList& nBestList); +void GetOutputFactors(const Moses::TrellisPath &path, std::vector &translation); +void extract_ngrams(const std::vector& sentence, std::map < Moses::Phrase, int > & allngrams); +bool ascendingCoverageCmp(const Moses::Hypothesis* a, const Moses::Hypothesis* b); +std::vector doLatticeMBR(Moses::Manager& manager, Moses::TrellisPathList& nBestList); +const Moses::TrellisPath doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList); +//std::vector doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList); #endif From d25805858df34eb944f2c2db6b47c21d960136d8 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 30 May 2012 13:04:02 +0100 Subject: [PATCH 36/38] xcode build supports threads. move 'using namespace' out from .h file to stop namespace pollution --- mert/HypPackEnumerator.cpp | 2 ++ .../SentenceAlignmentWithSyntax.cpp | 2 ++ .../training/phrase-extract/hierarchical.h | 14 +++++----- scripts/training/phrase-extract/relax-parse.h | 4 +-- .../training/phrase-extract/tables-core.cpp | 2 ++ scripts/training/phrase-extract/tables-core.h | 26 +++++++++---------- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/mert/HypPackEnumerator.cpp b/mert/HypPackEnumerator.cpp index 9da627212..ffbf3cfb5 100644 --- a/mert/HypPackEnumerator.cpp +++ b/mert/HypPackEnumerator.cpp @@ -4,6 +4,8 @@ #include #include +using namespace std; + StreamingHypPackEnumerator::StreamingHypPackEnumerator ( vector const& featureFiles, diff --git a/scripts/training/phrase-extract/SentenceAlignmentWithSyntax.cpp b/scripts/training/phrase-extract/SentenceAlignmentWithSyntax.cpp index 39c95c221..06dc3919f 100644 --- a/scripts/training/phrase-extract/SentenceAlignmentWithSyntax.cpp +++ b/scripts/training/phrase-extract/SentenceAlignmentWithSyntax.cpp @@ -27,6 +27,8 @@ #include "XmlException.h" #include "XmlTree.h" +using namespace std; + bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID) { if (!m_options.targetSyntax) { diff --git a/scripts/training/phrase-extract/hierarchical.h b/scripts/training/phrase-extract/hierarchical.h index 40f6744ff..61c899013 100644 --- a/scripts/training/phrase-extract/hierarchical.h +++ b/scripts/training/phrase-extract/hierarchical.h @@ -14,22 +14,20 @@ #include #include -using namespace std; - // HPhraseVertex represents a point in the alignment matrix -typedef pair HPhraseVertex; +typedef std::pair HPhraseVertex; // Phrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix: // bottom-left and top-right -typedef pair HPhrase; +typedef std::pair HPhrase; -// HPhraseVector is a vector of phrases +// HPhraseVector is a std::vector of phrases // the bool value indicates if the associated phrase is within the length limit or not -typedef vector < HPhrase > HPhraseVector; +typedef std::vector < HPhrase > HPhraseVector; // SentenceVertices represents all vertices that have the same positioning of all extracted phrases -// The key of the map is the English index and the value is a set of the foreign ones -typedef map > HSenteceVertices; +// The key of the std::map is the English index and the value is a std::set of the foreign ones +typedef std::map > HSenteceVertices; #endif /* HIERARCHICAL_H_ */ diff --git a/scripts/training/phrase-extract/relax-parse.h b/scripts/training/phrase-extract/relax-parse.h index cdde3f16c..ae5994641 100644 --- a/scripts/training/phrase-extract/relax-parse.h +++ b/scripts/training/phrase-extract/relax-parse.h @@ -31,8 +31,6 @@ #include "SyntaxTree.h" #include "XmlTree.h" -using namespace std; - #define LINE_MAX_LENGTH 1000000 bool leftBinarizeFlag = false; @@ -41,7 +39,7 @@ char SAMTLevel = 0; // functions void init(int argc, char* argv[]); -void store( SyntaxTree &tree, vector &words ); +void store( SyntaxTree &tree, std::vector &words ); void LeftBinarize( SyntaxTree &tree, ParentNodes &parents ); void RightBinarize( SyntaxTree &tree, ParentNodes &parents ); void SAMT( SyntaxTree &tree, ParentNodes &parents ); diff --git a/scripts/training/phrase-extract/tables-core.cpp b/scripts/training/phrase-extract/tables-core.cpp index de50f5024..93ad8b6a1 100644 --- a/scripts/training/phrase-extract/tables-core.cpp +++ b/scripts/training/phrase-extract/tables-core.cpp @@ -5,6 +5,8 @@ #define TABLE_LINE_MAX_LENGTH 1000 #define UNKNOWNSTR "UNK" +using namespace std; + // as in beamdecoder/tables.cpp vector tokenize( const char* input ) { diff --git a/scripts/training/phrase-extract/tables-core.h b/scripts/training/phrase-extract/tables-core.h index 2db8086e5..1899b4d77 100644 --- a/scripts/training/phrase-extract/tables-core.h +++ b/scripts/training/phrase-extract/tables-core.h @@ -12,18 +12,16 @@ #include #include -using namespace std; +extern std::vector tokenize( const char*); -extern vector tokenize( const char*); - -typedef string WORD; +typedef std::string WORD; typedef unsigned int WORD_ID; class Vocabulary { public: - map lookup; - vector< WORD > vocab; + std::map lookup; + std::vector< WORD > vocab; WORD_ID storeIfNew( const WORD& ); WORD_ID getWordID( const WORD& ); inline WORD &getWord( WORD_ID id ) { @@ -31,14 +29,14 @@ public: } }; -typedef vector< WORD_ID > PHRASE; +typedef std::vector< WORD_ID > PHRASE; typedef unsigned int PHRASE_ID; class PhraseTable { public: - map< PHRASE, PHRASE_ID > lookup; - vector< PHRASE > phraseTable; + std::map< PHRASE, PHRASE_ID > lookup; + std::vector< PHRASE > phraseTable; PHRASE_ID storeIfNew( const PHRASE& ); PHRASE_ID getPhraseID( const PHRASE& ); void clear(); @@ -47,21 +45,21 @@ public: } }; -typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC; +typedef std::vector< std::pair< PHRASE_ID, double > > PHRASEPROBVEC; class TTable { public: - map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable; - map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti; + std::map< PHRASE_ID, std::vector< std::pair< PHRASE_ID, double > > > ttable; + std::map< PHRASE_ID, std::vector< std::pair< PHRASE_ID, std::vector< double > > > > ttableMulti; }; class DTable { public: - map< int, double > dtable; + std::map< int, double > dtable; void init(); - void load( const string& ); + void load( const std::string& ); double get( int ); }; From 01eb60f35031157b5a780e539473dfd88a7714d1 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 30 May 2012 22:59:23 +0900 Subject: [PATCH 37/38] Add "virtual" destructor to the HypPackEnumerator class. --- mert/HypPackEnumerator.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mert/HypPackEnumerator.h b/mert/HypPackEnumerator.h index 07f12e91b..5d2a230a5 100644 --- a/mert/HypPackEnumerator.h +++ b/mert/HypPackEnumerator.h @@ -21,6 +21,8 @@ class HypPackEnumerator { public: + virtual ~HypPackEnumerator() {} + virtual void reset() = 0; virtual bool finished() = 0; virtual void next() = 0; From beb2256dbaf420ed525cc8354617ead0db315060 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 30 May 2012 23:11:09 +0900 Subject: [PATCH 38/38] Move 'using namespace std' out from .h. Add "std" to size_t, too. --- mert/HypPackEnumerator.h | 45 +++++++++++++++++++------------------- mert/MiraFeatureVector.cpp | 6 +++-- mert/MiraFeatureVector.h | 18 +++++++-------- mert/MiraWeightVector.cpp | 2 ++ mert/MiraWeightVector.h | 26 ++++++++++------------ 5 files changed, 48 insertions(+), 49 deletions(-) diff --git a/mert/HypPackEnumerator.h b/mert/HypPackEnumerator.h index 5d2a230a5..d878c2625 100644 --- a/mert/HypPackEnumerator.h +++ b/mert/HypPackEnumerator.h @@ -27,10 +27,10 @@ public: virtual bool finished() = 0; virtual void next() = 0; - virtual size_t cur_size() = 0; - virtual size_t num_dense() const = 0; - virtual const FeatureDataItem& featuresAt(size_t i) = 0; - virtual const ScoreDataItem& scoresAt(size_t i) = 0; + virtual std::size_t cur_size() = 0; + virtual std::size_t num_dense() const = 0; + virtual const FeatureDataItem& featuresAt(std::size_t i) = 0; + virtual const ScoreDataItem& scoresAt(std::size_t i) = 0; }; // Instantiation that streams from disk @@ -38,23 +38,22 @@ public: class StreamingHypPackEnumerator : public HypPackEnumerator { public: StreamingHypPackEnumerator(std::vector const& featureFiles, - std::vector const& scoreFiles - ); + std::vector const& scoreFiles); + + virtual std::size_t num_dense() const; - virtual size_t num_dense() const; - virtual void reset(); virtual bool finished(); virtual void next(); - virtual size_t cur_size(); - virtual const FeatureDataItem& featuresAt(size_t i); - virtual const ScoreDataItem& scoresAt(size_t i); - + virtual std::size_t cur_size(); + virtual const FeatureDataItem& featuresAt(std::size_t i); + virtual const ScoreDataItem& scoresAt(std::size_t i); + private: void prime(); - size_t m_num_lists; - size_t m_sentenceId; + std::size_t m_num_lists; + std::size_t m_sentenceId; std::vector m_featureFiles; std::vector m_scoreFiles; @@ -62,7 +61,7 @@ private: int m_iNumDense; std::vector m_featureDataIters; std::vector m_scoreDataIters; - std::vector > m_current_indexes; + std::vector > m_current_indexes; }; // Instantiation that reads into memory @@ -74,21 +73,21 @@ public: std::vector const& scoreFiles, bool no_shuffle); - virtual size_t num_dense() const; - + virtual std::size_t num_dense() const; + virtual void reset(); virtual bool finished(); virtual void next(); - virtual size_t cur_size(); - virtual const FeatureDataItem& featuresAt(size_t i); - virtual const ScoreDataItem& scoresAt(size_t i); + virtual std::size_t cur_size(); + virtual const FeatureDataItem& featuresAt(std::size_t i); + virtual const ScoreDataItem& scoresAt(std::size_t i); private: bool m_no_shuffle; - size_t m_cur_index; - size_t m_num_dense; - std::vector m_indexes; + std::size_t m_cur_index; + std::size_t m_num_dense; + std::vector m_indexes; std::vector > m_features; std::vector > m_scores; }; diff --git a/mert/MiraFeatureVector.cpp b/mert/MiraFeatureVector.cpp index 9636b2fcd..b72d29595 100644 --- a/mert/MiraFeatureVector.cpp +++ b/mert/MiraFeatureVector.cpp @@ -2,6 +2,8 @@ #include "MiraFeatureVector.h" +using namespace std; + MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec) : m_dense(vec.dense) { @@ -97,7 +99,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector& vector sparseVals; vector sparseFeats; while(i < a.m_sparseFeats.size() && j < b.m_sparseFeats.size()) { - + if(a.m_sparseFeats[i] < b.m_sparseFeats[j]) { sparseFeats.push_back(a.m_sparseFeats[i]); sparseVals.push_back(a.m_sparseVals[i]); @@ -136,7 +138,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector& // Create and return vector return MiraFeatureVector(dense,sparseFeats,sparseVals); } - + // --Emacs trickery-- // Local Variables: // mode:c++ diff --git a/mert/MiraFeatureVector.h b/mert/MiraFeatureVector.h index 27a4510ad..31dd025c3 100644 --- a/mert/MiraFeatureVector.h +++ b/mert/MiraFeatureVector.h @@ -16,8 +16,6 @@ #include "FeatureDataIterator.h" -using namespace std; - typedef FeatureStatsType ValType; class MiraFeatureVector { @@ -25,20 +23,20 @@ public: MiraFeatureVector(const FeatureDataItem& vec); MiraFeatureVector(const MiraFeatureVector& other); MiraFeatureVector(const std::vector& dense, - const std::vector& sparseFeats, + const std::vector& sparseFeats, const std::vector& sparseVals); - - ValType val(size_t index) const; - size_t feat(size_t index) const; - size_t size() const; + + ValType val(std::size_t index) const; + std::size_t feat(std::size_t index) const; + std::size_t size() const; ValType sqrNorm() const; - + friend MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector& b); - + private: std::vector m_dense; - std::vector m_sparseFeats; + std::vector m_sparseFeats; std::vector m_sparseVals; }; diff --git a/mert/MiraWeightVector.cpp b/mert/MiraWeightVector.cpp index 8b46044fa..7e17a2714 100644 --- a/mert/MiraWeightVector.cpp +++ b/mert/MiraWeightVector.cpp @@ -1,5 +1,7 @@ #include "MiraWeightVector.h" +using namespace std; + /** * Constructor, initializes to the zero vector */ diff --git a/mert/MiraWeightVector.h b/mert/MiraWeightVector.h index 375858634..65b374625 100644 --- a/mert/MiraWeightVector.h +++ b/mert/MiraWeightVector.h @@ -4,7 +4,7 @@ * * A self-averaging weight-vector. Good for * perceptron learning as well. - * + * */ #ifndef MERT_MIRA_WEIGHT_VECTOR_H @@ -14,8 +14,6 @@ #include "MiraFeatureVector.h" -using namespace std; - class AvgWeightVector; class MiraWeightVector { @@ -29,7 +27,7 @@ public: * Constructor with provided initial vector * \param init Initial feature values */ - MiraWeightVector(const vector& init); + MiraWeightVector(const std::vector& init); /** * Update a the model @@ -60,12 +58,12 @@ public: AvgWeightVector avg(); friend class AvgWeightVector; - + private: /** * Updates a weight and lazily updates its total */ - void update(size_t index, ValType delta); + void update(std::size_t index, ValType delta); /** * Make sure everyone's total is up-to-date @@ -75,12 +73,12 @@ private: /** * Helper to handle out-of-range weights */ - ValType weight(size_t index) const; - - vector m_weights; - vector m_totals; - vector m_lastUpdated; - size_t m_numUpdates; + ValType weight(std::size_t index) const; + + std::vector m_weights; + std::vector m_totals; + std::vector m_lastUpdated; + std::size_t m_numUpdates; }; /** @@ -90,8 +88,8 @@ class AvgWeightVector { public: AvgWeightVector(const MiraWeightVector& wv); ValType score(const MiraFeatureVector& fv) const; - ValType weight(size_t index) const; - size_t size() const; + ValType weight(std::size_t index) const; + std::size_t size() const; private: const MiraWeightVector& m_wv; };