From 3740c9f24859620e68b3e77a5aa0084f9b2d18be Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Sun, 21 Sep 2014 06:02:35 +0100 Subject: [PATCH] bug fix mmsapt training --- scripts/training/build-mmsapt.perl | 9 +++++---- scripts/training/train-model.perl | 15 +++++++++------ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl index 2135c12c9..00a56977e 100755 --- a/scripts/training/build-mmsapt.perl +++ b/scripts/training/build-mmsapt.perl @@ -2,6 +2,7 @@ use strict; use Getopt::Long "GetOptions"; +use FindBin qw($RealBin); my ($DIR,$F,$E,$ALIGNMENT,$CORPUS,$SETTINGS); die("ERROR: syntax is --alignment FILE --corpus FILESTEM --f EXT --e EXT --DIR OUTDIR --settings STRING") @@ -15,8 +16,8 @@ die("ERROR: syntax is --alignment FILE --corpus FILESTEM --f EXT --e EXT --DIR O && -e $ALIGNMENT && -e "$CORPUS.$F" && -e "$CORPUS.$E"; `mkdir $DIR`; -`/opt/moses/bin/mtt-build < $CORPUS.$F -i -o $DIR/$F`; -`/opt/moses/bin/mtt-build < $CORPUS.$E -i -o $DIR/$E`; -`/opt/moses/bin/symal2mam < $ALIGNMENT $DIR/$F-$E.mam`; -`/opt/moses/bin/mmlex-build $DIR/ $F $E -o $DIR/$F-$E.lex -c $DIR/$F-$E.cooc`; +`$RealBin/../../bin/mtt-build < $CORPUS.$F -i -o $DIR/$F`; +`$RealBin/../../bin/mtt-build < $CORPUS.$E -i -o $DIR/$E`; +`$RealBin/../../bin/symal2mam < $ALIGNMENT $DIR/$F-$E.mam`; +`$RealBin/../../bin/mmlex-build $DIR/ $F $E -o $DIR/$F-$E.lex -c $DIR/$F-$E.cooc`; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 8f661b812..fd6e8647e 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -22,7 +22,7 @@ $SCRIPTS_ROOTDIR =~ s/\/training$//; #$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"}); my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_SORT_PARALLEL, $_CORPUS, - $_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH, + $_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH, $_DISTORTION_LIMIT, $_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_LEXICAL_COUNTS, $_VERBOSE, $_ALIGNMENT, $_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS, $_DIRECTION, $_ONLY_PRINT_GIZA, $_GIZA_EXTENSION, $_REORDERING, @@ -54,6 +54,7 @@ $_HELP = 1 'giza-e2f=s' => \$_GIZA_E2F, 'giza-f2e=s' => \$_GIZA_F2E, 'max-phrase-length=s' => \$_MAX_PHRASE_LENGTH, + 'distortion-limit=s' => \$_DISTORTION_LIMIT, 'lexical-file=s' => \$_LEXICAL_FILE, 'no-lexical-weighting' => \$_NO_LEXICAL_WEIGHTING, 'write-lexical-counts' => \$_LEXICAL_COUNTS, @@ -440,11 +441,14 @@ $___CONTINUE = $_CONTINUE if $_CONTINUE; my $___MAX_PHRASE_LENGTH = "7"; $___MAX_PHRASE_LENGTH = "10" if $_HIERARCHICAL; +$___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH; + +my $___DISTORTION_LIMIT = 6; +$___DISTORTION_LIMIT = $_DISTORTION_LIMIT if $_DISTORTION_LIMIT; my $___LEXICAL_WEIGHTING = 1; my $___LEXICAL_COUNTS = 0; my $___LEXICAL_FILE = $___MODEL_DIR."/lex"; -$___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH; $___LEXICAL_WEIGHTING = 0 if $_NO_LEXICAL_WEIGHTING; $___LEXICAL_COUNTS = 1 if $_LEXICAL_COUNTS; $___LEXICAL_FILE = $_LEXICAL_FILE if $_LEXICAL_FILE; @@ -1972,7 +1976,7 @@ sub create_ini { $phrase_table_impl_name = "PhraseDictionaryOnDisk" if $phrase_table_impl==2; $phrase_table_impl_name = "PhraseDictionaryMemory" if $phrase_table_impl==6; $phrase_table_impl_name = "PhraseDictionaryALSuffixArray" if $phrase_table_impl==10; - $phrase_table_impl_name = "Mmsapt" if $phrase_table_impl==11; + $phrase_table_impl_name = "PhraseDictionaryBitextSampling" if $phrase_table_impl==11; $file .= "/" if $phrase_table_impl==11 && $file !~ /\/$/; # table limit (maximum number of translation options per input phrase) @@ -1982,9 +1986,8 @@ sub create_ini { } # sum up... - $feature_spec .= "$phrase_table_impl_name name=TranslationModel$i num-features=$basic_weight_count ".($phrase_table_impl==11?"base":"path")."=$file input-factor=$input_factor output-factor=$output_factor"; + $feature_spec .= "$phrase_table_impl_name name=TranslationModel$i num-features=$basic_weight_count path=$file input-factor=$input_factor output-factor=$output_factor"; $feature_spec .= " L1=$___F L2=$___E ".$_MMSAPT if defined($_MMSAPT); # extra settings for memory mapped suffix array phrase table - $feature_spec .= " table-limit=$table_limit" unless defined($_MMSAPT); $feature_spec .= "\n"; $weight_spec .= "TranslationModel$i="; for(my $j=0;$j<$basic_weight_count;$j++) { $weight_spec .= " 0.2"; } @@ -2138,7 +2141,7 @@ sub create_ini { } # phrase-based model settings else { - print INI "[distortion-limit]\n6\n"; + print INI "[distortion-limit]\n$___DISTORTION_LIMIT\n"; } # only set the factor delimiter if it is non-standard