bug fix mmsapt training

This commit is contained in:
Philipp Koehn 2014-09-21 06:02:35 +01:00
parent a8659d1399
commit 3740c9f248
2 changed files with 14 additions and 10 deletions

View File

@@ -2,6 +2,7 @@
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
my ($DIR,$F,$E,$ALIGNMENT,$CORPUS,$SETTINGS);
die("ERROR: syntax is --alignment FILE --corpus FILESTEM --f EXT --e EXT --DIR OUTDIR --settings STRING")
@@ -15,8 +16,8 @@ die("ERROR: syntax is --alignment FILE --corpus FILESTEM --f EXT --e EXT --DIR O
&& -e $ALIGNMENT && -e "$CORPUS.$F" && -e "$CORPUS.$E";
`mkdir $DIR`;
`/opt/moses/bin/mtt-build < $CORPUS.$F -i -o $DIR/$F`;
`/opt/moses/bin/mtt-build < $CORPUS.$E -i -o $DIR/$E`;
`/opt/moses/bin/symal2mam < $ALIGNMENT $DIR/$F-$E.mam`;
`/opt/moses/bin/mmlex-build $DIR/ $F $E -o $DIR/$F-$E.lex -c $DIR/$F-$E.cooc`;
`$RealBin/../../bin/mtt-build < $CORPUS.$F -i -o $DIR/$F`;
`$RealBin/../../bin/mtt-build < $CORPUS.$E -i -o $DIR/$E`;
`$RealBin/../../bin/symal2mam < $ALIGNMENT $DIR/$F-$E.mam`;
`$RealBin/../../bin/mmlex-build $DIR/ $F $E -o $DIR/$F-$E.lex -c $DIR/$F-$E.cooc`;

View File

@@ -22,7 +22,7 @@ $SCRIPTS_ROOTDIR =~ s/\/training$//;
#$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_SORT_PARALLEL, $_CORPUS,
$_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH,
$_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH, $_DISTORTION_LIMIT,
$_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_LEXICAL_COUNTS, $_VERBOSE, $_ALIGNMENT,
$_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS,
$_DIRECTION, $_ONLY_PRINT_GIZA, $_GIZA_EXTENSION, $_REORDERING,
@@ -54,6 +54,7 @@ $_HELP = 1
'giza-e2f=s' => \$_GIZA_E2F,
'giza-f2e=s' => \$_GIZA_F2E,
'max-phrase-length=s' => \$_MAX_PHRASE_LENGTH,
'distortion-limit=s' => \$_DISTORTION_LIMIT,
'lexical-file=s' => \$_LEXICAL_FILE,
'no-lexical-weighting' => \$_NO_LEXICAL_WEIGHTING,
'write-lexical-counts' => \$_LEXICAL_COUNTS,
@@ -440,11 +441,14 @@ $___CONTINUE = $_CONTINUE if $_CONTINUE;
my $___MAX_PHRASE_LENGTH = "7";
$___MAX_PHRASE_LENGTH = "10" if $_HIERARCHICAL;
$___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH;
my $___DISTORTION_LIMIT = 6;
$___DISTORTION_LIMIT = $_DISTORTION_LIMIT if $_DISTORTION_LIMIT;
my $___LEXICAL_WEIGHTING = 1;
my $___LEXICAL_COUNTS = 0;
my $___LEXICAL_FILE = $___MODEL_DIR."/lex";
$___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH;
$___LEXICAL_WEIGHTING = 0 if $_NO_LEXICAL_WEIGHTING;
$___LEXICAL_COUNTS = 1 if $_LEXICAL_COUNTS;
$___LEXICAL_FILE = $_LEXICAL_FILE if $_LEXICAL_FILE;
@@ -1972,7 +1976,7 @@ sub create_ini {
$phrase_table_impl_name = "PhraseDictionaryOnDisk" if $phrase_table_impl==2;
$phrase_table_impl_name = "PhraseDictionaryMemory" if $phrase_table_impl==6;
$phrase_table_impl_name = "PhraseDictionaryALSuffixArray" if $phrase_table_impl==10;
$phrase_table_impl_name = "Mmsapt" if $phrase_table_impl==11;
$phrase_table_impl_name = "PhraseDictionaryBitextSampling" if $phrase_table_impl==11;
$file .= "/" if $phrase_table_impl==11 && $file !~ /\/$/;
# table limit (maximum number of translation options per input phrase)
@@ -1982,9 +1986,8 @@ sub create_ini {
}
# sum up...
$feature_spec .= "$phrase_table_impl_name name=TranslationModel$i num-features=$basic_weight_count ".($phrase_table_impl==11?"base":"path")."=$file input-factor=$input_factor output-factor=$output_factor";
$feature_spec .= "$phrase_table_impl_name name=TranslationModel$i num-features=$basic_weight_count path=$file input-factor=$input_factor output-factor=$output_factor";
$feature_spec .= " L1=$___F L2=$___E ".$_MMSAPT if defined($_MMSAPT); # extra settings for memory mapped suffix array phrase table
$feature_spec .= " table-limit=$table_limit" unless defined($_MMSAPT);
$feature_spec .= "\n";
$weight_spec .= "TranslationModel$i=";
for(my $j=0;$j<$basic_weight_count;$j++) { $weight_spec .= " 0.2"; }
@@ -2138,7 +2141,7 @@ sub create_ini {
}
# phrase-based model settings
else {
print INI "[distortion-limit]\n6\n";
print INI "[distortion-limit]\n$___DISTORTION_LIMIT\n";
}
# only set the factor delimiter if it is non-standard