mosesdecoder/scripts/training/train-model.perl
2014-06-06 17:25:26 +01:00

2211 lines
81 KiB
Perl
Executable File

#!/usr/bin/perl -w
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
use File::Spec::Functions;
use File::Spec::Unix;
use File::Basename;
BEGIN { require "$RealBin/LexicalTranslationModel.pm"; "LexicalTranslationModel"->import; }
# Train Factored Phrase Model
# (c) 2006-2009 Philipp Koehn
# with contributions from other JHU WS participants
# Train a model from a parallel corpus
# -----------------------------------------------------
$ENV{"LC_ALL"} = "C";
my $SCRIPTS_ROOTDIR = $RealBin;
if ($SCRIPTS_ROOTDIR eq '') {
$SCRIPTS_ROOTDIR = dirname(__FILE__);
}
$SCRIPTS_ROOTDIR =~ s/\/training$//;
#$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_SORT_PARALLEL, $_CORPUS,
$_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH,
$_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_LEXICAL_COUNTS, $_VERBOSE, $_ALIGNMENT,
$_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS,
$_DIRECTION, $_ONLY_PRINT_GIZA, $_GIZA_EXTENSION, $_REORDERING,
$_REORDERING_SMOOTH, $_INPUT_FACTOR_MAX, $_ALIGNMENT_FACTORS,
$_TRANSLATION_FACTORS, $_REORDERING_FACTORS, $_GENERATION_FACTORS,
$_DECODING_GRAPH_BACKOFF,
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
@_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
@_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
$_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $_NUM_LATTICE_FEATURES, $IGNORE, $_FLEXIBILITY_SCORE, $_EXTRACT_COMMAND);
my $_BASELINE_CORPUS = "";
my $_CORES = 1;
my $debug = 0; # debug this script, do not delete any files in debug mode
$_HELP = 1
unless &GetOptions('root-dir=s' => \$_ROOT_DIR,
'external-bin-dir=s' => \$_EXTERNAL_BINDIR,
'corpus-dir=s' => \$_CORPUS_DIR,
'corpus=s' => \$_CORPUS,
'f=s' => \$_F,
'e=s' => \$_E,
'giza-e2f=s' => \$_GIZA_E2F,
'giza-f2e=s' => \$_GIZA_F2E,
'max-phrase-length=s' => \$_MAX_PHRASE_LENGTH,
'lexical-file=s' => \$_LEXICAL_FILE,
'no-lexical-weighting' => \$_NO_LEXICAL_WEIGHTING,
'write-lexical-counts' => \$_LEXICAL_COUNTS,
'model-dir=s' => \$_MODEL_DIR,
'temp-dir=s' => \$_TEMP_DIR,
'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
'sort-batch-size=i' => \$_SORT_BATCH_SIZE,
'sort-compress=s' => \$_SORT_COMPRESS,
'sort-parallel=i' => \$_SORT_PARALLEL,
'extract-file=s' => \$_EXTRACT_FILE,
'alignment=s' => \$_ALIGNMENT,
'alignment-file=s' => \$_ALIGNMENT_FILE,
'alignment-stem=s' => \$_ALIGNMENT_STEM,
'verbose' => \$_VERBOSE,
'first-step=i' => \$_FIRST_STEP,
'last-step=i' => \$_LAST_STEP,
'giza-option=s' => \$_GIZA_OPTION,
'giza-extension=s' => \$_GIZA_EXTENSION,
'parallel' => \$_PARALLEL,
'lm=s' => \@_LM,
'help' => \$_HELP,
'mgiza' => \$_MGIZA, # multi-thread
'mgiza-cpus=i' => \$_MGIZA_CPUS, # multi-thread
'snt2cooc=s' => \$_SNT2COOC, # override snt2cooc exe. For when you want to run reduced memory snt2cooc.perl from mgiza
'hmm-align' => \$_HMM_ALIGN,
'final-alignment-model=s' => \$_FINAL_ALIGNMENT_MODEL, # use word alignment model 1/2/hmm/3/4/5 as final (default is 4); value 'hmm' equivalent to the --hmm-align switch
'debug' => \$debug,
'dont-zip' => \$_DONT_ZIP,
'parts=i' => \$_PARTS,
'direction=i' => \$_DIRECTION,
'only-print-giza' => \$_ONLY_PRINT_GIZA,
'reordering=s' => \$_REORDERING,
'reordering-smooth=s' => \$_REORDERING_SMOOTH,
'input-factor-max=i' => \$_INPUT_FACTOR_MAX,
'alignment-factors=s' => \$_ALIGNMENT_FACTORS,
'translation-factors=s' => \$_TRANSLATION_FACTORS,
'reordering-factors=s' => \$_REORDERING_FACTORS,
'generation-factors=s' => \$_GENERATION_FACTORS,
'decoding-steps=s' => \$_DECODING_STEPS,
'decoding-graph-backoff=s' => \$_DECODING_GRAPH_BACKOFF,
'bin-dir=s' => \$IGNORE,
'scripts-root-dir=s' => \$IGNORE,
'factor-delimiter=s' => \$_FACTOR_DELIMITER,
'phrase-translation-table=s' => \@_PHRASE_TABLE,
'generation-corpus=s' => \$_GENERATION_CORPUS,
'generation-table=s' => \@_GENERATION_TABLE,
'reordering-table=s' => \@_REORDERING_TABLE,
'generation-type=s' => \@_GENERATION_TYPE,
'continue' => \$_CONTINUE,
'hierarchical' => \$_HIERARCHICAL,
'glue-grammar' => \$_GLUE_GRAMMAR,
'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE,
'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE,
'unknown-word-soft-matches-file=s' => \$_UNKNOWN_WORD_SOFT_MATCHES_FILE, # give dummy label to unknown word, and allow soft matches to all other labels (with cost determined by sparse features)
'ghkm' => \$_GHKM,
'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
'pcfg' => \$_PCFG,
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
'extract-options=s' => \@_EXTRACT_OPTIONS,
'score-options=s' => \@_SCORE_OPTIONS,
'source-syntax' => \$_SOURCE_SYNTAX,
'target-syntax' => \$_TARGET_SYNTAX,
'xml' => \$_XML,
'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
'config=s' => \$_CONFIG,
'osm-model=s' => \$_OSM,
'osm-setting=s' => \$_OSM_FACTORS,
'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT,
'transliteration-phrase-table=s' => \$_TRANSLITERATION_PHRASE_TABLE,
'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
'do-steps=s' => \$_DO_STEPS,
'memscore:s' => \$_MEMSCORE,
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
'dictionary=s' => \$_DICTIONARY,
'sparse-phrase-features' => \$_SPARSE_PHRASE_FEATURES,
'eppex:s' => \$_EPPEX,
'additional-ini=s' => \@_ADDITIONAL_INI,
'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE,
'baseline-alignment-model=s{8}' => \@_BASELINE_ALIGNMENT_MODEL,
'baseline-extract=s' => \$_BASELINE_EXTRACT,
'baseline-corpus=s' => \$_BASELINE_CORPUS,
'baseline-alignment=s' => \$_BASELINE_ALIGNMENT,
'cores=i' => \$_CORES,
'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES,
'flexibility-score' => \$_FLEXIBILITY_SCORE,
'extract-command=s' => \$_EXTRACT_COMMAND,
);
if ($_HELP) {
print "Train Phrase Model
Steps: (--first-step to --last-step)
(1) prepare corpus
(2) run GIZA
(3) align words
(4) learn lexical translation
(5) extract phrases
(6) score phrases
(7) learn reordering model
(8) learn generation model
(9) create decoder config file
For more, please check manual or contact koehn\@inf.ed.ac.uk\n";
exit(1);
}
if (defined($IGNORE)) {
print STDERR "WARNING: Do not specify -bin-dir or -scripts-root-dir anymore. These variable are ignored and will be deleted soon";
}
# convert all paths to absolute paths
$_ROOT_DIR = File::Spec->rel2abs($_ROOT_DIR) if defined($_ROOT_DIR);
$_EXTERNAL_BINDIR = File::Spec->rel2abs($_EXTERNAL_BINDIR) if defined($_EXTERNAL_BINDIR);
$_CORPUS_DIR = File::Spec->rel2abs($_CORPUS_DIR) if defined($_CORPUS_DIR);
$_CORPUS = File::Spec->rel2abs($_CORPUS) if defined($_CORPUS);
$_LEXICAL_FILE = File::Spec->rel2abs($_LEXICAL_FILE) if defined($_LEXICAL_FILE);
$_MODEL_DIR = File::Spec->rel2abs($_MODEL_DIR) if defined($_MODEL_DIR);
$_TEMP_DIR = File::Spec->rel2abs($_TEMP_DIR) if defined($_TEMP_DIR);
$_ALIGNMENT_FILE = File::Spec->rel2abs($_ALIGNMENT_FILE) if defined($_ALIGNMENT_FILE);
$_ALIGNMENT_STEM = File::Spec->rel2abs($_ALIGNMENT_STEM) if defined($_ALIGNMENT_STEM);
$_GLUE_GRAMMAR_FILE = File::Spec->rel2abs($_GLUE_GRAMMAR_FILE) if defined($_GLUE_GRAMMAR_FILE);
$_UNKNOWN_WORD_LABEL_FILE = File::Spec->rel2abs($_UNKNOWN_WORD_LABEL_FILE) if defined($_UNKNOWN_WORD_LABEL_FILE);
$_EXTRACT_FILE = File::Spec->rel2abs($_EXTRACT_FILE) if defined($_EXTRACT_FILE);
foreach (@_PHRASE_TABLE) { $_ = File::Spec->rel2abs($_); }
foreach (@_REORDERING_TABLE) { $_ = File::Spec->rel2abs($_); }
foreach (@_GENERATION_TABLE) { $_ = File::Spec->rel2abs($_); }
$_GIZA_E2F = File::Spec->rel2abs($_GIZA_E2F) if defined($_GIZA_E2F);
$_GIZA_F2E = File::Spec->rel2abs($_GIZA_F2E) if defined($_GIZA_F2E);
my $_SCORE_OPTIONS; # allow multiple switches
foreach (@_SCORE_OPTIONS) { $_SCORE_OPTIONS .= $_." "; }
chop($_SCORE_OPTIONS) if $_SCORE_OPTIONS;
my $_EXTRACT_OPTIONS; # allow multiple switches
foreach (@_EXTRACT_OPTIONS) { $_EXTRACT_OPTIONS .= $_." "; }
chop($_EXTRACT_OPTIONS) if $_EXTRACT_OPTIONS;
my $_ADDITIONAL_INI; # allow multiple switches
foreach (@_ADDITIONAL_INI) { $_ADDITIONAL_INI .= $_." "; }
chop($_ADDITIONAL_INI) if $_ADDITIONAL_INI;
$_HIERARCHICAL = 1 if $_SOURCE_SYNTAX || $_TARGET_SYNTAX;
$_XML = 1 if $_SOURCE_SYNTAX || $_TARGET_SYNTAX;
my $___FACTOR_DELIMITER = $_FACTOR_DELIMITER;
$___FACTOR_DELIMITER = '|' unless ($_FACTOR_DELIMITER);
print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";
# Setting the steps to perform
my $___VERBOSE = 0;
my $___FIRST_STEP = 1;
my $___LAST_STEP = 9;
$___VERBOSE = $_VERBOSE if $_VERBOSE;
$___FIRST_STEP = $_FIRST_STEP if $_FIRST_STEP;
$___LAST_STEP = $_LAST_STEP if $_LAST_STEP;
my $___DO_STEPS = $___FIRST_STEP."-".$___LAST_STEP;
$___DO_STEPS = $_DO_STEPS if $_DO_STEPS;
my @STEPS = (0,0,0,0,0,0,0,0,0);
my @step_conf = split(',',$___DO_STEPS);
my ($f,$l);
foreach my $step (@step_conf) {
if ($step =~ /^(\d)$/) {
$f = $1;
$l = $1;
}
elsif ($step =~ /^(\d)-(\d)$/) {
$f = $1;
$l = $2;
}
else {
die ("Malformed argument to --do-steps");
}
die("Only steps between 1 and 9 can be used") if ($f < 1 || $l > 9);
die("The first step must be smaller than the last step") if ($f > $l);
for (my $i=$f; $i<=$l; $i++) {
$STEPS[$i] = 1;
}
}
# supporting binaries from other packages
my $MKCLS = "$_EXTERNAL_BINDIR/mkcls";
my $MGIZA_MERGE_ALIGN = "$_EXTERNAL_BINDIR/merge_alignment.py";
my $GIZA;
my $SNT2COOC;
if ($STEPS[1] || $STEPS[2])
{
if(!defined $_MGIZA ){
$GIZA = "$_EXTERNAL_BINDIR/GIZA++";
if (-x "$_EXTERNAL_BINDIR/snt2cooc.out") {
$SNT2COOC = "$_EXTERNAL_BINDIR/snt2cooc.out";
} elsif (-x "$_EXTERNAL_BINDIR/snt2cooc") { # Since "snt2cooc.out" and "snt2cooc" work the same
$SNT2COOC = "$_EXTERNAL_BINDIR/snt2cooc";
}
print STDERR "Using single-thread GIZA\n";
} else {
# accept either "mgiza" or "mgizapp" and either "snt2cooc.out" or "snt2cooc"
if (-x "$_EXTERNAL_BINDIR/mgiza") {
$GIZA = "$_EXTERNAL_BINDIR/mgiza";
} elsif (-x "$_EXTERNAL_BINDIR/mgizapp") {
$GIZA = "$_EXTERNAL_BINDIR/mgizapp";
}
if (-x "$_EXTERNAL_BINDIR/snt2cooc") {
$SNT2COOC = "$_EXTERNAL_BINDIR/snt2cooc";
} elsif (-x "$_EXTERNAL_BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $_EXTERNAL_BINDIR
$SNT2COOC = "$_EXTERNAL_BINDIR/snt2cooc.out";
}
print STDERR "Using multi-thread GIZA\n";
if (!defined($_MGIZA_CPUS)) {
$_MGIZA_CPUS=4;
}
die("ERROR: Cannot find $MGIZA_MERGE_ALIGN") unless (-x $MGIZA_MERGE_ALIGN);
}
# override
$SNT2COOC = "$_EXTERNAL_BINDIR/$_SNT2COOC" if defined($_SNT2COOC);
}
# parallel extract
my $SPLIT_EXEC = `gsplit --help 2>/dev/null`;
if($SPLIT_EXEC) {
$SPLIT_EXEC = 'gsplit';
}
else {
$SPLIT_EXEC = 'split';
}
my $SORT_EXEC = `gsort --help 2>/dev/null`;
if($SORT_EXEC) {
$SORT_EXEC = 'gsort';
}
else {
$SORT_EXEC = 'sort';
}
my $__SORT_BUFFER_SIZE = "";
$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
my $__SORT_BATCH_SIZE = "";
$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE;
my $__SORT_COMPRESS = "";
$__SORT_COMPRESS = "--compress-program $_SORT_COMPRESS" if $_SORT_COMPRESS;
my $__SORT_PARALLEL = "";
$__SORT_PARALLEL = "--parallel $_SORT_PARALLEL" if $_SORT_PARALLEL;
# supporting scripts/binaries from this package
my $PHRASE_EXTRACT;
if (defined($_EXTRACT_COMMAND)) {
$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/$_EXTRACT_COMMAND";
}
else {
$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/extract";
}
$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_EXTRACT";
my $RULE_EXTRACT;
if (defined($_EXTRACT_COMMAND)) {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/$_EXTRACT_COMMAND";
}
elsif (defined($_GHKM)) {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/extract-ghkm";
}
else {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/extract-rules";
}
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $RULE_EXTRACT";
my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/../bin/lexical-reordering-score";
my $MEMSCORE = "$SCRIPTS_ROOTDIR/../bin/memscore";
my $EPPEX = "$SCRIPTS_ROOTDIR/../bin/eppex";
my $SYMAL = "$SCRIPTS_ROOTDIR/../bin/symal";
my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/giza2bal.pl";
my $PHRASE_SCORE = "$SCRIPTS_ROOTDIR/../bin/score";
$PHRASE_SCORE = "$SCRIPTS_ROOTDIR/generic/score-parallel.perl $_CORES \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_SCORE";
my $PHRASE_CONSOLIDATE = "$SCRIPTS_ROOTDIR/../bin/consolidate";
my $FLEX_SCORER = "$SCRIPTS_ROOTDIR/training/flexibility_score.py";
# utilities
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
# do a sanity check to make sure we can find the necessary binaries since
# these are not installed by default
# not needed if we start after step 2
die("ERROR: Cannot find mkcls, GIZA++/mgiza, & snt2cooc.out/snt2cooc in $_EXTERNAL_BINDIR.\nYou MUST specify the parameter -external-bin-dir") unless ((!$STEPS[2]) ||
(defined($_EXTERNAL_BINDIR) && -x $GIZA && defined($SNT2COOC) && -x $MKCLS));
# set varibles to defaults or from options
my $___ROOT_DIR = ".";
$___ROOT_DIR = $_ROOT_DIR if $_ROOT_DIR;
my $___CORPUS_DIR = $___ROOT_DIR."/corpus";
$___CORPUS_DIR = $_CORPUS_DIR if $_CORPUS_DIR;
die("ERROR: use --corpus to specify corpus") unless $_CORPUS || !($STEPS[1] || $STEPS[4] || $STEPS[5] || $STEPS[8]);
my $___CORPUS = $_CORPUS;
# check the final-alignment-model switch
my $___FINAL_ALIGNMENT_MODEL = undef;
$___FINAL_ALIGNMENT_MODEL = 'hmm' if $_HMM_ALIGN;
$___FINAL_ALIGNMENT_MODEL = $_FINAL_ALIGNMENT_MODEL if $_FINAL_ALIGNMENT_MODEL;
die("ERROR: --final-alignment-model can be set to '1', '2', 'hmm', '3', '4' or '5'")
unless (!defined($___FINAL_ALIGNMENT_MODEL) or $___FINAL_ALIGNMENT_MODEL =~ /^(1|2|hmm|3|4|5)$/);
my $___GIZA_EXTENSION = 'A3.final';
if(defined $___FINAL_ALIGNMENT_MODEL) {
$___GIZA_EXTENSION = 'A1.5' if $___FINAL_ALIGNMENT_MODEL eq '1';
$___GIZA_EXTENSION = 'A2.5' if $___FINAL_ALIGNMENT_MODEL eq '2';
$___GIZA_EXTENSION = 'Ahmm.5' if $___FINAL_ALIGNMENT_MODEL eq 'hmm';
}
$___GIZA_EXTENSION = $_GIZA_EXTENSION if $_GIZA_EXTENSION;
my $___CORPUS_COMPRESSION = '';
if ($_CORPUS_COMPRESSION) {
$___CORPUS_COMPRESSION = ".$_CORPUS_COMPRESSION";
}
# foreign/English language extension
die("ERROR: use --f to specify foreign language") unless $_F;
die("ERROR: use --e to specify English language") unless $_E;
my $___F = $_F;
my $___E = $_E;
# vocabulary files in corpus dir
my $___VCB_E = $___CORPUS_DIR."/".$___E.".vcb";
my $___VCB_F = $___CORPUS_DIR."/".$___F.".vcb";
# GIZA generated files
my $___GIZA = $___ROOT_DIR."/giza";
my $___GIZA_E2F = $___GIZA.".".$___E."-".$___F;
my $___GIZA_F2E = $___GIZA.".".$___F."-".$___E;
$___GIZA_E2F = $_GIZA_E2F if $_GIZA_E2F;
$___GIZA_F2E = $_GIZA_F2E if $_GIZA_F2E;
my $___GIZA_OPTION = "";
$___GIZA_OPTION = $_GIZA_OPTION if $_GIZA_OPTION;
# alignment heuristic
my $___ALIGNMENT = "grow-diag-final";
$___ALIGNMENT = $_ALIGNMENT if $_ALIGNMENT;
my $___NOTE_ALIGNMENT_DROPS = 1;
# baseline alignment model for incremetal updating
die "ERROR: buggy definition of baseline alignment model, should have 8 values:\n\t".join("\n\t",@_BASELINE_ALIGNMENT_MODEL)."\n"
unless scalar(@_BASELINE_ALIGNMENT_MODEL) == 8 || scalar(@_BASELINE_ALIGNMENT_MODEL) == 0;
die "ERROR: use of baseline alignment model limited to HMM training (-hmm-align)\n"
if defined($___FINAL_ALIGNMENT_MODEL) && $___FINAL_ALIGNMENT_MODEL ne 'hmm' && scalar(@_BASELINE_ALIGNMENT_MODEL) == 8;
# model dir and alignment/extract file
my $___MODEL_DIR = $___ROOT_DIR."/model";
$___MODEL_DIR = $_MODEL_DIR if $_MODEL_DIR;
my $___ALIGNMENT_FILE = "$___MODEL_DIR/aligned";
$___ALIGNMENT_FILE = $_ALIGNMENT_FILE if $_ALIGNMENT_FILE;
my $___ALIGNMENT_STEM = $___ALIGNMENT_FILE;
$___ALIGNMENT_STEM = $_ALIGNMENT_STEM if $_ALIGNMENT_STEM;
my $___EXTRACT_FILE = $___MODEL_DIR."/extract";
$___EXTRACT_FILE = $_EXTRACT_FILE if $_EXTRACT_FILE;
my $___GLUE_GRAMMAR_FILE = $___MODEL_DIR."/glue-grammar";
$___GLUE_GRAMMAR_FILE = $_GLUE_GRAMMAR_FILE if $_GLUE_GRAMMAR_FILE;
my $___CONFIG = $___MODEL_DIR."/moses.ini";
$___CONFIG = $_CONFIG if $_CONFIG;
my $___DONT_ZIP = 0;
$_DONT_ZIP = $___DONT_ZIP unless $___DONT_ZIP;
my $___TEMP_DIR = $___MODEL_DIR;
$___TEMP_DIR = $_TEMP_DIR if $_TEMP_DIR;
my $___CONTINUE = 0;
$___CONTINUE = $_CONTINUE if $_CONTINUE;
my $___MAX_PHRASE_LENGTH = "7";
$___MAX_PHRASE_LENGTH = "10" if $_HIERARCHICAL;
my $___LEXICAL_WEIGHTING = 1;
my $___LEXICAL_COUNTS = 0;
my $___LEXICAL_FILE = $___MODEL_DIR."/lex";
$___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH;
$___LEXICAL_WEIGHTING = 0 if $_NO_LEXICAL_WEIGHTING;
$___LEXICAL_COUNTS = 1 if $_LEXICAL_COUNTS;
$___LEXICAL_FILE = $_LEXICAL_FILE if $_LEXICAL_FILE;
my $___PHRASE_SCORER = "phrase-extract";
$___PHRASE_SCORER = "memscore" if defined $_MEMSCORE;
my $___MEMSCORE_OPTIONS = "-s ml -s lexweights \$LEX_E2F -r ml -r lexweights \$LEX_F2E -s const 2.718";
$___MEMSCORE_OPTIONS = $_MEMSCORE if $_MEMSCORE;
my @___LM = ();
if ($STEPS[9]) {
die "ERROR: use --lm factor:order:filename to specify at least one language model"
if scalar @_LM == 0;
foreach my $lm (@_LM) {
my $type = 0; # default to srilm
my ($f, $order, $filename);
($f, $order, $filename, $type) = split /:/, $lm, 4;
die "ERROR: Wrong format of --lm. Expected: --lm factor:order:filename"
if $f !~ /^[0-9,]+$/ || $order !~ /^[0-9]+$/ || !defined $filename;
die "ERROR: Filename is not absolute: $filename"
unless file_name_is_absolute $filename;
die "ERROR: Language model file not found or empty: $filename"
if ! -e $filename;
push @___LM, [ $f, $order, $filename, $type ];
}
}
my $___PARTS = 1;
$___PARTS = $_PARTS if $_PARTS;
my $___DIRECTION = 0;
$___DIRECTION = $_DIRECTION if $_DIRECTION;
# don't fork
my $___NOFORK = !defined $_PARALLEL;
my $___ONLY_PRINT_GIZA = 0;
$___ONLY_PRINT_GIZA = 1 if $_ONLY_PRINT_GIZA;
# Reordering model (esp. lexicalized)
my $___REORDERING = "distance";
$___REORDERING = $_REORDERING if $_REORDERING;
my $___REORDERING_SMOOTH = 0.5;
$___REORDERING_SMOOTH = $_REORDERING_SMOOTH if $_REORDERING_SMOOTH;
my @REORDERING_MODELS;
my $REORDERING_LEXICAL = 0; # flag for building lexicalized reordering models
my %REORDERING_MODEL_TYPES = ();
my $___MAX_LEXICAL_REORDERING = 0;
$___MAX_LEXICAL_REORDERING = 1 if $_MAX_LEXICAL_REORDERING;
my $model_num = 0;
foreach my $r (split(/\,/,$___REORDERING)) {
#Don't do anything for distance models
next if ($r eq "distance");
#change some config string options, to be backward compatible
$r =~ s/orientation/msd/;
$r =~ s/unidirectional/backward/;
#set default values
push @REORDERING_MODELS, {};
$REORDERING_MODELS[$model_num]{"dir"} = "backward";
$REORDERING_MODELS[$model_num]{"type"} = "wbe";
$REORDERING_MODELS[$model_num]{"collapse"} = "allff";
#handle the options set in the config string
foreach my $reoconf (split(/\-/,$r)) {
if ($reoconf =~ /^((msd)|(mslr)|(monotonicity)|(leftright))/) {
$REORDERING_MODELS[$model_num]{"orient"} = $reoconf;
$REORDERING_LEXICAL = 1;
}
elsif ($reoconf =~ /^((bidirectional)|(backward)|(forward))/) {
$REORDERING_MODELS[$model_num]{"dir"} = $reoconf;
}
elsif ($reoconf =~ /^((fe)|(f))/) {
$REORDERING_MODELS[$model_num]{"lang"} = $reoconf;
}
elsif ($reoconf =~ /^((hier)|(phrase)|(wbe))/) {
$REORDERING_MODELS[$model_num]{"type"} = $reoconf;
}
elsif ($reoconf =~ /^((collapseff)|(allff))/) {
$REORDERING_MODELS[$model_num]{"collapse"} = $reoconf;
}
else {
print STDERR "unknown type in reordering model config string: \"$reoconf\" in $r\n";
exit(1);
}
}
#check that the required attributes are given
if (!defined($REORDERING_MODELS[$model_num]{"type"})) {
print STDERR "you have to give the type of the reordering models (mslr, msd, monotonicity or leftright); it is not done in $r\n";
exit(1);
}
if (!defined($REORDERING_MODELS[$model_num]{"lang"})) {
print STDERR "you have specify which languages to condition on for lexical reordering (f or fe); it is not done in $r\n";
exit(1);
}
#fix the all-string
$REORDERING_MODELS[$model_num]{"filename"} = $REORDERING_MODELS[$model_num]{"type"}."-".$REORDERING_MODELS[$model_num]{"orient"}.'-'.
$REORDERING_MODELS[$model_num]{"dir"}."-".$REORDERING_MODELS[$model_num]{"lang"};
$REORDERING_MODELS[$model_num]{"config"} = $REORDERING_MODELS[$model_num]{"filename"}."-".$REORDERING_MODELS[$model_num]{"collapse"};
# fix numfeatures
$REORDERING_MODELS[$model_num]{"numfeatures"} = 1;
$REORDERING_MODELS[$model_num]{"numfeatures"} = 2 if $REORDERING_MODELS[$model_num]{"dir"} eq "bidirectional";
if ($REORDERING_MODELS[$model_num]{"collapse"} ne "collapseff") {
if ($REORDERING_MODELS[$model_num]{"orient"} eq "msd") {
$REORDERING_MODELS[$model_num]{"numfeatures"} *= 3;
}
elsif ($REORDERING_MODELS[$model_num]{"orient"} eq "mslr") {
$REORDERING_MODELS[$model_num]{"numfeatures"} *= 4;
}
else {
$REORDERING_MODELS[$model_num]{"numfeatures"} *= 2;
}
}
# fix the overall model selection
if (defined $REORDERING_MODEL_TYPES{$REORDERING_MODELS[$model_num]{"type"}}) {
$REORDERING_MODEL_TYPES{$REORDERING_MODELS[$model_num]{"type"}} .=
$REORDERING_MODELS[$model_num]{"orient"}."-";
}
else {
$REORDERING_MODEL_TYPES{$REORDERING_MODELS[$model_num]{"type"}} =
$REORDERING_MODELS[$model_num]{"orient"};
}
$model_num++;
}
# pick the overall most specific model for each reordering model type
for my $mtype ( keys %REORDERING_MODEL_TYPES) {
if ($REORDERING_MODEL_TYPES{$mtype} =~ /msd/) {
$REORDERING_MODEL_TYPES{$mtype} = "msd"
}
elsif ($REORDERING_MODEL_TYPES{$mtype} =~ /monotonicity/) {
$REORDERING_MODEL_TYPES{$mtype} = "monotonicity"
}
else {
$REORDERING_MODEL_TYPES{$mtype} = "mslr"
}
}
### Factored translation models
my $___NOT_FACTORED = !$_FORCE_FACTORED_FILENAMES;
$___NOT_FACTORED = 0 if $_INPUT_FACTOR_MAX;
my $___ALIGNMENT_FACTORS = "0-0";
$___ALIGNMENT_FACTORS = $_ALIGNMENT_FACTORS if defined($_ALIGNMENT_FACTORS);
die("ERROR: format for alignment factors is \"0-0\" or \"0,1,2-0,1\", you provided $___ALIGNMENT_FACTORS\n") if $___ALIGNMENT_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*$/;
$___NOT_FACTORED = 0 unless $___ALIGNMENT_FACTORS eq "0-0";
my $___TRANSLATION_FACTORS = undef;
$___TRANSLATION_FACTORS = "0-0" unless defined($_DECODING_STEPS); # single factor default
$___TRANSLATION_FACTORS = $_TRANSLATION_FACTORS if defined($_TRANSLATION_FACTORS);
die("ERROR: format for translation factors is \"0-0\" or \"0-0+1-1\" or \"0-0+0,1-0,1\", you provided $___TRANSLATION_FACTORS\n")
if defined $___TRANSLATION_FACTORS && $___TRANSLATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;
$___NOT_FACTORED = 0 unless $___TRANSLATION_FACTORS eq "0-0";
my $___REORDERING_FACTORS = undef;
$___REORDERING_FACTORS = "0-0" if defined($_REORDERING) && ! defined($_DECODING_STEPS); # single factor default
$___REORDERING_FACTORS = $_REORDERING_FACTORS if defined($_REORDERING_FACTORS);
die("ERROR: format for reordering factors is \"0-0\" or \"0-0+1-1\" or \"0-0+0,1-0,1\", you provided $___REORDERING_FACTORS\n")
if defined $___REORDERING_FACTORS && $___REORDERING_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;
$___NOT_FACTORED = 0 if defined($_REORDERING) && $___REORDERING_FACTORS ne "0-0";
my $___GENERATION_FACTORS = undef;
$___GENERATION_FACTORS = $_GENERATION_FACTORS if defined($_GENERATION_FACTORS);
die("ERROR: format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n")
if defined $___GENERATION_FACTORS && $___GENERATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;
$___NOT_FACTORED = 0 if defined($___GENERATION_FACTORS);
my $___DECODING_STEPS = "t0";
$___DECODING_STEPS = $_DECODING_STEPS if defined($_DECODING_STEPS);
die("ERROR: format for decoding steps is \"t0,g0,t1,g1:t2\", you provided $___DECODING_STEPS\n")
if defined $_DECODING_STEPS && $_DECODING_STEPS !~ /^[tg]\d+([,:][tg]\d+)*$/;
### MAIN
&prepare() if $STEPS[1];
&run_giza() if $STEPS[2];
&word_align() if $STEPS[3];
&get_lexical_factored() if $STEPS[4];
&extract_phrase_factored() if $STEPS[5];
&score_phrase_factored() if $STEPS[6];
&get_reordering_factored() if $STEPS[7];
&get_generation_factored() if $STEPS[8];
&create_ini() if $STEPS[9];
### (1) PREPARE CORPUS
sub prepare {
print STDERR "(1) preparing corpus @ ".`date`;
safesystem("mkdir -p $___CORPUS_DIR") or die("ERROR: could not create corpus dir $___CORPUS_DIR");
print STDERR "(1.0) selecting factors @ ".`date`;
my ($factor_f,$factor_e) = split(/\-/,$___ALIGNMENT_FACTORS);
my $corpus = ($___NOT_FACTORED && !$_XML) ? $___CORPUS : $___CORPUS.".".$___ALIGNMENT_FACTORS;
my $VCB_F, my $VCB_E;
if ($___NOFORK) {
if (! $___NOT_FACTORED || $_XML) {
&reduce_factors($___CORPUS.".".$___F,$corpus.".".$___F,$factor_f);
&reduce_factors($___CORPUS.".".$___E,$corpus.".".$___E,$factor_e);
}
&make_classes($corpus.".".$___F,$___VCB_F.".classes");
&make_classes($corpus.".".$___E,$___VCB_E.".classes");
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
&numberize_txt_file($VCB_F,$corpus.".".$___F,
$VCB_E,$corpus.".".$___E,
$___CORPUS_DIR."/$___F-$___E-int-train.snt");
&numberize_txt_file($VCB_E,$corpus.".".$___E,
$VCB_F,$corpus.".".$___F,
$___CORPUS_DIR."/$___E-$___F-int-train.snt");
}
else {
print "Forking...\n";
if (! $___NOT_FACTORED || $_XML) {
my $pid = fork();
die "ERROR: couldn't fork" unless defined $pid;
if (!$pid) {
&reduce_factors($___CORPUS.".".$___F,$corpus.".".$___F,$factor_f);
exit 0;
}
else {
&reduce_factors($___CORPUS.".".$___E,$corpus.".".$___E,$factor_e);
}
printf "Waiting for second reduce_factors process...\n";
waitpid($pid, 0);
}
my $pid = fork();
die "ERROR: couldn't fork" unless defined $pid;
if (!$pid) {
&make_classes($corpus.".".$___F,$___VCB_F.".classes");
exit 0;
} # parent
my $pid2 = fork();
die "ERROR: couldn't fork again" unless defined $pid2;
if (!$pid2) { #child
&make_classes($corpus.".".$___E,$___VCB_E.".classes");
exit 0;
}
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
&numberize_txt_file($VCB_F,$corpus.".".$___F,
$VCB_E,$corpus.".".$___E,
$___CORPUS_DIR."/$___F-$___E-int-train.snt");
&numberize_txt_file($VCB_E,$corpus.".".$___E,
$VCB_F,$corpus.".".$___F,
$___CORPUS_DIR."/$___E-$___F-int-train.snt");
printf "Waiting for mkcls processes to finish...\n";
waitpid($pid2, 0);
waitpid($pid, 0);
}
if (defined $_DICTIONARY)
{
my $dict= &make_dicts_files($_DICTIONARY, $VCB_F,$VCB_E,
$___CORPUS_DIR."/gizadict.$___E-$___F",
$___CORPUS_DIR."/gizadict.$___F-$___E");
if (not $dict)
{
print STDERR "WARNING: empty dictionary\n";
undef $_DICTIONARY;
}
}
}
sub reduce_factors {
my ($full,$reduced,$factors) = @_;
# my %INCLUDE;
# foreach my $factor (split(/,/,$factors)) {
# $INCLUDE{$factor} = 1;
# }
my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);
print STDERR "(1.0.5) reducing factors to produce $reduced @ ".`date`;
while(-e $reduced.".lock") {
sleep(10);
}
if (-e $reduced) {
print STDERR " $reduced in place, reusing\n";
return;
}
if (-e $reduced.".gz") {
print STDERR " $reduced.gz in place, reusing\n";
return;
}
unless ($_XML) {
# peek at input, to check if we are asked to produce exactly the
# available factors
my $inh = open_or_zcat($full);
my $firstline = <$inh>;
die "Corpus file $full is empty" unless $firstline;
close $inh;
# pick first word
$firstline =~ s/^\s*//;
$firstline =~ s/\s.*//;
# count factors
my @WORD = split(/ /,$firstline);
my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
my $maxfactorindex = scalar(@FACTOR)-1;
if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
# create just symlink; preserving compression
my $realfull = $full;
if (!-e $realfull && -e $realfull.".gz") {
$realfull .= ".gz";
$reduced =~ s/(\.gz)?$/.gz/;
}
safesystem("ln -s '$realfull' '$reduced'")
or die "Failed to create symlink $realfull -> $reduced";
return;
}
}
# The default is to select the needed factors
`touch $reduced.lock`;
*IN = open_or_zcat($full);
open(OUT,">".$reduced) or die "ERROR: Can't write $reduced";
my $nr = 0;
while(<IN>) {
$nr++;
print STDERR "." if $nr % 10000 == 0;
print STDERR "($nr)" if $nr % 100000 == 0;
s/<\S[^>]*>/ /g if $_XML; # remove xml
chomp; s/ +/ /g; s/^ //; s/ $//;
my $first = 1;
foreach (split) {
my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
# \Q causes to disable metacharacters in regex
print OUT " " unless $first;
$first = 0;
my $first_factor = 1;
foreach my $outfactor (@INCLUDE) {
print OUT $___FACTOR_DELIMITER unless $first_factor;
$first_factor = 0;
my $out = $FACTOR[$outfactor];
die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
print OUT $out;
}
# for(my $factor=0;$factor<=$#FACTOR;$factor++) {
# next unless defined($INCLUDE{$factor});
# print OUT "|" unless $first_factor;
# $first_factor = 0;
# print OUT $FACTOR[$factor];
# }
}
print OUT "\n";
}
print STDERR "\n";
close(OUT);
close(IN);
`rm -f $reduced.lock`;
}
sub make_classes {
my ($corpus,$classes) = @_;
my $cmd = "$MKCLS -c50 -n2 -p$corpus -V$classes opt";
print STDERR "(1.1) running mkcls @ ".`date`."$cmd\n";
if (-e $classes) {
print STDERR " $classes already in place, reusing\n";
return;
}
safesystem("$cmd"); # ignoring the wrong exit code from mkcls (not dying)
}
sub get_vocabulary {
# return unless $___LEXICAL_WEIGHTING;
my($corpus,$vcb,$is_target) = @_;
print STDERR "(1.2) creating vcb file $vcb @ ".`date`;
my %WORD;
open(TXT,$corpus) or die "ERROR: Can't read $corpus";
while(<TXT>) {
chop;
foreach (split) { $WORD{$_}++; }
}
close(TXT);
my ($id,%VCB);
open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb";
# words from baseline alignment model when incrementally updating
if (scalar @_BASELINE_ALIGNMENT_MODEL) {
open(BASELINE_VCB,$_BASELINE_ALIGNMENT_MODEL[$is_target]);
while(<BASELINE_VCB>) {
chop;
my ($i,$word,$count) = split;
if (defined($WORD{$word})) {
$count += $WORD{$word};
delete($WORD{$word});
}
printf VCB "%d\t%s\t%d\n",$i,$word,$count;
$VCB{$word} = $i;
$id = $i+1;
}
close(BASELINE_VCB);
}
# not incrementally updating
else {
print VCB "1\tUNK\t0\n";
$id=2;
}
my @NUM;
foreach my $word (keys %WORD) {
my $vcb_with_number = sprintf("%07d %s",$WORD{$word},$word);
push @NUM,$vcb_with_number;
}
foreach (reverse sort @NUM) {
my($count,$word) = split;
printf VCB "%d\t%s\t%d\n",$id,$word,$count;
$VCB{$word} = $id;
$id++;
}
close(VCB);
return \%VCB;
}
sub make_dicts_files {
my ($dictfile,$VCB_SRC,$VCB_TGT,$outfile1, $outfile2) = @_;
my %numberized_dict;
print STDERR "(1.3) numberizing dictionaries $outfile1 and $outfile2 @ ".`date`;
if ((-e $outfile1) && (-e $outfile2)) {
print STDERR " dictionary files already in place, reusing\n";
return;
}
open(DICT,$dictfile) or die "ERROR: Can't read $dictfile";
open(OUT1,">$outfile1") or die "ERROR: Can't write $outfile1";
open(OUT2,">$outfile2") or die "ERROR: Can't write $outfile2";
while(my $line = <DICT>) {
my $src, my $tgt;
($src, $tgt) = split(/\s+/,$line);
chomp($tgt); chomp($src);
if ((not defined($$VCB_TGT{$tgt})) || (not defined($$VCB_SRC{$src})))
{
print STDERR "Warning: unknown word in dictionary: $src <=> $tgt\n";
next;
}
$numberized_dict{int($$VCB_TGT{$tgt})} = int($$VCB_SRC{$src}) ;
}
close(DICT);
my @items = sort {$a <=> $b} keys %numberized_dict;
if (scalar(@items) == 0) { return 0; }
foreach my $key (@items)
{
print OUT1 "$key $numberized_dict{$key}\n";
print OUT2 "$numberized_dict{$key} $key\n";
}
close(OUT);
return 1;
}
sub numberize_txt_file {
my ($VCB_DE,$in_de,$VCB_EN,$in_en,$out) = @_;
my %OUT;
print STDERR "(1.3) numberizing corpus $out @ ".`date`;
if (-e $out) {
print STDERR " $out already in place, reusing\n";
return;
}
open(IN_DE,$in_de) or die "ERROR: Can't read $in_de";
open(IN_EN,$in_en) or die "ERROR: Can't read $in_en";
open(OUT,">$out") or die "ERROR: Can't write $out";
while(my $de = <IN_DE>) {
my $en = <IN_EN>;
print OUT "1\n";
print OUT &numberize_line($VCB_EN,$en);
print OUT &numberize_line($VCB_DE,$de);
}
close(IN_DE);
close(IN_EN);
close(OUT);
}
sub numberize_line {
my ($VCB,$txt) = @_;
chomp($txt);
my $out = "";
my $not_first = 0;
foreach (split(/ /,$txt)) {
next if $_ eq '';
$out .= " " if $not_first++;
print STDERR "Unknown word '$_'\n" unless defined($$VCB{$_});
$out .= $$VCB{$_};
}
return $out."\n";
}
### (2) RUN GIZA
sub run_giza {
return &run_giza_on_parts if $___PARTS>1;
print STDERR "(2) running giza @ ".`date`;
if ($___DIRECTION == 1 || $___DIRECTION == 2 || $___NOFORK) {
&run_single_giza($___GIZA_F2E,$___E,$___F,
$___VCB_E,$___VCB_F,
$___CORPUS_DIR."/$___F-$___E-int-train.snt")
unless $___DIRECTION == 2;
&run_single_giza($___GIZA_E2F,$___F,$___E,
$___VCB_F,$___VCB_E,
$___CORPUS_DIR."/$___E-$___F-int-train.snt")
unless $___DIRECTION == 1;
} else {
my $pid = fork();
if (!defined $pid) {
die "ERROR: Failed to fork";
}
if (!$pid) { # i'm the child
&run_single_giza($___GIZA_F2E,$___E,$___F,
$___VCB_E,$___VCB_F,
$___CORPUS_DIR."/$___F-$___E-int-train.snt");
exit 0; # child exits
} else { #i'm the parent
&run_single_giza($___GIZA_E2F,$___F,$___E,
$___VCB_F,$___VCB_E,
$___CORPUS_DIR."/$___E-$___F-int-train.snt");
}
printf "Waiting for second GIZA process...\n";
waitpid($pid, 0);
}
}
sub run_giza_on_parts {
print STDERR "(2) running giza on $___PARTS cooc parts @ ".`date`;
my $size = `cat $___CORPUS_DIR/$___F-$___E-int-train.snt | wc -l`;
die "ERROR: Failed to get number of lines in $___CORPUS_DIR/$___F-$___E-int-train.snt"
if $size == 0;
if ($___DIRECTION == 1 || $___DIRECTION == 2 || $___NOFORK) {
&run_single_giza_on_parts($___GIZA_F2E,$___E,$___F,
$___VCB_E,$___VCB_F,
$___CORPUS_DIR."/$___F-$___E-int-train.snt",$size)
unless $___DIRECTION == 2;
&run_single_giza_on_parts($___GIZA_E2F,$___F,$___E,
$___VCB_F,$___VCB_E,
$___CORPUS_DIR."/$___E-$___F-int-train.snt",$size)
unless $___DIRECTION == 1;
} else {
my $pid = fork();
if (!defined $pid) {
die "ERROR: Failed to fork";
}
if (!$pid) { # i'm the child
&run_single_giza_on_parts($___GIZA_F2E,$___E,$___F,
$___VCB_E,$___VCB_F,
$___CORPUS_DIR."/$___F-$___E-int-train.snt",$size);
exit 0; # child exits
} else { #i'm the parent
&run_single_giza_on_parts($___GIZA_E2F,$___F,$___E,
$___VCB_F,$___VCB_E,
$___CORPUS_DIR."/$___E-$___F-int-train.snt",$size);
}
printf "Waiting for second GIZA process...\n";
waitpid($pid, 0);
}
}
sub run_single_giza_on_parts {
my($dir,$e,$f,$vcb_e,$vcb_f,$train,$size) = @_;
my $part = 0;
# break up training data into parts
open(SNT,$train) or die "ERROR: Can't read $train";
{
my $i=0;
while(<SNT>) {
$i++;
if ($i%3==1 && $part < ($___PARTS*$i)/$size && $part<$___PARTS) {
close(PART) if $part;
$part++;
safesystem("mkdir -p $___CORPUS_DIR/part$part") or die("ERROR: could not create $___CORPUS_DIR/part$part");
open(PART,">$___CORPUS_DIR/part$part/$f-$e-int-train.snt")
or die "ERROR: Can't write $___CORPUS_DIR/part$part/$f-$e-int-train.snt";
}
print PART $_;
}
}
close(PART);
close(SNT);
# run snt2cooc in parts
my @COOC_PART_FILE_NAME;
for(my $i=1;$i<=$___PARTS;$i++) {
&run_single_snt2cooc("$dir/part$i",$e,$f,$vcb_e,$vcb_f,"$___CORPUS_DIR/part$i/$f-$e-int-train.snt");
push @COOC_PART_FILE_NAME, "$dir/part$i/$f-$e.cooc";
}
# include baseline cooc, if baseline alignment model (incremental training)
if (scalar @_BASELINE_ALIGNMENT_MODEL) {
push @COOC_PART_FILE_NAME, $_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)];
}
&merge_cooc_files($dir,$e,$f,@COOC_PART_FILE_NAME);
# run giza
&run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
}
sub merge_cooc_files {
my ($dir,$e,$f,@COOC_PART_FILE_NAME) = @_;
# merge parts
open(COOC,">$dir/$f-$e.cooc") or die "ERROR: Can't write $dir/$f-$e.cooc";
my(@PF,@CURRENT);
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
print STDERR "merging cooc file $COOC_PART_FILE_NAME[$i]...\n";
open($PF[$i],$COOC_PART_FILE_NAME[$i]) or die "ERROR: Can't read $COOC_PART_FILE_NAME[$i]";
my $pf = $PF[$i];
$CURRENT[$i] = <$pf>;
chop($CURRENT[$i]) if $CURRENT[$i];
}
while(1) {
my ($min1,$min2) = (1e20,1e20);
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
next unless $CURRENT[$i];
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
if ($w1 < $min1 || ($w1 == $min1 && $w2 < $min2)) {
$min1 = $w1;
$min2 = $w2;
}
}
last if $min1 == 1e20;
print COOC "$min1 $min2\n";
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
next unless $CURRENT[$i];
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
if ($w1 == $min1 && $w2 == $min2) {
my $pf = $PF[$i];
$CURRENT[$i] = <$pf>;
chop($CURRENT[$i]) if $CURRENT[$i];
}
}
}
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
close($PF[$i]);
}
close(COOC);
}
sub run_single_giza {
my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
my %GizaDefaultOptions =
(p0 => .999 ,
m1 => 5 ,
m2 => 0 ,
m3 => 3 ,
m4 => 3 ,
o => "giza" ,
nodumps => 1 ,
onlyaldumps => 1 ,
nsmooth => 4 ,
model1dumpfrequency => 1,
model4smoothfactor => 0.4 ,
t => $vcb_f,
s => $vcb_e,
c => $train,
CoocurrenceFile => "$dir/$f-$e.cooc",
o => "$dir/$f-$e");
if (defined $_DICTIONARY)
{ $GizaDefaultOptions{d} = $___CORPUS_DIR."/gizadict.$f-$e"; }
# 5 Giza threads
if (defined $_MGIZA){ $GizaDefaultOptions{"ncpus"} = $_MGIZA_CPUS; }
if ($_HMM_ALIGN) {
$GizaDefaultOptions{m3} = 0;
$GizaDefaultOptions{m4} = 0;
$GizaDefaultOptions{hmmiterations} = 5;
$GizaDefaultOptions{hmmdumpfrequency} = 5;
$GizaDefaultOptions{nodumps} = 0;
}
if ($___FINAL_ALIGNMENT_MODEL) {
$GizaDefaultOptions{nodumps} = ($___FINAL_ALIGNMENT_MODEL =~ /^[345]$/)? 1: 0;
$GizaDefaultOptions{model345dumpfrequency} = 0;
$GizaDefaultOptions{model1dumpfrequency} = ($___FINAL_ALIGNMENT_MODEL eq '1')? 5: 0;
$GizaDefaultOptions{m2} = ($___FINAL_ALIGNMENT_MODEL eq '2')? 5: 0;
$GizaDefaultOptions{model2dumpfrequency} = ($___FINAL_ALIGNMENT_MODEL eq '2')? 5: 0;
$GizaDefaultOptions{hmmiterations} = ($___FINAL_ALIGNMENT_MODEL =~ /^(hmm|[345])$/)? 5: 0;
$GizaDefaultOptions{hmmdumpfrequency} = ($___FINAL_ALIGNMENT_MODEL eq 'hmm')? 5: 0;
$GizaDefaultOptions{m3} = ($___FINAL_ALIGNMENT_MODEL =~ /^[345]$/)? 3: 0;
$GizaDefaultOptions{m4} = ($___FINAL_ALIGNMENT_MODEL =~ /^[45]$/)? 3: 0;
$GizaDefaultOptions{m5} = ($___FINAL_ALIGNMENT_MODEL eq '5')? 3: 0;
}
if (scalar(@_BASELINE_ALIGNMENT_MODEL)) {
$GizaDefaultOptions{oldTrPrbs} = $_BASELINE_ALIGNMENT_MODEL[4 + ($dir eq $___GIZA_F2E?2:0)];
$GizaDefaultOptions{oldAlPrbs} = $_BASELINE_ALIGNMENT_MODEL[5 + ($dir eq $___GIZA_F2E?2:0)];
$GizaDefaultOptions{step_k} = 1;
}
if ($___GIZA_OPTION) {
foreach (split(/[ ,]+/,$___GIZA_OPTION)) {
my ($option,$value) = split(/=/,$_,2);
$GizaDefaultOptions{$option} = $value;
}
}
my $GizaOptions;
foreach my $option (sort keys %GizaDefaultOptions){
my $value = $GizaDefaultOptions{$option} ;
$GizaOptions .= " -$option $value" ;
}
&run_single_snt2cooc($dir,$e,$f,$vcb_e,$vcb_f,$train) if $___PARTS == 1;
print STDERR "(2.1b) running giza $f-$e @ ".`date`."$GIZA $GizaOptions\n";
if (-e "$dir/$f-$e.$___GIZA_EXTENSION.gz") {
print " $dir/$f-$e.$___GIZA_EXTENSION.gz seems finished, reusing.\n";
return;
}
print "$GIZA $GizaOptions\n";
return if $___ONLY_PRINT_GIZA;
safesystem("$GIZA $GizaOptions");
if (defined $_MGIZA and (!defined $___FINAL_ALIGNMENT_MODEL or $___FINAL_ALIGNMENT_MODEL ne '2')){
print STDERR "Merging $___GIZA_EXTENSION.part\* tables\n";
safesystem("$MGIZA_MERGE_ALIGN $dir/$f-$e.$___GIZA_EXTENSION.part*>$dir/$f-$e.$___GIZA_EXTENSION");
#system("rm -f $dir/$f-$e/*.part*");
}
die "ERROR: Giza did not produce the output file $dir/$f-$e.$___GIZA_EXTENSION. Is your corpus clean (reasonably-sized sentences)?"
if ! -e "$dir/$f-$e.$___GIZA_EXTENSION";
safesystem("rm -f $dir/$f-$e.$___GIZA_EXTENSION.gz") or die;
safesystem("gzip $dir/$f-$e.$___GIZA_EXTENSION") or die;
}
sub run_single_snt2cooc {
my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n";
my $suffix = (scalar @_BASELINE_ALIGNMENT_MODEL) ? ".new" : "";
safesystem("mkdir -p $dir") or die("ERROR");
if ($SNT2COOC eq "$_EXTERNAL_BINDIR/snt2cooc.out") {
print "$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc$suffix\n";
safesystem("$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc$suffix") or die("ERROR");
} else {
print "$SNT2COOC $dir/$f-$e.cooc$suffix $vcb_e $vcb_f $train\n";
safesystem("$SNT2COOC $dir/$f-$e.cooc$suffix $vcb_e $vcb_f $train") or die("ERROR");
}
&merge_cooc_files($dir,$e,$f,"$dir/$f-$e.cooc.new",$_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)])
if scalar @_BASELINE_ALIGNMENT_MODEL;
}
### (3) CREATE WORD ALIGNMENT FROM GIZA ALIGNMENTS
sub word_align {
print STDERR "(3) generate word alignment @ ".`date`;
my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);
print STDERR "Combining forward and inverted alignment from files:\n";
print STDERR " $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.{bz2,gz}\n";
print STDERR " $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.{bz2,gz}\n";
### build arguments for giza2bal.pl
my($__ALIGNMENT_CMD,$__ALIGNMENT_INV_CMD);
if (-e "$___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.bz2"){
$__ALIGNMENT_CMD="\"$BZCAT $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.bz2\"";
} elsif (-e "$___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.gz") {
$__ALIGNMENT_CMD="\"$ZCAT $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.gz\"";
} else {
die "ERROR: Can't read $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.{bz2,gz}\n";
}
if ( -e "$___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.bz2"){
$__ALIGNMENT_INV_CMD="\"$BZCAT $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.bz2\"";
}elsif (-e "$___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.gz"){
$__ALIGNMENT_INV_CMD="\"$ZCAT $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.gz\"";
}else{
die "ERROR: Can't read $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.{bz2,gz}\n\n";
}
safesystem("mkdir -p $___MODEL_DIR") or die("ERROR: could not create dir $___MODEL_DIR");
#build arguments for symal
my($__symal_a)="";
$__symal_a="union" if $___ALIGNMENT eq 'union';
$__symal_a="intersect" if $___ALIGNMENT=~ /intersect/;
$__symal_a="grow" if $___ALIGNMENT=~ /grow/;
$__symal_a="srctotgt" if $___ALIGNMENT=~ /srctotgt/;
$__symal_a="tgttosrc" if $___ALIGNMENT=~ /tgttosrc/;
my($__symal_d,$__symal_f,$__symal_b);
($__symal_d,$__symal_f,$__symal_b)=("no","no","no");
$__symal_d="yes" if $___ALIGNMENT=~ /diag/;
$__symal_f="yes" if $___ALIGNMENT=~ /final/;
$__symal_b="yes" if $___ALIGNMENT=~ /final-and/;
safesystem("$GIZA2BAL -d $__ALIGNMENT_INV_CMD -i $__ALIGNMENT_CMD |".
"$SYMAL -alignment=\"$__symal_a\" -diagonal=\"$__symal_d\" ".
"-final=\"$__symal_f\" -both=\"$__symal_b\" > ".
"$___ALIGNMENT_FILE.$___ALIGNMENT")
||
die "ERROR: Can't generate symmetrized alignment file\n"
}
### (4) BUILDING LEXICAL TRANSLATION TABLE
sub get_lexical_factored {
print STDERR "(4) generate lexical translation table $___TRANSLATION_FACTORS @ ".`date`;
if ($___NOT_FACTORED && !$_XML) {
&get_lexical($___CORPUS.".".$___F,
$___CORPUS.".".$___E,
$___ALIGNMENT_FILE.".".$___ALIGNMENT,
$___LEXICAL_FILE,
$___LEXICAL_COUNTS,
$_BASELINE_CORPUS.".".$___F,
$_BASELINE_CORPUS.".".$___E,
$_BASELINE_ALIGNMENT,
$_INSTANCE_WEIGHTS_FILE);
}
else {
foreach my $factor (split(/\+/,$___TRANSLATION_FACTORS)) {
print STDERR "(4) [$factor] generate lexical translation table @ ".`date`;
my ($factor_f,$factor_e) = split(/\-/,$factor);
&reduce_factors($___CORPUS.".".$___F,
$___ALIGNMENT_STEM.".".$factor_f.".".$___F,
$factor_f);
&reduce_factors($___CORPUS.".".$___E,
$___ALIGNMENT_STEM.".".$factor_e.".".$___E,
$factor_e);
my $lexical_file = $___LEXICAL_FILE;
$lexical_file .= ".".$factor if !$___NOT_FACTORED;
&get_lexical($___ALIGNMENT_STEM.".".$factor_f.".".$___F,
$___ALIGNMENT_STEM.".".$factor_e.".".$___E,
$___ALIGNMENT_FILE.".".$___ALIGNMENT,
$lexical_file,
$___LEXICAL_COUNTS,
$_BASELINE_CORPUS.".".$factor_f.".".$___F,
$_BASELINE_CORPUS.".".$factor_e.".".$___E,
$_BASELINE_ALIGNMENT,
$_INSTANCE_WEIGHTS_FILE);
}
}
}
### (5) PHRASE EXTRACTION
sub extract_phrase_factored {
print STDERR "(5) extract phrases @ ".`date`;
if ($___NOT_FACTORED) {
&extract_phrase($___CORPUS.".".$___F,
$___CORPUS.".".$___E,
$___EXTRACT_FILE,
0,1,$REORDERING_LEXICAL);
}
else {
my %EXTRACT_FOR_FACTOR = ();
my $table_number = 0;
my @FACTOR_LIST = ();
foreach my $factor (split(/\+/,"$___TRANSLATION_FACTORS")) {
my $factor_key = $factor.":".&get_max_phrase_length($table_number++);
push @FACTOR_LIST, $factor_key;
$EXTRACT_FOR_FACTOR{$factor_key}{"translation"}++;
}
if ($REORDERING_LEXICAL) {
foreach my $factor (split(/\+/,"$___REORDERING_FACTORS")) {
my $factor_key = $factor.":".&get_max_phrase_length(-1); # max
if (!defined($EXTRACT_FOR_FACTOR{$factor_key}{"translation"})) {
push @FACTOR_LIST, $factor_key;
}
$EXTRACT_FOR_FACTOR{$factor_key}{"reordering"}++;
}
}
$table_number = 0;
foreach my $factor_key (@FACTOR_LIST) {
my ($factor,$max_length) = split(/:/,$factor_key);
print STDERR "(5) [$factor] extract phrases (max length $max_length)@ ".`date`;
my ($factor_f,$factor_e) = split(/\-/,$factor);
&reduce_factors($___CORPUS.".".$___F,
$___ALIGNMENT_STEM.".".$factor_f.".".$___F,
$factor_f);
&reduce_factors($___CORPUS.".".$___E,
$___ALIGNMENT_STEM.".".$factor_e.".".$___E,
$factor_e);
&extract_phrase($___ALIGNMENT_STEM.".".$factor_f.".".$___F,
$___ALIGNMENT_STEM.".".$factor_e.".".$___E,
$___EXTRACT_FILE.".".$factor,
$table_number++,
defined($EXTRACT_FOR_FACTOR{$factor_key}{"translation"}),
defined($EXTRACT_FOR_FACTOR{$factor_key}{"reordering"}));
}
}
}
sub get_max_phrase_length {
my ($table_number) = @_;
# single length? that's it then
if ($___MAX_PHRASE_LENGTH =~ /^\d+$/) {
return $___MAX_PHRASE_LENGTH;
}
my $max_length = 0;
my @max = split(/,/,$___MAX_PHRASE_LENGTH);
# maximum of specified lengths
if ($table_number == -1) {
foreach (@max) {
$max_length = $_ if $_ > $max_length;
}
return $max_length;
}
# look up length for table
$max_length = $max[0]; # fallback: first specified length
if ($#max >= $table_number) {
$max_length = $max[$table_number];
}
return $max_length;
}
sub get_extract_reordering_flags {
if ($___MAX_LEXICAL_REORDERING) {
return " --model wbe-mslr --model phrase-mslr --model hier-mslr";
}
return "" unless @REORDERING_MODELS;
my $config_string = "";
for my $type ( keys %REORDERING_MODEL_TYPES) {
$config_string .= " --model $type-".$REORDERING_MODEL_TYPES{$type};
}
return $config_string;
}
sub extract_phrase {
my ($alignment_file_f,$alignment_file_e,$extract_file,$table_number,$ttable_flag,$reordering_flag) = @_;
my $alignment_file_a = $___ALIGNMENT_FILE.".".$___ALIGNMENT;
# Make sure the corpus exists in unzipped form
my @tempfiles = ();
foreach my $f ($alignment_file_e, $alignment_file_f, $alignment_file_a) {
if (! -e $f && -e $f.".gz") {
safesystem("gunzip < $f.gz > $f") or die("Failed to gunzip corpus $f");
push @tempfiles, "$f.gz";
}
}
my $cmd;
my $suffix = (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) ? ".new" : "";
if ($_HIERARCHICAL)
{
my $max_length = &get_max_phrase_length($table_number);
$cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file$suffix";
$cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR;
$cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
$cmd .= " --UnknownWordSoftMatches $_UNKNOWN_WORD_SOFT_MATCHES_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
$cmd .= " --PCFG" if $_PCFG;
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
if (!defined($_GHKM)) {
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
$cmd .= " --MaxSpan $max_length";
}
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
else
{
if ( $_EPPEX ) {
# eppex sets max_phrase_length itself (as the maximum phrase length for which any Lossy Counter is defined)
$cmd = "$EPPEX $alignment_file_e $alignment_file_f $alignment_file_a $extract_file$suffix $_EPPEX";
}
else {
my $max_length = &get_max_phrase_length($table_number);
print "MAX $max_length $reordering_flag $table_number\n";
$max_length = &get_max_phrase_length(-1) if $reordering_flag;
$cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file$suffix $max_length";
}
if ($reordering_flag) {
$cmd .= " orientation";
$cmd .= get_extract_reordering_flags();
$cmd .= " --NoTTable" if !$ttable_flag;
}
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
$cmd .= " --GZOutput ";
$cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE;
$cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/;
$cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE;
map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
print STDERR "$cmd\n";
safesystem("$cmd") or die "ERROR: Phrase extraction failed (missing input files?)";
if (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) {
print STDERR "merging with baseline extract from $_BASELINE_EXTRACT\n";
safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | gzip > $extract_file.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | gzip > $extract_file.inv.gz");
safesystem("$ZCAT $_BASELINE_EXTRACT.o.gz $extract_file$suffix.o.gz | gzip > $extract_file.o.gz")
if -e "$extract_file$suffix.o.gz";
safesystem("rm $extract_file$suffix.gz");
safesystem("rm $extract_file$suffix.inv.gz");
safesystem("rm $extract_file$suffix.o.gz")
if -e "$extract_file$suffix.o.gz";
}
foreach my $f (@tempfiles) {
unlink $f;
}
}
### (6) PHRASE SCORING
sub score_phrase_factored {
print STDERR "(6) score phrases @ ".`date`;
my @SPECIFIED_TABLE = @_PHRASE_TABLE;
if ($___NOT_FACTORED) {
my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table");
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
&score_phrase($file,$___LEXICAL_FILE,$___EXTRACT_FILE);
}
else {
my $table_id = 0;
foreach my $factor (split(/\+/,$___TRANSLATION_FACTORS)) {
print STDERR "(6) [$factor] score phrases @ ".`date`;
my ($factor_f,$factor_e) = split(/\-/,$factor);
my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").".$factor";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
&score_phrase($file,$___LEXICAL_FILE.".".$factor,$___EXTRACT_FILE.".".$factor,$table_id);
$table_id++;
}
}
}
sub score_phrase {
my ($ttable_file,$lexical_file,$extract_file,$table_id) = @_;
if ($___PHRASE_SCORER eq "phrase-extract") {
&score_phrase_phrase_extract($ttable_file,$lexical_file,$extract_file,$table_id);
} elsif ($___PHRASE_SCORER eq "memscore") {
&score_phrase_memscore($ttable_file,$lexical_file,$extract_file);
} else {
die "ERROR: Unknown phrase scorer: ".$___PHRASE_SCORER;
}
}
sub score_phrase_phrase_extract {
my ($ttable_file,$lexical_file,$extract_file,$table_id) = @_;
# distinguish between score and consolidation options
my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
my $PHRASE_COUNT = (!defined($_SCORE_OPTIONS) || $_SCORE_OPTIONS !~ /NoPhraseCount/);
my $LOW_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/);
my ($SPARSE_COUNT_BIN,$COUNT_BIN,$DOMAIN) = ("","","");
$SPARSE_COUNT_BIN = $1 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SparseCountBinFeature ([\s\d]*\d)/;
$COUNT_BIN = $1 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /\-CountBinFeature ([\s\d]*\d)/;
$DOMAIN = $1 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /(\-+[a-z]*Domain[a-z]+ .+)/i;
$DOMAIN =~ s/ \-.+//g;
if ($DOMAIN =~ /^(.+) table ([\d\,]+) *$/) {
my ($main_spec,$specified_tables) = ($1,$2);
$DOMAIN = "--IgnoreSentenceId";
foreach my $specified_table_id (split(/,/,$specified_tables)) {
$DOMAIN = $main_spec if $specified_table_id == $table_id;
}
}
my $SINGLETON = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /Singleton/);
my $CROSSEDNONTERM = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /CrossedNonTerm/);
my $UNALIGNED_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/);
my ($UNALIGNED_FW_COUNT,$UNALIGNED_FW_F,$UNALIGNED_FW_E);
if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty +(\S+) +(\S+)/) {
$UNALIGNED_FW_COUNT = 1;
$UNALIGNED_FW_F = $1;
$UNALIGNED_FW_E = $2;
}
my $GOOD_TURING = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /GoodTuring/);
my $KNESER_NEY = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /KneserNey/);
my $LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LogProb/);
my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/);
my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/);
my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
my $SPAN_LENGTH = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SpanLength/);
my $CORE_SCORE_OPTIONS = "";
$CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
$CORE_SCORE_OPTIONS .= " --NegLogProb" if $NEG_LOG_PROB;
$CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
$CORE_SCORE_OPTIONS .= " --Singleton" if $SINGLETON;
$CORE_SCORE_OPTIONS .= " --CrossedNonTerm" if $CROSSEDNONTERM;
my $substep = 1;
my $isParent = 1;
my @children;
for my $direction ("f2e","e2f") {
if ($___NOFORK and @children > 0) {
waitpid((shift @children), 0);
$substep+=2;
}
my $pid = fork();
if ($pid == 0)
{
next if $___CONTINUE && -e "$ttable_file.half.$direction";
next if $___CONTINUE && $direction eq "e2f" && -e "$ttable_file.half.e2f.gz";
my $inverse = "";
my $extract_filename = $extract_file;
if ($direction eq "e2f") {
$inverse = "--Inverse";
$extract_filename = $extract_file.".inv";
}
my $extract = "$extract_filename.sorted.gz";
print STDERR "(6.".($substep++).") creating table half $ttable_file.half.$direction @ ".`date`;
my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction.gz $inverse";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --NoWordAlignment" if $_OMIT_WORD_ALIGNMENT;
$cmd .= " --KneserNey" if $KNESER_NEY;
$cmd .= " --GoodTuring" if $GOOD_TURING && $inverse eq "";
$cmd .= " --SpanLength" if $SPAN_LENGTH && $inverse eq "";
$cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
$cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
$cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
$cmd .= " --PCFG" if $_PCFG;
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
$cmd .= " $DOMAIN" if $DOMAIN;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
# sorting
if ($direction eq "e2f" || $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2) {
$cmd .= " 1 ";
}
else {
$cmd .= " 0 ";
}
print STDERR $cmd."\n";
safesystem($cmd) or die "ERROR: Scoring of phrases failed";
exit();
}
else
{ # parent
push(@children, $pid);
}
}
# wait for everything is finished
if ($isParent)
{
foreach (@children) {
waitpid($_, 0);
}
}
else
{
die "shouldn't be here";
}
# merging the two halves
print STDERR "(6.6) consolidating the two halves @ ".`date`;
return if $___CONTINUE && -e "$ttable_file.gz";
my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.gz /dev/stdout";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --LogProb" if $LOG_PROB;
$cmd .= " --NegLogProb" if $NEG_LOG_PROB;
$cmd .= " --OnlyDirect" if $ONLY_DIRECT;
$cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
$cmd .= " --LowCountFeature" if $LOW_COUNT;
$cmd .= " --CountBinFeature $COUNT_BIN" if $COUNT_BIN;
$cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN;
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
$cmd .= " | gzip -c > $ttable_file.gz";
safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
}
sub score_phrase_memscore {
my ($ttable_file,$lexical_file,$extract_file) = @_;
return if $___CONTINUE && -e "$ttable_file.gz";
my $options = $___MEMSCORE_OPTIONS;
$options =~ s/\$LEX_F2E/$lexical_file.f2e/g;
$options =~ s/\$LEX_E2F/$lexical_file.e2f/g;
# The output is sorted to avoid breaking scripts that rely on the
# sorting behaviour of the previous scoring algorithm.
my $cmd = "$MEMSCORE $options | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip >$ttable_file.gz";
if (-e "$extract_file.gz") {
$cmd = "$ZCAT $extract_file.gz | ".$cmd;
} else {
$cmd = $cmd." <".$extract_file;
}
print $cmd."\n";
safesystem($cmd) or die "ERROR: Scoring of phrases failed";
}
### (7) LEARN REORDERING MODEL
sub get_reordering_factored {
print STDERR "(7) learn reordering model @ ".`date`;
my @SPECIFIED_TABLE = @_REORDERING_TABLE;
if ($REORDERING_LEXICAL) {
if ($___NOT_FACTORED) {
print STDERR "(7.1) [no factors] learn reordering model @ ".`date`;
# foreach my $model (@REORDERING_MODELS) {
# #my $file = "$___MODEL_DIR/reordering-table.";
# $file .= $model->{"all"};
# #$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
# $model->{"file"} = $file;
# }
my $file = "$___MODEL_DIR/reordering-table";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$file .= ".";
&get_reordering($___EXTRACT_FILE,$file);
}
else {
foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
print STDERR "(7.1) [$factor] learn reordering model @ ".`date`;
my ($factor_f,$factor_e) = split(/\-/,$factor);
# foreach my $model (@REORDERING_MODELS) {
# my $file = "$___MODEL_DIR/reordering-table.$factor";
# $file .= $model->{"all"};
# $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
# $model->{"file"} = $file;
# }
my $file ="$___MODEL_DIR/reordering-table.$factor";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$file .= ".";
&get_reordering("$___EXTRACT_FILE.$factor",$file);
}
}
}
else {
print STDERR " ... skipping this step, reordering is not lexicalized ...\n";
}
}
sub get_reordering {
my ($extract_file,$reo_model_path) = @_;
my $smooth = $___REORDERING_SMOOTH;
print STDERR "(7.2) building tables @ ".`date`;
#create cmd string for lexical reordering scoring
my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path";
$cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/);
for my $mtype (keys %REORDERING_MODEL_TYPES) {
# * $mtype will be one of wbe, phrase, or hier
# * the value stored in $REORDERING_MODEL_TYPES{$mtype} is a concatenation of the "orient"
# attributes such as "msd"
# * the "filename" attribute is appended to the filename, but actually serves as the main configuration specification
# for reordering scoring. it holds a string such as "wbe-msd-didirectional-fe"
# which has the more general format type-orient-dir-lang
$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
foreach my $model (@REORDERING_MODELS) {
if ($model->{"type"} eq $mtype) {
$cmd .= " ".$model->{"filename"};
}
}
$cmd .= "\"";
}
#Call the lexical reordering scorer
safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
}
### (8) LEARN GENERATION MODEL
my $factor_e_source;
sub get_generation_factored {
print STDERR "(8) learn generation model @ ".`date`;
if (defined $___GENERATION_FACTORS) {
my @SPECIFIED_TABLE = @_GENERATION_TABLE;
my @TYPE = @_GENERATION_TYPE;
my $corpus = $___CORPUS.".".$___E.$___CORPUS_COMPRESSION;
$corpus = $_GENERATION_CORPUS if defined($_GENERATION_CORPUS);
foreach my $factor (split(/\+/,$___GENERATION_FACTORS)) {
my ($factor_e_source,$factor_e) = split(/\-/,$factor);
my $file = "$___MODEL_DIR/generation.$factor";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
my $type = "double";
$type = shift @TYPE if scalar @TYPE;
&get_generation($file,$type,$factor,$factor_e_source,$factor_e,$corpus);
}
}
else {
print STDERR " no generation model requested, skipping step\n";
}
}
sub get_generation {
my ($file,$type,$factor,$factor_e_source,$factor_e,$corpus) = @_;
print STDERR "(8) [$factor] generate generation table @ ".`date`;
$file = "$___MODEL_DIR/generation.$factor" unless $file;
my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);
my %INCLUDE_SOURCE;
foreach my $factor (split(/,/,$factor_e_source)) {
$INCLUDE_SOURCE{$factor} = 1;
}
my %INCLUDE;
foreach my $factor (split(/,/,$factor_e)) {
$INCLUDE{$factor} = 1;
}
my (%GENERATION,%GENERATION_TOTAL_SOURCE,%GENERATION_TOTAL_TARGET);
*E = open_or_zcat($corpus);
while(<E>) {
chomp;
foreach (split) {
my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
my ($source,$target);
my $first_factor = 1;
foreach my $factor (split(/,/,$factor_e_source)) {
$source .= $___FACTOR_DELIMITER unless $first_factor;
$first_factor = 0;
$source .= $FACTOR[$factor];
}
$first_factor = 1;
foreach my $factor (split(/,/,$factor_e)) {
$target .= $___FACTOR_DELIMITER unless $first_factor;
$first_factor = 0;
$target .= $FACTOR[$factor];
}
$GENERATION{$source}{$target}++;
$GENERATION_TOTAL_SOURCE{$source}++;
$GENERATION_TOTAL_TARGET{$target}++;
}
}
close(E);
open(GEN,">$file") or die "ERROR: Can't write $file";
foreach my $source (keys %GENERATION) {
foreach my $target (keys %{$GENERATION{$source}}) {
printf GEN ("%s %s %.7f ",$source,$target,
$GENERATION{$source}{$target}/$GENERATION_TOTAL_SOURCE{$source});
printf GEN (" %.7f",
$GENERATION{$source}{$target}/$GENERATION_TOTAL_TARGET{$target})
unless $type eq 'single';
print GEN "\n";
}
}
close(GEN);
safesystem("rm -f $file.gz") or die("ERROR");
safesystem("gzip $file") or die("ERROR");
}
### (9) CREATE CONFIGURATION FILE
sub create_ini {
print STDERR "(9) create moses.ini @ ".`date`;
&full_path(\$___MODEL_DIR);
&full_path(\$___VCB_E);
&full_path(\$___VCB_F);
`mkdir -p $___MODEL_DIR`;
open(INI,">$___CONFIG") or die("ERROR: Can't write $___CONFIG");
print INI "#########################
### MOSES CONFIG FILE ###
#########################
\n";
if (defined $___TRANSLATION_FACTORS) {
print INI "# input factors\n";
print INI "[input-factors]\n";
my $INPUT_FACTOR_MAX = 0;
foreach my $table (split /\+/, $___TRANSLATION_FACTORS) {
my ($factor_list, $output) = split /-+/, $table;
foreach (split(/,/,$factor_list)) {
$INPUT_FACTOR_MAX = $_ if $_>$INPUT_FACTOR_MAX;
}
}
$INPUT_FACTOR_MAX = $_INPUT_FACTOR_MAX if $_INPUT_FACTOR_MAX; # use specified, if exists
for (my $c = 0; $c <= $INPUT_FACTOR_MAX; $c++) { print INI "$c\n"; }
} else {
die "ERROR: No translation steps defined, cannot prepare [input-factors] section\n";
}
# mapping steps
my %stepsused;
print INI "\n# mapping steps
[mapping]\n";
my $path = 0;
my %FIRST_TTABLE;
foreach (split(/:/,$___DECODING_STEPS)) {
my $first_ttable_flag = 1;
foreach (split(/,/,$_)) {
s/t/T /g;
s/g/G /g;
my ($type, $num) = split /\s+/;
if ($first_ttable_flag && $type eq "T") {
$FIRST_TTABLE{$num}++;
$first_ttable_flag = 0;
}
$stepsused{$type} = $num+1 if !defined $stepsused{$type} || $stepsused{$type} < $num+1;
print INI $path." ".$_."\n";
}
$path++;
}
print INI "1 T 1\n" if $_GLUE_GRAMMAR;
print INI "1 T 1\n" if $_TRANSLITERATION_PHRASE_TABLE;
if (defined($_DECODING_GRAPH_BACKOFF)) {
$_DECODING_GRAPH_BACKOFF =~ s/\s+/ /g;
$_DECODING_GRAPH_BACKOFF =~ s/^ //;
print INI "\n[decoding-graph-backoff]\n";
foreach (split(/ /,$_DECODING_GRAPH_BACKOFF)) {
print INI "$_\n";
}
}
my $feature_spec = "";
my $weight_spec = "";
# translation tables
my $i=0;
my @SPECIFIED_TABLE = @_PHRASE_TABLE;
# number of weights
my $basic_weight_count = 4; # both directions, lex and phrase
$basic_weight_count-=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
$basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/; # word ins/del
$basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty/;
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
$basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /PhraseCount/; # phrase count feature
$basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /(\-CountBinFeature [\s\d]*\d)/) {
$basic_weight_count += scalar split(/\s+/,$1);
}
if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /\-+Domain([a-z]+) (\S+)/i) {
my ($method,$file) = ($1,$2);
my $count = `cut -d\\ -f 2 $file | sort | uniq | wc -l`;
$basic_weight_count += $count if $method eq "Indicator" || $method eq "Ratio";
$basic_weight_count += 2**$count-1 if $method eq "Subset";
}
$basic_weight_count++ if $_PCFG;
$basic_weight_count+=4 if $_FLEXIBILITY_SCORE;
$basic_weight_count+=2 if $_FLEXIBILITY_SCORE && $_HIERARCHICAL;
# go over each table
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
my ($input_factor,$output_factor) = split(/\-/,$f);
my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
my $phrase_table_impl = ($_HIERARCHICAL? 6 : 0);
# specified file name?
if (scalar(@SPECIFIED_TABLE)) {
$file = shift @SPECIFIED_TABLE;
my @toks = split(/:/,$file);
$file = $toks[0];
if (@toks > 1) {
$phrase_table_impl = $toks[1];
}
if (@toks == 3) {
$basic_weight_count = $toks[2];
}
}
# name of type
my $phrase_table_impl_name = "UnknownPtImplementation";
$phrase_table_impl_name = "PhraseDictionaryMemory" if $phrase_table_impl==0;
$phrase_table_impl_name = "PhraseDictionaryBinary" if $phrase_table_impl==1;
$phrase_table_impl_name = "PhraseDictionaryOnDisk" if $phrase_table_impl==2;
$phrase_table_impl_name = "PhraseDictionaryMemory" if $phrase_table_impl==6;
$phrase_table_impl_name = "PhraseDictionaryALSuffixArray" if $phrase_table_impl==10;
#table limit
my $table_limit = 0;
if ($i == 0) {
$table_limit = 20;
}
# sum up...
$feature_spec .= "$phrase_table_impl_name name=TranslationModel$i table-limit=$table_limit num-features=$basic_weight_count path=$file input-factor=$input_factor output-factor=$output_factor\n";
$weight_spec .= "TranslationModel$i=";
for(my $j=0;$j<$basic_weight_count;$j++) { $weight_spec .= " 0.2"; }
$weight_spec .= "\n";
$i++;
}
if ($i != $stepsused{"T"}) {
print STDERR "WARNING: Your [mapping-steps] require translation steps up to id $stepsused{T} but you defined translation steps 0..$i\n";
exit 1 if $i < $stepsused{"T"}; # fatal to define less
}
if ($_TRANSLITERATION_PHRASE_TABLE){
$feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i table-limit=100 num-features=4 path=$_TRANSLITERATION_PHRASE_TABLE input-factor=0 output-factor=0\n";
$weight_spec .= "TranslationModel$i= 0.2 0.2 0.2 0.2\n";
$i++;
}
# glue grammar
if ($_GLUE_GRAMMAR) {
&full_path(\$___GLUE_GRAMMAR_FILE);
$feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i num-features=1 path=$___GLUE_GRAMMAR_FILE input-factor=0 output-factor=0\n";
$weight_spec .= "TranslationModel$i= 1.0\n";
}
# generation model
if (defined $___GENERATION_FACTORS) {
my @TYPE = @_GENERATION_TYPE;
my $i=0;
my @SPECIFIED_TABLE = @_GENERATION_TABLE;
foreach my $f (split(/\+/,$___GENERATION_FACTORS)) {
my $weights_per_generation_model = 2;
$weights_per_generation_model = 1 if scalar(@TYPE) && (shift @TYPE) eq 'single';
my ($input_factor,$output_factor) = split(/\-/,$f);
my $file = "$___MODEL_DIR/generation.$f";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$file .= ".gz" if ! -e $file && -e $file.".gz";
$feature_spec .= "Generation name=GenerationModel$i num-features=$weights_per_generation_model path=$file input-factor=$input_factor output-factor=$output_factor\n";
$weight_spec .= "GenerationModel$i= 0.3".($weights_per_generation_model==2?" 0":"")."\n";
$i++;
}
if ($i != $stepsused{"G"}) {
print STDERR "WARNING: Your [mapping-steps] require generation steps up to id $stepsused{G} but you defined generation steps 0..$i\n";
exit 1 if $i < $stepsused{"G"}; # fatal to define less
}
}
# lexicalized reordering model
if ($___REORDERING ne "distance") {
my $i = 0;
my @SPECIFIED_TABLE = @_REORDERING_TABLE;
foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
my ($input_factor,$output_factor) = split(/\-/,$factor);
foreach my $model (@REORDERING_MODELS) {
my $table_file = "$___MODEL_DIR/reordering-table";
$table_file .= ".$factor" unless $___NOT_FACTORED;
$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$table_file .= ".";
$table_file .= $model->{"filename"};
$table_file .= ".gz";
$feature_spec .= "LexicalReordering name=LexicalReordering$i num-features=".$model->{"numfeatures"}." type=".$model->{"config"}." input-factor=$input_factor output-factor=$output_factor path=$table_file\n";
$weight_spec .= "LexicalReordering$i=";
for(my $j=0;$j<$model->{"numfeatures"};$j++) { $weight_spec .= " 0.3"; }
$weight_spec .= "\n";
$i++;
}
}
}
# operation sequence model
if($_OSM)
{
if (defined($_OSM_FACTORS))
{
my $count = 0;
my @factor_values = split(/\+/, $_OSM_FACTORS);
foreach my $factor_val (@factor_values) {
my ($factor_f,$factor_e) = split(/\-/,$factor_val);
if($count == 0){
$feature_spec .= "OpSequenceModel name=OpSequenceModel$count num-features=5 path=". $_OSM . $factor_val . "/operationLM.bin" . " input-factor=". $factor_f . " output-factor=". $factor_e . " support-features=yes \n";
$weight_spec .= "OpSequenceModel$count= 0.08 -0.02 0.02 -0.001 0.03\n";
}
else{
$feature_spec .= "OpSequenceModel name=OpSequenceModel$count num-features=1 path=". $_OSM . $factor_val . "/operationLM.bin" . " input-factor=". $factor_f . " output-factor=". $factor_e . " support-features=no \n";
$weight_spec .= "OpSequenceModel$count= 0.08 \n";
}
$count++;
}
}
else
{
$feature_spec .= "OpSequenceModel name=OpSequenceModel0 num-features=5 path=". $_OSM . " \n";
$weight_spec .= "OpSequenceModel0= 0.08 -0.02 0.02 -0.001 0.03\n";
}
}
# distance-based reordering
if (!$_HIERARCHICAL) {
$feature_spec .= "Distortion\n";
$weight_spec .= "Distortion0= 0.3\n";
}
# language model
$i=0;
foreach my $lm (@___LM) {
my ($f, $o, $fn, $type) = @{$lm};
if ($fn !~ /^\//) {
my $path = `pwd`; chop($path);
$fn = $path."/".$fn;
}
$type = 0 unless $type;
my $type_name = "UnknownLM";
$type_name = "SRILM" if $type == 0;
$type_name = "IRSTLM" if $type == 1;
$type_name = "KENLM lazyken=0" if $type == 8;
$type_name = "KENLM lazyken=1" if $type == 9;
my $lm_oov_prob = 0.1;
if ($_POST_DECODING_TRANSLIT || $_TRANSLITERATION_PHRASE_TABLE){
$lm_oov_prob = -100.0;
$_LMODEL_OOV_FEATURE = "yes";
}
$feature_spec .= "$type_name name=LM$i factor=$f path=$fn order=$o\n";
$weight_spec .= "LM$i= 0.5".($_LMODEL_OOV_FEATURE?" $lm_oov_prob":"")."\n";
$i++;
}
if ($_LMODEL_OOV_FEATURE) {
print INI "\n# language model OOV feature enabled\n[lmodel-oov-feature]\n1\n\n";
}
# hierarchical model settings
print INI "\n";
if ($_HIERARCHICAL) {
print INI "[unknown-lhs]\n$_UNKNOWN_WORD_LABEL_FILE\n\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
print INI "[cube-pruning-pop-limit]\n1000\n\n";
print INI "[non-terminals]\nX\n\n";
print INI "[search-algorithm]\n3\n\n";
print INI "[inputtype]\n3\n\n";
print INI "[max-chart-span]\n";
foreach (split(/\+/,$___TRANSLATION_FACTORS)) { print INI "20\n"; }
print INI "1000\n";
}
# phrase-based model settings
else {
print INI "[distortion-limit]\n6\n";
}
# only set the factor delimiter if it is non-standard
unless ($___FACTOR_DELIMITER eq '|') {
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
}
# lattice feature
if ($_NUM_LATTICE_FEATURES) {
$feature_spec .= "InputFeature num-input-features=$_NUM_LATTICE_FEATURES\n";
$weight_spec .= "InputFeature0=";
for (1..$_NUM_LATTICE_FEATURES) {
$weight_spec .= " 0.1";
}
$weight_spec .= "\n";
}
# get addititional content for config file from switch or file
if ($_ADDITIONAL_INI) {
print INI "\n# additional settings\n\n";
foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
}
if ($_ADDITIONAL_INI_FILE) {
print INI "\n# additional settings\n\n";
print INI `cat $_ADDITIONAL_INI_FILE`;
}
# feature functions and weights
print INI "\n# feature functions\n";
print INI "[feature]\n";
print INI "UnknownWordPenalty\n";
print INI "WordPenalty\n";
print INI "PhrasePenalty\n";
print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
print INI $feature_spec;
print INI "\n# dense weights for feature functions\n";
print INI "[weight]\n";
print INI "UnknownWordPenalty0= 1\n";
print INI "WordPenalty0= -1\n";
print INI "PhrasePenalty0= 0.2\n";
print INI $weight_spec;
close(INI);
}
sub full_path {
my ($PATH) = @_;
return if $$PATH =~ /^\//;
$$PATH = `pwd`."/".$$PATH;
$$PATH =~ s/[\r\n]//g;
$$PATH =~ s/\/\.\//\//g;
$$PATH =~ s/\/+/\//g;
my $sanity = 0;
while($$PATH =~ /\/\.\.\// && $sanity++<10) {
$$PATH =~ s/\/+/\//g;
$$PATH =~ s/\/[^\/]+\/\.\.\//\//g;
}
$$PATH =~ s/\/[^\/]+\/\.\.$//;
$$PATH =~ s/\/+$//;
}
sub safesystem {
print STDERR "Executing: @_\n";
system(@_);
if ($? == -1) {
print STDERR "ERROR: Failed to execute: @_\n $!\n";
exit(1);
}
elsif ($? & 127) {
printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without';
exit(1);
}
else {
my $exitcode = $? >> 8;
print STDERR "Exit code: $exitcode\n" if $exitcode;
return ! $exitcode;
}
}
sub open_or_zcat {
my $fn = shift;
my $read = $fn;
$fn = $fn.".gz" if ! -e $fn && -e $fn.".gz";
$fn = $fn.".bz2" if ! -e $fn && -e $fn.".bz2";
if ($fn =~ /\.bz2$/) {
$read = "$BZCAT $fn|";
} elsif ($fn =~ /\.gz$/) {
$read = "$ZCAT $fn|";
}
my $hdl;
open($hdl,$read) or die "Can't read $fn ($read)";
return $hdl;
}