minor bug fixes with MML

phikoehn 2012-12-09 20:31:20 +00:00
parent 2a3c9fc679
commit 466b502ae0
9 changed files with 33 additions and 20 deletions

View File

@@ -277,8 +277,8 @@ type = 8
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
-# (typically a million or so)
-#settings = "--line-count 1000000"
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
#################################################################
# TRANSLATION MODEL TRAINING
@@ -586,6 +586,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:newstest2011]
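All five example configuration files in this commit receive the same two edits: the suggested MML sampling size drops from a million lines to 100,000 (the new comment warns it should stay small as a fraction of the corpus), and a commented-out analyze-search-graph switch is documented just before the per-test-set EVALUATION sections. A minimal sketch of how the updated stanzas read in context; the [MML] and [EVALUATION] section names are assumptions inferred from the surrounding indomain/outdomain and report-segmentation settings:

[MML]
# settings: number of lines sampled from the corpora to train each language model on
# (if used at all, should be small as a percentage of corpus)
#settings = "--line-count 100000"

[EVALUATION]
# visualization of the search graph in tree-based models
#analyze-search-graph = yes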

View File

@@ -297,8 +297,8 @@ factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.p
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
-# (typically a million or so)
-#settings = "--line-count 1000000"
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
#################################################################
# TRANSLATION MODEL TRAINING
@@ -602,6 +602,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:newstest2011]

View File

@@ -277,8 +277,8 @@ type = 8
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
-# (typically a million or so)
-#settings = "--line-count 1000000"
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
#################################################################
# TRANSLATION MODEL TRAINING
@@ -581,6 +581,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:newstest2011]

View File

@@ -281,8 +281,8 @@ type = 8
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
-# (typically a million or so)
-#settings = "--line-count 1000000"
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
#################################################################
# TRANSLATION MODEL TRAINING
@@ -585,6 +585,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:newstest2011]

View File

@@ -261,8 +261,8 @@ type = 8
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
-# (typically a million or so)
-#settings = "--line-count 1000000"
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
#################################################################
# TRANSLATION MODEL TRAINING
@@ -562,6 +562,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:test]

View File

@@ -1,4 +1,4 @@
cluster: townhill seville hermes lion seville sannox lutzow frontend
multicore-8: tyr thor odin crom
multicore-16: saxnot vali vili freyja bragi hoenir
-multicore-24: syn hel skaol saga
+multicore-24: syn hel skaol saga buri loki sif magni

View File

@@ -360,18 +360,17 @@ split-indomain-target
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
train
-in: indomain-stem outdomain-stem
+in: indomain-stem outdomain-stem settings
out: model
-ignore-unless: AND settings indomain-stem
+ignore-unless: indomain-stem
default-name: mml/model
-template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
+template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer IN2
train-in-mono
-in: indomain-source indomain-target outdomain-stem
+in: indomain-source indomain-target outdomain-stem settings
out: model
-ignore-unless: settings
ignore-if: indomain-stem
default-name: mml/model
-template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
+template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer IN3
[TRAINING] single
consolidate
@@ -391,7 +390,7 @@ mml-score
ignore-unless: mml-before-wa mml-after-wa
rerun-on-change: mml-filter-corpora
default-name: training/corpus-mml-score
-template: $moses-script-dir/ems/support/mml-score.perl -model IN -corpus IN1 -domains IN2 -input-extension $input-extension -output-extension $output-extension -query $MML:lm-query -filter-domains $mml-filter-corpora > OUT
+template: $moses-script-dir/ems/support/mml-score.perl -model IN -corpus IN1 -domains IN2 -input-extension $input-extension -output-extension $output-extension -query $MML:lm-query -filter-domains "$mml-filter-corpora" > OUT
mml-filter-before-wa
in: corpus mml-scores domains
out: corpus-mml-prefilter
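Two things change in experiment.meta. First, settings moves from an ignore-unless condition to a declared input of the MML training steps: items on an in: line bind positionally to IN, IN1, IN2, ... in the step's template, so the value now arrives as the last positional argument (IN2 for train, IN3 for train-in-mono) and the step re-runs whenever it changes. Second, $mml-filter-corpora is quoted in the mml-score template so that a multi-word corpus list reaches the script as a single -filter-domains argument instead of being split by the shell. A minimal sketch of the positional convention, with step and script names invented for illustration:

example-step
in: corpus settings
out: model
default-name: mml/example
template: example-script.perl -corpus IN -model OUT IN1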

View File

@@ -58,7 +58,6 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
open(INI,">$outfile_prefix.ini");
print INI $ini;
print INI "\n[report-sparse-features]\n$report\n";
-print INI "\n[use-alignment-info]\ntrue\n\n";
close(INI);
sub create_top_words {
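With the hard-coded [use-alignment-info] block gone, this helper now emits only the base configuration plus the sparse-feature report list. A sketch of the surviving write, with the variable roles as read from the visible context (both are assembled earlier in the script):

open(INI,">$outfile_prefix.ini");                   # generated decoder config
print INI $ini;                                     # base moses.ini contents
print INI "\n[report-sparse-features]\n$report\n";  # sparse features to report
close(INI);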

View File

@@ -72,7 +72,7 @@ sub train_lm {
$file = "$model.$type.tok";
}
-my $cmd = "$lm_training -order $order $lm_settings -text $model.$type.tok -lm $model.$type.lm";
+my $cmd = "$lm_training -order $order $lm_settings -text $file -lm $model.$type.lm";
$cmd .= " -vocab $model.$vocab.vocab" if defined($vocab);
print STDERR $cmd."\n";
print STDERR `$cmd`;
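The final hunk is the actual bug: train_lm selects $file earlier (either the corpus as-is or a tokenized copy named $model.$type.tok), but the training command ignored that choice and always pointed -text at the .tok name, which need not exist. The fix passes the selected $file through. A condensed sketch of the repaired flow; the branch condition is hypothetical since only its closing brace is visible in the diff, and $model, $type, $lm_training, $order, $lm_settings, and $vocab are assumed to be set by the surrounding script:

sub train_lm {
    my $file = "$model.$type";            # default: train on the corpus as-is
    if ($needs_tokenization) {            # hypothetical flag standing in for the real check
        # ... write tokenized copy ...
        $file = "$model.$type.tok";
    }
    # the fix: use the selected $file rather than hard-coding the .tok name
    my $cmd = "$lm_training -order $order $lm_settings -text $file -lm $model.$type.lm";
    $cmd .= " -vocab $model.$vocab.vocab" if defined($vocab);
    print STDERR $cmd."\n";
    print STDERR `$cmd`;
}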