EMS: more flexible way to concatenate LM training data.

The implementation allows the user to specify which corpora to combine,
and to have multiple LMs on the same data.
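
A minimal sketch of the intended usage (illustration only, not part of the commit; the set names europarl/nc and the corpus paths are placeholders, and the brace notation is the one parsed by the experiment.perl change below):

[LM:europarl]
raw-corpus = $data-dir/europarl.$output-extension

[LM:nc]
raw-corpus = $data-dir/nc.$output-extension

[LM:combined]
# feed this set the preprocessed output of the two sets above
concatenate-files = [LM:{europarl,nc}:stripped-corpus]
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
order = 5
# optional new per-set switch (any true value): keep this LM
# out of INTERPOLATED-LM
exclude-from-interpolation = true

A second LM over the same data (say, with a different order) can point link-file at a single set's stripped-corpus instead of re-running the preprocessing pipeline.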
Rico Sennrich 2015-05-20 17:16:41 +01:00
parent 1dfbbde846
commit 6aac7ded9a
7 changed files with 54 additions and 224 deletions


@@ -285,28 +285,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -305,28 +305,6 @@ factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.p
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -288,28 +288,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -292,28 +292,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -269,28 +269,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -177,7 +177,7 @@ tokenize
     out: tokenized-corpus
     default-name: lm/tok
     pass-unless: output-tokenizer
-    ignore-if: parallel-corpus-stem
+    ignore-if: parallel-corpus-stem concatenate-files link-file concatenate-files-split link-file-split
     template: $output-tokenizer < IN > OUT
     parallelizable: yes
 mock-parse
@@ -185,12 +185,14 @@ mock-parse
     out: mock-parsed-corpus
     default-name: lm/mock-parsed
     pass-unless: mock-output-parser-lm
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     template: $mock-output-parser-lm < IN > OUT
 factorize
     in: mock-parsed-corpus
     out: factorized-corpus
     default-name: lm/factored
-    pass-unless: factors
+    pass-unless: factors
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     parallelizable: yes
     error: can't open
     error: incompatible number of words in factor
@@ -199,7 +201,7 @@ lowercase
     out: lowercased-corpus
     default-name: lm/lowercased
     pass-unless: output-lowercaser
-    ignore-if: output-truecaser
+    ignore-if: output-truecaser concatenate-files link-file concatenate-files-split link-file-split
     #only-factor-0: yes
     template: $output-lowercaser < IN > OUT
     parallelizable: yes
@@ -209,6 +211,7 @@ truecase
     rerun-on-change: output-truecaser
     default-name: lm/truecased
     ignore-unless: output-truecaser
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     only-factor-0: yes
     template: $output-truecaser -model IN1.$output-extension < IN > OUT
     parallelizable: yes
@@ -218,13 +221,39 @@ split
     rerun-on-change: output-splitter
     default-name: lm/split
     pass-unless: output-splitter
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     template: $output-splitter -model IN1.$output-extension < IN > OUT
 strip
     in: split-corpus
     out: stripped-corpus
     default-name: lm/stripped
     pass-unless: mock-output-parser-lm
+    ignore-if: concatenate-files link-file
     template: $moses-script-dir/training/strip-xml.perl < IN > OUT
+concatenate-split
+    in: concatenate-files-split
+    out: split-corpus
+    ignore-unless: concatenate-files-split
+    default-name: lm/split
+    template: cat IN > OUT
+concatenate
+    in: concatenate-files
+    out: stripped-corpus
+    ignore-unless: concatenate-files
+    default-name: lm/stripped
+    template: cat IN > OUT
+link-split
+    in: link-file-split
+    out: split-corpus
+    default-name: lm/split
+    ignore-unless: link-file-split
+    template: ln -s IN OUT
+link
+    in: link-file
+    out: stripped-corpus
+    default-name: lm/stripped
+    ignore-unless: link-file
+    template: ln -s IN OUT
 train
     in: stripped-corpus
     out: lm
@@ -250,7 +279,7 @@ train-custom-syntax
     out: binlm
     default-name: lm/custom-lm
     rerun-on-change: custom-training
-    ignore-unless: AND custom-training config-feature-line config-weight-line syntactic
+    ignore-unless: AND custom-training config-feature-line config-weight-line syntactic mock-output-parser-lm
     template: $custom-training -text IN -lm OUT
     final-model: yes
 randomize
@@ -455,75 +484,6 @@ train-in-mono
     ignore-if: indomain-stem
     default-name: mml/model
     template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
-[CONCATENATED-LM] single
-concatenate
-    in: LM:stripped-corpus
-    out: concatenated-corpus
-    default-name: lm/concatenated
-    template: cat IN > OUT
-concatenate-mock-parsed
-    in: LM:split-corpus
-    out: concatenated-split-corpus
-    pass-unless: LM:mock-output-parser-lm
-    default-name: lm/concatenated-mock-parsed
-    template: cat IN > OUT
-train
-    in: concatenated-corpus
-    out: lm
-    default-name: lm/concatenated-lm
-    rerun-on-change: lm-training order settings
-    template: $lm-training -order $order $settings -text IN -lm OUT
-    ignore-unless: lm-training
-    ignore-if: rlm-training custom-training
-    error: cannot execute binary file
-    error: unrecognised option
-    not-error: BadDiscountException
-    not-error: To override this error
-train-custom
-    in: concatenated-corpus
-    out: binlm
-    default-name: lm/concatenated-custom-lm
-    rerun-on-change: custom-training
-    ignore-unless: AND custom-training config-feature-line config-weight-line
-    ignore-if: syntactic
-    template: $custom-training -text IN -lm OUT
-    final-model: yes
-train-custom-syntax
-    in: concatenated-split-corpus
-    out: binlm
-    default-name: lm/concatenated-custom-lm
-    rerun-on-change: custom-training
-    ignore-unless: AND custom-training config-feature-line config-weight-line syntactic
-    template: $custom-training -text IN -lm OUT
-    final-model: yes
-randomize
-    in: lm
-    out: rlm
-    default-name: lm/concatenated-rlm
-    pass-unless: lm-randomizer
-    ignore-if: rlm-training
-train-randomized
-    in: concatenated-corpus
-    out: rlm
-    default-name: lm/concatenated-rlm
-    ignore-unless: rlm-training
-    rerun-on-change: rlm-training order
-quantize
-    in: rlm
-    out: qlm
-    pass-unless: lm-quantizer
-    default-name: lm/concatenated-qlm
-    template: $lm-quantizer IN OUT
-binarize
-    in: qlm
-    out: binlm
-    pass-unless: lm-binarizer
-    ignore-unless: lm-training rlm-training
-    rerun-on-change: lm
-    default-name: lm/concatenated-binlm
-    template: $lm-binarizer IN OUT
-    error: set KENLM_MAX_ORDER to at least this value
-    final-model: yes
 [TRAINING] single
 consolidate
     in: CORPUS:clean-split-stem
@@ -783,7 +743,7 @@ build-sparse
     default-name: model/sparse-features
     template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features"
 create-config
-    in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model CONCATENATED-LM:binlm INTERPOLATED-LM:binlm LM:binlm
+    in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
     out: config
     ignore-if: use-hiero thot
     rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature


@@ -619,10 +619,24 @@ sub find_steps_for_module
     foreach my $in (@IN) {
         print "\t\tneeds input $in: " if $VERBOSE;
         if(defined($CONFIG{$in}) && $CONFIG{$in}[0] =~ /^\[(.+)\]$/) {
-            $in = $1;
-            print $in if $VERBOSE;
-            push @{$NEEDED{$in}}, $#DO_STEP;
-            print "\n\t\tcross-directed to $in\n" if $VERBOSE;
+            # multiple input, explicitly defined (example: LM:{europarl,nc}:lm )
+            if ($CONFIG{$in}[0] =~ /^\[([^:]+):{(\S+)}:(\S+)\]$/) {
+                my @SETS = split(',', $2);
+                foreach my $set (@SETS) {
+                    $in = &construct_name($1,$set,$3);
+                    print $in if $VERBOSE;
+                    push @{$NEEDED{$in}}, $#DO_STEP;
+                    push @{$USES_INPUT{$#DO_STEP}},$in;
+                    print "\n\t\tcross-directed to $in\n" if $VERBOSE;
+                }
+                $in = "";
+            }
+            else {
+                $in = $1;
+                print $in if $VERBOSE;
+                push @{$NEEDED{$in}}, $#DO_STEP;
+                print "\n\t\tcross-directed to $in\n" if $VERBOSE;
+            }
         }
         elsif(defined($CONFIG{$in})) {
             print "\n\t\t... but that is specified\n" if $VERBOSE;
@@ -2499,7 +2513,7 @@ sub get_config_tables
 
 sub define_training_create_config {
     my ($step_id) = @_;
-    my ($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm,$concat_lm, @LM)
+    my ($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
         = &get_output_and_input($step_id);
 
     my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);
@@ -2572,34 +2586,6 @@ sub define_training_create_config
 
     my $feature_lines = "";
     my $weight_lines = "";
-    if ($concat_lm) {
-        if (&get("CONCATENATED-LM:config-feature-line") && &get("CONCATENATED-LM:config-weight-line")) {
-            $feature_lines .= &get("CONCATENATED-LM:config-feature-line") . ";";
-            $weight_lines .= &get("CONCATENATED-LM:config-weight-line") . ";";
-        }
-        else {
-            my $type = 0;
-            my $order = &check_backoff_and_get("CONCATENATED-LM:order");
-
-            # binarizing the lm?
-            $type = 1 if (&get("CONCATENATED-LM:binlm") ||
-                &backoff_and_get("CONCATENATED-LM:lm-binarizer"));
-
-            # randomizing the lm?
-            $type = 5 if (&get("CONCATENATED-LM:rlm") ||
-                &backoff_and_get("CONCATENATED-LM:lm-randomizer"));
-
-            # manually set type
-            $type = &get("CONCATENATED-LM:type") if &get("CONCATENATED-LM:type");
-            # which factor is the model trained on?
-            my $factor = 0;
-            if (&backoff_and_get("TRAINING:output-factors") &&
-                &backoff_and_get("CONCATENATED-LM:factors")) {
-                $factor = $OUTPUT_FACTORS{&backoff_and_get("CONCATENATED-LM:factors")};
-            }
-            $cmd .= "-lm $factor:$order:$concat_lm:$type ";
-        }
-    }
 
     die("ERROR: number of defined LM sets (".(scalar @LM_SETS).":".join(",",@LM_SETS).") and LM files (".(scalar @LM).":".join(",",@LM).") does not match")
         unless scalar @LM == scalar @LM_SETS;
@@ -2629,13 +2615,6 @@ sub define_training_create_config
 
     # manually set type
     $type = &backoff_and_get("LM:$set:type") if (&backoff_and_get("LM:$set:type"));
-
-    # binarized by INTERPOLATED-LM
-    if (&get("INTERPOLATED-LM:lm-binarizer")) {
-        $lm_file =~ s/\.lm/\.binlm/;
-        $type = 1;
-        $type = &get("INTERPOLATED-LM:type") if &get("INTERPOLATED-LM:type");
-    }
     # which factor is the model trained on?
     my $factor = 0;
     if (&backoff_and_get("TRAINING:output-factors") &&
@@ -2813,6 +2792,7 @@ sub get_interpolated_lm_sets
     my $count=0;
     my $icount=0;
     foreach my $set (@LM_SETS) {
+        next if (&get("LM:$set:exclude-from-interpolation"));
         my $order = &check_backoff_and_get("LM:$set:order");
         my $factor = 0;
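
As a standalone illustration of how the new multi-set notation expands into individual step dependencies (the pattern follows the find_steps_for_module hunk above, with the braces escaped to keep modern perls quiet; this construct_name is a stand-in assumed to join its arguments with colons, matching step names such as LM:europarl:lm used elsewhere in EMS):

#!/usr/bin/perl
use strict;
use warnings;

# stand-in for EMS's &construct_name: module + set + output name
sub construct_name {
    my ($module, $set, $name) = @_;
    return "$module:$set:$name";
}

# a config value such as: concatenate-files = [LM:{europarl,nc}:stripped-corpus]
my $value = "[LM:{europarl,nc}:stripped-corpus]";

if ($value =~ /^\[([^:]+):\{(\S+)\}:(\S+)\]$/) {
    my ($module, $sets, $name) = ($1, $2, $3);
    foreach my $set (split(',', $sets)) {
        # each listed set becomes its own cross-directed dependency
        print construct_name($module, $set, $name), "\n";
    }
}
# prints:
#   LM:europarl:stripped-corpus
#   LM:nc:stripped-corpus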