mirror of https://github.com/moses-smt/mosesdecoder.git
EMS: a more flexible way to concatenate LM training data.

The implementation allows the user to specify which corpora to combine, and to have multiple LMs on the same data.

parent 1dfbbde846
commit 6aac7ded9a
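A sketch of the usage this enables, under stated assumptions: the section name joint and the choice of the stripped-corpus output are hypothetical illustrations; the concatenate-files setting and the [LM:{set1,set2}:output] multi-corpus reference syntax are the ones introduced by the diff below.

[LM:joint]
# hypothetical section: train one LM on the concatenation of the
# preprocessed corpora of the europarl and nc LM sections
concatenate-files = [LM:{europarl,nc}:stripped-corpus]
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
order = 5

Since europarl and nc remain ordinary [LM:...] sections of their own, the same corpora can feed several language models at once.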
@@ -285,28 +285,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING
 
@@ -305,28 +305,6 @@ factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.perl"
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING
 
@@ -288,28 +288,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING
 
@@ -292,28 +292,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING
 
@@ -269,28 +269,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING
 
@@ -177,7 +177,7 @@ tokenize
     out: tokenized-corpus
     default-name: lm/tok
     pass-unless: output-tokenizer
-    ignore-if: parallel-corpus-stem
+    ignore-if: parallel-corpus-stem concatenate-files link-file concatenate-files-split link-file-split
     template: $output-tokenizer < IN > OUT
     parallelizable: yes
 mock-parse
@@ -185,12 +185,14 @@ mock-parse
     out: mock-parsed-corpus
     default-name: lm/mock-parsed
     pass-unless: mock-output-parser-lm
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     template: $mock-output-parser-lm < IN > OUT
 factorize
     in: mock-parsed-corpus
     out: factorized-corpus
     default-name: lm/factored
     pass-unless: factors
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     parallelizable: yes
     error: can't open
     error: incompatible number of words in factor
@@ -199,7 +201,7 @@ lowercase
     out: lowercased-corpus
     default-name: lm/lowercased
     pass-unless: output-lowercaser
-    ignore-if: output-truecaser
+    ignore-if: output-truecaser concatenate-files link-file concatenate-files-split link-file-split
     #only-factor-0: yes
     template: $output-lowercaser < IN > OUT
     parallelizable: yes
@@ -209,6 +211,7 @@ truecase
     rerun-on-change: output-truecaser
     default-name: lm/truecased
     ignore-unless: output-truecaser
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
    only-factor-0: yes
     template: $output-truecaser -model IN1.$output-extension < IN > OUT
     parallelizable: yes
@@ -218,13 +221,39 @@ split
     rerun-on-change: output-splitter
     default-name: lm/split
     pass-unless: output-splitter
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     template: $output-splitter -model IN1.$output-extension < IN > OUT
 strip
     in: split-corpus
     out: stripped-corpus
     default-name: lm/stripped
     pass-unless: mock-output-parser-lm
+    ignore-if: concatenate-files link-file
     template: $moses-script-dir/training/strip-xml.perl < IN > OUT
+concatenate-split
+    in: concatenate-files-split
+    out: split-corpus
+    ignore-unless: concatenate-files-split
+    default-name: lm/split
+    template: cat IN > OUT
+concatenate
+    in: concatenate-files
+    out: stripped-corpus
+    ignore-unless: concatenate-files
+    default-name: lm/stripped
+    template: cat IN > OUT
+link-split
+    in: link-file-split
+    out: split-corpus
+    default-name: lm/split
+    ignore-unless: link-file-split
+    template: ln -s IN OUT
+link
+    in: link-file
+    out: stripped-corpus
+    default-name: lm/stripped
+    ignore-unless: link-file
+    template: ln -s IN OUT
 train
     in: stripped-corpus
     out: lm
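The four steps added above give an LM section shortcut entry points into the pipeline: as the new ignore-if conditions show, every preprocessing step (tokenize, mock-parse, factorize, lowercase, truecase, split) is skipped when one of the trigger settings is present, and the training corpus is instead produced directly, either by concatenating the listed files (cat) or by symlinking a single already-prepared file (ln -s). The plain variants produce stripped-corpus; the -split variants produce the split-corpus used for mock-parsed syntactic LMs. A minimal sketch of the link-file case, with a hypothetical section name and path:

[LM:prepared]
# reuse an externally preprocessed corpus as-is; the pipeline
# symlinks it into place instead of re-running tokenization etc.
link-file = /data/mono/news.prepared.en
lm-training = $srilm-dir/ngram-count
order = 5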
@@ -250,7 +279,7 @@ train-custom-syntax
     out: binlm
     default-name: lm/custom-lm
     rerun-on-change: custom-training
-    ignore-unless: AND custom-training config-feature-line config-weight-line syntactic
+    ignore-unless: AND custom-training config-feature-line config-weight-line syntactic mock-output-parser-lm
     template: $custom-training -text IN -lm OUT
     final-model: yes
 randomize
@@ -455,75 +484,6 @@ train-in-mono
     ignore-if: indomain-stem
     default-name: mml/model
     template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
-[CONCATENATED-LM] single
-concatenate
-    in: LM:stripped-corpus
-    out: concatenated-corpus
-    default-name: lm/concatenated
-    template: cat IN > OUT
-concatenate-mock-parsed
-    in: LM:split-corpus
-    out: concatenated-split-corpus
-    pass-unless: LM:mock-output-parser-lm
-    default-name: lm/concatenated-mock-parsed
-    template: cat IN > OUT
-train
-    in: concatenated-corpus
-    out: lm
-    default-name: lm/concatenated-lm
-    rerun-on-change: lm-training order settings
-    template: $lm-training -order $order $settings -text IN -lm OUT
-    ignore-unless: lm-training
-    ignore-if: rlm-training custom-training
-    error: cannot execute binary file
-    error: unrecognised option
-    not-error: BadDiscountException
-    not-error: To override this error
-train-custom
-    in: concatenated-corpus
-    out: binlm
-    default-name: lm/concatenated-custom-lm
-    rerun-on-change: custom-training
-    ignore-unless: AND custom-training config-feature-line config-weight-line
-    ignore-if: syntactic
-    template: $custom-training -text IN -lm OUT
-    final-model: yes
-train-custom-syntax
-    in: concatenated-split-corpus
-    out: binlm
-    default-name: lm/concatenated-custom-lm
-    rerun-on-change: custom-training
-    ignore-unless: AND custom-training config-feature-line config-weight-line syntactic
-    template: $custom-training -text IN -lm OUT
-    final-model: yes
-randomize
-    in: lm
-    out: rlm
-    default-name: lm/concatenated-rlm
-    pass-unless: lm-randomizer
-    ignore-if: rlm-training
-train-randomized
-    in: concatenated-corpus
-    out: rlm
-    default-name: lm/concatenated-rlm
-    ignore-unless: rlm-training
-    rerun-on-change: rlm-training order
-quantize
-    in: rlm
-    out: qlm
-    pass-unless: lm-quantizer
-    default-name: lm/concatenated-qlm
-    template: $lm-quantizer IN OUT
-binarize
-    in: qlm
-    out: binlm
-    pass-unless: lm-binarizer
-    ignore-unless: lm-training rlm-training
-    rerun-on-change: lm
-    default-name: lm/concatenated-binlm
-    template: $lm-binarizer IN OUT
-    error: set KENLM_MAX_ORDER to at least this value
-    final-model: yes
 [TRAINING] single
 consolidate
     in: CORPUS:clean-split-stem
@@ -783,7 +743,7 @@ build-sparse
     default-name: model/sparse-features
     template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features"
 create-config
-    in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model CONCATENATED-LM:binlm INTERPOLATED-LM:binlm LM:binlm
+    in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
     out: config
     ignore-if: use-hiero thot
     rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature
@@ -619,10 +619,24 @@ sub find_steps_for_module {
     foreach my $in (@IN) {
         print "\t\tneeds input $in: " if $VERBOSE;
         if(defined($CONFIG{$in}) && $CONFIG{$in}[0] =~ /^\[(.+)\]$/) {
-            $in = $1;
-            print $in if $VERBOSE;
-            push @{$NEEDED{$in}}, $#DO_STEP;
-            print "\n\t\tcross-directed to $in\n" if $VERBOSE;
+            # multiple input, explicitly defined (example: LM:{europarl,nc}:lm )
+            if ($CONFIG{$in}[0] =~ /^\[([^:]+):{(\S+)}:(\S+)\]$/) {
+                my @SETS = split(',', $2);
+                foreach my $set (@SETS) {
+                    $in = &construct_name($1,$set,$3);
+                    print $in if $VERBOSE;
+                    push @{$NEEDED{$in}}, $#DO_STEP;
+                    push @{$USES_INPUT{$#DO_STEP}},$in;
+                    print "\n\t\tcross-directed to $in\n" if $VERBOSE;
+                }
+                $in = "";
+            }
+            else {
+                $in = $1;
+                print $in if $VERBOSE;
+                push @{$NEEDED{$in}}, $#DO_STEP;
+                print "\n\t\tcross-directed to $in\n" if $VERBOSE;
+            }
         }
         elsif(defined($CONFIG{$in})) {
             print "\n\t\t... but that is specified\n" if $VERBOSE;
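A minimal standalone sketch of what the new branch does. construct_name below is a stand-in for the experiment.perl helper of the same name, and the braces in the pattern are escaped here so the snippet also runs warning-free on current Perls; otherwise the regex is the one from the patch.

#!/usr/bin/env perl
use strict;
use warnings;

# stand-in for experiment.perl's &construct_name: MODULE:SET:NAME
sub construct_name {
    my ($module, $set, $name) = @_;
    return "$module:$set:$name";
}

# a config value that references several corpora of the LM module at once
my $value = "[LM:{europarl,nc}:lm]";

if ($value =~ /^\[([^:]+):\{(\S+)\}:(\S+)\]$/) {
    my ($module, $sets, $output) = ($1, $2, $3);
    # one cross-module dependency per listed corpus
    foreach my $set (split(',', $sets)) {
        print construct_name($module, $set, $output), "\n";
    }
}

Running it prints LM:europarl:lm and LM:nc:lm, i.e. one dependency per corpus named inside the braces.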
@@ -2499,7 +2513,7 @@ sub get_config_tables {
 sub define_training_create_config {
     my ($step_id) = @_;
 
-    my ($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm,$concat_lm, @LM)
+    my ($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
         = &get_output_and_input($step_id);
 
     my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);
@@ -2572,34 +2586,6 @@ sub define_training_create_config {
     my $feature_lines = "";
     my $weight_lines = "";
 
-    if ($concat_lm) {
-        if (&get("CONCATENATED-LM:config-feature-line") && &get("CONCATENATED-LM:config-weight-line")) {
-            $feature_lines .= &get("CONCATENATED-LM:config-feature-line") . ";";
-            $weight_lines .= &get("CONCATENATED-LM:config-weight-line") . ";";
-        }
-        else {
-            my $type = 0;
-            my $order = &check_backoff_and_get("CONCATENATED-LM:order");
-            # binarizing the lm?
-            $type = 1 if (&get("CONCATENATED-LM:binlm") ||
-                &backoff_and_get("CONCATENATED-LM:lm-binarizer"));
-            # randomizing the lm?
-            $type = 5 if (&get("CONCATENATED-LM:rlm") ||
-                &backoff_and_get("CONCATENATED-LM:lm-randomizer"));
-
-            # manually set type
-            $type = &get("CONCATENATED-LM:type") if &get("CONCATENATED-LM:type");
-
-            # which factor is the model trained on?
-            my $factor = 0;
-            if (&backoff_and_get("TRAINING:output-factors") &&
-                &backoff_and_get("CONCATENATED-LM:factors")) {
-                $factor = $OUTPUT_FACTORS{&backoff_and_get("CONCATENATED-LM:factors")};
-            }
-
-            $cmd .= "-lm $factor:$order:$concat_lm:$type ";
-        }
-    }
 
     die("ERROR: number of defined LM sets (".(scalar @LM_SETS).":".join(",",@LM_SETS).") and LM files (".(scalar @LM).":".join(",",@LM).") does not match")
         unless scalar @LM == scalar @LM_SETS;
@@ -2629,13 +2615,6 @@ sub define_training_create_config {
     # manually set type
     $type = &backoff_and_get("LM:$set:type") if (&backoff_and_get("LM:$set:type"));
 
-    # binarized by INTERPOLATED-LM
-    if (&get("INTERPOLATED-LM:lm-binarizer")) {
-        $lm_file =~ s/\.lm/\.binlm/;
-        $type = 1;
-        $type = &get("INTERPOLATED-LM:type") if &get("INTERPOLATED-LM:type");
-    }
-
     # which factor is the model trained on?
     my $factor = 0;
     if (&backoff_and_get("TRAINING:output-factors") &&
@@ -2813,6 +2792,7 @@ sub get_interpolated_lm_sets {
     my $count=0;
     my $icount=0;
     foreach my $set (@LM_SETS) {
+        next if (&get("LM:$set:exclude-from-interpolation"));
         my $order = &check_backoff_and_get("LM:$set:order");
 
         my $factor = 0;
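The added guard reads a new per-LM setting, exclude-from-interpolation, straight from the configuration: any true value keeps that LM's data out of the interpolated model while the LM is still trained and used as a feature of its own. A minimal sketch, with a hypothetical section name:

[LM:web-crawl]
# build this LM and use it as a separate feature, but keep its
# data out of the interpolated LM (any true value works)
exclude-from-interpolation = yes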