EMS: more flexible way to concatenate LM training data.

The implementation allows the user to specify which corpora to combine,
and to have multiple LMs on the same data.
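
A minimal sketch of the intended usage (illustration only, not part of the commit; the set names europarl/nc and the corpus paths are placeholders, and the brace notation is the one parsed by the experiment.perl change below):

[LM:europarl]
raw-corpus = $data-dir/europarl.$output-extension

[LM:nc]
raw-corpus = $data-dir/nc.$output-extension

[LM:combined]
# feed this set the preprocessed output of the two sets above
concatenate-files = [LM:{europarl,nc}:stripped-corpus]
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
order = 5
# optional new per-set switch (any true value): keep this LM
# out of INTERPOLATED-LM
exclude-from-interpolation = true

A second LM over the same data (say, with a different order) can point link-file at a single set's stripped-corpus instead of re-running the preprocessing pipeline.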
Rico Sennrich 2015-05-20 17:16:41 +01:00
parent 1dfbbde846
commit 6aac7ded9a
7 changed files with 54 additions and 224 deletions


@@ -285,28 +285,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -305,28 +305,6 @@ factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.p
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -288,28 +288,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -292,28 +292,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -269,28 +269,6 @@ type = 8
 # (if used at all, should be small as a percentage of corpus)
 #settings = "--line-count 100000"
 
-#################################################################
-# TRAIN LANGUAGE MODELS ON CONCATENATION OF DATA
-
-[CONCATENATED-LM] IGNORE
-
-### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
-#
-#lm-training = $srilm-dir/ngram-count
-#settings = "-interpolate -kndiscount -unk"
-#order = 5
-
-### script to create quantized language model format
-# (default: no quantization)
-#
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
-
-### script to use for binary table format for irstlm or kenlm
-# kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/bin/build_binary
-#type = 8
-
 #################################################################
 # TRANSLATION MODEL TRAINING


@@ -177,7 +177,7 @@ tokenize
     out: tokenized-corpus
     default-name: lm/tok
     pass-unless: output-tokenizer
-    ignore-if: parallel-corpus-stem
+    ignore-if: parallel-corpus-stem concatenate-files link-file concatenate-files-split link-file-split
     template: $output-tokenizer < IN > OUT
     parallelizable: yes
 mock-parse
@@ -185,12 +185,14 @@ mock-parse
     out: mock-parsed-corpus
     default-name: lm/mock-parsed
     pass-unless: mock-output-parser-lm
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     template: $mock-output-parser-lm < IN > OUT
 factorize
     in: mock-parsed-corpus
     out: factorized-corpus
     default-name: lm/factored
-    pass-unless: factors
+    pass-unless: factors
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     parallelizable: yes
     error: can't open
     error: incompatible number of words in factor
@@ -199,7 +201,7 @@ lowercase
     out: lowercased-corpus
     default-name: lm/lowercased
     pass-unless: output-lowercaser
-    ignore-if: output-truecaser
+    ignore-if: output-truecaser concatenate-files link-file concatenate-files-split link-file-split
     #only-factor-0: yes
     template: $output-lowercaser < IN > OUT
     parallelizable: yes
@@ -209,6 +211,7 @@ truecase
     rerun-on-change: output-truecaser
     default-name: lm/truecased
     ignore-unless: output-truecaser
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     only-factor-0: yes
     template: $output-truecaser -model IN1.$output-extension < IN > OUT
     parallelizable: yes
@@ -218,13 +221,39 @@ split
     rerun-on-change: output-splitter
     default-name: lm/split
     pass-unless: output-splitter
+    ignore-if: concatenate-files link-file concatenate-files-split link-file-split
     template: $output-splitter -model IN1.$output-extension < IN > OUT
 strip
     in: split-corpus
     out: stripped-corpus
     default-name: lm/stripped
     pass-unless: mock-output-parser-lm
+    ignore-if: concatenate-files link-file
     template: $moses-script-dir/training/strip-xml.perl < IN > OUT
+concatenate-split
+    in: concatenate-files-split
+    out: split-corpus
+    ignore-unless: concatenate-files-split
+    default-name: lm/split
+    template: cat IN > OUT
+concatenate
+    in: concatenate-files
+    out: stripped-corpus
+    ignore-unless: concatenate-files
+    default-name: lm/stripped
+    template: cat IN > OUT
+link-split
+    in: link-file-split
+    out: split-corpus
+    default-name: lm/split
+    ignore-unless: link-file-split
+    template: ln -s IN OUT
+link
+    in: link-file
+    out: stripped-corpus
+    default-name: lm/stripped
+    ignore-unless: link-file
+    template: ln -s IN OUT
 train
     in: stripped-corpus
     out: lm
@@ -250,7 +279,7 @@ train-custom-syntax
     out: binlm
     default-name: lm/custom-lm
     rerun-on-change: custom-training
-    ignore-unless: AND custom-training config-feature-line config-weight-line syntactic
+    ignore-unless: AND custom-training config-feature-line config-weight-line syntactic mock-output-parser-lm
     template: $custom-training -text IN -lm OUT
     final-model: yes
 randomize
@@ -455,75 +484,6 @@ train-in-mono
     ignore-if: indomain-stem
     default-name: mml/model
     template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
-[CONCATENATED-LM] single
-concatenate
-    in: LM:stripped-corpus
-    out: concatenated-corpus
-    default-name: lm/concatenated
-    template: cat IN > OUT
-concatenate-mock-parsed
-    in: LM:split-corpus
-    out: concatenated-split-corpus
-    pass-unless: LM:mock-output-parser-lm
-    default-name: lm/concatenated-mock-parsed
-    template: cat IN > OUT
-train
-    in: concatenated-corpus
-    out: lm
-    default-name: lm/concatenated-lm
-    rerun-on-change: lm-training order settings
-    template: $lm-training -order $order $settings -text IN -lm OUT
-    ignore-unless: lm-training
-    ignore-if: rlm-training custom-training
-    error: cannot execute binary file
-    error: unrecognised option
-    not-error: BadDiscountException
-    not-error: To override this error
-train-custom
-    in: concatenated-corpus
-    out: binlm
-    default-name: lm/concatenated-custom-lm
-    rerun-on-change: custom-training
-    ignore-unless: AND custom-training config-feature-line config-weight-line
-    ignore-if: syntactic
-    template: $custom-training -text IN -lm OUT
-    final-model: yes
-train-custom-syntax
-    in: concatenated-split-corpus
-    out: binlm
-    default-name: lm/concatenated-custom-lm
-    rerun-on-change: custom-training
-    ignore-unless: AND custom-training config-feature-line config-weight-line syntactic
-    template: $custom-training -text IN -lm OUT
-    final-model: yes
-randomize
-    in: lm
-    out: rlm
-    default-name: lm/concatenated-rlm
-    pass-unless: lm-randomizer
-    ignore-if: rlm-training
-train-randomized
-    in: concatenated-corpus
-    out: rlm
-    default-name: lm/concatenated-rlm
-    ignore-unless: rlm-training
-    rerun-on-change: rlm-training order
-quantize
-    in: rlm
-    out: qlm
-    pass-unless: lm-quantizer
-    default-name: lm/concatenated-qlm
-    template: $lm-quantizer IN OUT
-binarize
-    in: qlm
-    out: binlm
-    pass-unless: lm-binarizer
-    ignore-unless: lm-training rlm-training
-    rerun-on-change: lm
-    default-name: lm/concatenated-binlm
-    template: $lm-binarizer IN OUT
-    error: set KENLM_MAX_ORDER to at least this value
-    final-model: yes
 [TRAINING] single
 consolidate
     in: CORPUS:clean-split-stem
@@ -783,7 +743,7 @@ build-sparse
     default-name: model/sparse-features
     template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features"
 create-config
-    in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model CONCATENATED-LM:binlm INTERPOLATED-LM:binlm LM:binlm
+    in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
     out: config
     ignore-if: use-hiero thot
     rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature


@@ -619,10 +619,24 @@ sub find_steps_for_module
     foreach my $in (@IN) {
         print "\t\tneeds input $in: " if $VERBOSE;
         if(defined($CONFIG{$in}) && $CONFIG{$in}[0] =~ /^\[(.+)\]$/) {
-            $in = $1;
-            print $in if $VERBOSE;
-            push @{$NEEDED{$in}}, $#DO_STEP;
-            print "\n\t\tcross-directed to $in\n" if $VERBOSE;
+            # multiple input, explicitly defined (example: LM:{europarl,nc}:lm )
+            if ($CONFIG{$in}[0] =~ /^\[([^:]+):{(\S+)}:(\S+)\]$/) {
+                my @SETS = split(',', $2);
+                foreach my $set (@SETS) {
+                    $in = &construct_name($1,$set,$3);
+                    print $in if $VERBOSE;
+                    push @{$NEEDED{$in}}, $#DO_STEP;
+                    push @{$USES_INPUT{$#DO_STEP}},$in;
+                    print "\n\t\tcross-directed to $in\n" if $VERBOSE;
+                }
+                $in = "";
+            }
+            else {
+                $in = $1;
+                print $in if $VERBOSE;
+                push @{$NEEDED{$in}}, $#DO_STEP;
+                print "\n\t\tcross-directed to $in\n" if $VERBOSE;
+            }
         }
         elsif(defined($CONFIG{$in})) {
             print "\n\t\t... but that is specified\n" if $VERBOSE;
@@ -2499,7 +2513,7 @@ sub get_config_tables
 
 sub define_training_create_config {
     my ($step_id) = @_;
-    my ($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm,$concat_lm, @LM)
+    my ($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
         = &get_output_and_input($step_id);
 
     my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);
@@ -2572,34 +2586,6 @@ sub define_training_create_config
 
     my $feature_lines = "";
     my $weight_lines = "";
-    if ($concat_lm) {
-        if (&get("CONCATENATED-LM:config-feature-line") && &get("CONCATENATED-LM:config-weight-line")) {
-            $feature_lines .= &get("CONCATENATED-LM:config-feature-line") . ";";
-            $weight_lines .= &get("CONCATENATED-LM:config-weight-line") . ";";
-        }
-        else {
-            my $type = 0;
-            my $order = &check_backoff_and_get("CONCATENATED-LM:order");
-
-            # binarizing the lm?
-            $type = 1 if (&get("CONCATENATED-LM:binlm") ||
-                &backoff_and_get("CONCATENATED-LM:lm-binarizer"));
-
-            # randomizing the lm?
-            $type = 5 if (&get("CONCATENATED-LM:rlm") ||
-                &backoff_and_get("CONCATENATED-LM:lm-randomizer"));
-
-            # manually set type
-            $type = &get("CONCATENATED-LM:type") if &get("CONCATENATED-LM:type");
-            # which factor is the model trained on?
-            my $factor = 0;
-            if (&backoff_and_get("TRAINING:output-factors") &&
-                &backoff_and_get("CONCATENATED-LM:factors")) {
-                $factor = $OUTPUT_FACTORS{&backoff_and_get("CONCATENATED-LM:factors")};
-            }
-            $cmd .= "-lm $factor:$order:$concat_lm:$type ";
-        }
-    }
 
     die("ERROR: number of defined LM sets (".(scalar @LM_SETS).":".join(",",@LM_SETS).") and LM files (".(scalar @LM).":".join(",",@LM).") does not match")
         unless scalar @LM == scalar @LM_SETS;
@@ -2629,13 +2615,6 @@ sub define_training_create_config
 
     # manually set type
     $type = &backoff_and_get("LM:$set:type") if (&backoff_and_get("LM:$set:type"));
-
-    # binarized by INTERPOLATED-LM
-    if (&get("INTERPOLATED-LM:lm-binarizer")) {
-        $lm_file =~ s/\.lm/\.binlm/;
-        $type = 1;
-        $type = &get("INTERPOLATED-LM:type") if &get("INTERPOLATED-LM:type");
-    }
     # which factor is the model trained on?
     my $factor = 0;
     if (&backoff_and_get("TRAINING:output-factors") &&
@@ -2813,6 +2792,7 @@ sub get_interpolated_lm_sets
     my $count=0;
     my $icount=0;
     foreach my $set (@LM_SETS) {
+        next if (&get("LM:$set:exclude-from-interpolation"));
         my $order = &check_backoff_and_get("LM:$set:order");
         my $factor = 0;
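
As a standalone illustration of how the new multi-set notation expands into individual step dependencies (the pattern follows the find_steps_for_module hunk above, with the braces escaped to keep modern perls quiet; this construct_name is a stand-in assumed to join its arguments with colons, matching step names such as LM:europarl:lm used elsewhere in EMS):

#!/usr/bin/perl
use strict;
use warnings;

# stand-in for EMS's &construct_name: module + set + output name
sub construct_name {
    my ($module, $set, $name) = @_;
    return "$module:$set:$name";
}

# a config value such as: concatenate-files = [LM:{europarl,nc}:stripped-corpus]
my $value = "[LM:{europarl,nc}:stripped-corpus]";

if ($value =~ /^\[([^:]+):\{(\S+)\}:(\S+)\]$/) {
    my ($module, $sets, $name) = ($1, $2, $3);
    foreach my $set (split(',', $sets)) {
        # each listed set becomes its own cross-directed dependency
        print construct_name($module, $set, $name), "\n";
    }
}
# prints:
#   LM:europarl:stripped-corpus
#   LM:nc:stripped-corpus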