extensions to modified moore-lewis filtering, bug fixes

phikoehn 2012-11-24 20:13:14 +00:00
parent ea610a0558
commit b5d08745a5
7 changed files with 267 additions and 3 deletions

View File

@@ -251,6 +251,35 @@ type = 8
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
# MODIFIED MOORE LEWIS FILTERING
[MML] IGNORE
### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5
### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]
# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension
# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"
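For orientation, an enabled [MML] section might look roughly like this once the IGNORE flag is dropped. This is only a sketch: it assumes SRILM under $srilm-dir and re-uses the corpus names from the commented template above (a small in-domain corpus "toy", a large out-of-domain corpus "giga"); the concrete values are illustrative, not part of this commit.

    [MML]
    lm-training = $srilm-dir/ngram-count
    lm-settings = "-interpolate -kndiscount -unk"
    lm-binarizer = $moses-src-dir/bin/build_binary
    lm-query = $moses-src-dir/bin/query
    order = 5
    indomain-stem = [CORPUS:toy:clean-split-stem]
    outdomain-stem = [CORPUS:giga:clean-split-stem]
    settings = "--line-count 1000000"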
#################################################################
# TRANSLATION MODEL TRAINING
@@ -316,6 +345,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
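As I read the -proportion setting, it is the fraction of sentence pairs kept after ranking by the modified Moore-Lewis cross-entropy difference: -proportion 0.9 on a 2,000,000-sentence corpus would keep roughly the 1,800,000 best-scoring pairs. A sketch of filtering a larger corpus more aggressively, before word alignment only (corpus name and ratio are illustrative):

    mml-filter-corpora = giga
    mml-before-wa = "-proportion 0.8"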
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

View File

@@ -271,6 +271,35 @@ temp-dir = $working-dir/training/factor
mxpost = /home/pkoehn/bin/mxpost
factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.perl -mxpost $mxpost"
#################################################################
# MODIFIED MOORE LEWIS FILTERING
[MML] IGNORE
### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5
### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]
# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension
# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"
#################################################################
# TRANSLATION MODEL TRAINING
@@ -336,6 +365,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

View File

@@ -251,6 +251,35 @@ type = 8
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
# MODIFIED MOORE LEWIS FILTERING
[MML] IGNORE
### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5
### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]
# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension
# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"
#################################################################
# TRANSLATION MODEL TRAINING
@@ -316,6 +345,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

View File

@@ -255,6 +255,35 @@ type = 8
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
# MODIFIED MOORE LEWIS FILTERING
[MML] IGNORE
### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5
### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]
# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension
# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"
#################################################################
# TRANSLATION MODEL TRAINING
@@ -320,6 +349,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

View File

@@ -235,6 +235,35 @@ type = 8
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
# MODIFIED MOORE LEWIS FILTERING
[MML] IGNORE
### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5
### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]
# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension
# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]
# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"
#################################################################
# TRANSLATION MODEL TRAINING
@@ -300,6 +329,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

View File

@@ -277,12 +277,101 @@ binarize
    default-name: lm/interpolated-binlm
    error: set kMaxOrder to at least this value
[MML] single
tokenize-indomain-source
    in: raw-indomain-source
    out: tokenized-indomain-source
    default-name: mml/indomain-source.tok
    pass-unless: input-tokenizer
    template: $input-tokenizer < IN > OUT
    parallelizable: yes
factorize-indomain-source
    in: tokenized-indomain-source
    out: factorized-indomain-source
    rerun-on-change: TRAINING:input-factors
    default-name: mml/indomain-source.factored
    pass-unless: factors
    parallelizable: yes
    error: can't open
    error: incompatible number of words in factor
lowercase-indomain-source
    in: factorized-indomain-source
    out: lowercased-indomain-source
    default-name: mml/indomain-source.lowercased
    pass-unless: input-lowercaser
    ignore-if: input-truecaser
    only-factor-0: yes
    template: $input-lowercaser < IN > OUT
    parallelizable: yes
truecase-indomain-source
    in: factorized-indomain-source TRUECASER:truecase-model
    out: lowercased-indomain-source
    rerun-on-change: input-truecaser
    default-name: mml/indomain-source.truecased
    ignore-unless: input-truecaser
    only-factor-0: yes
    template: $input-truecaser -model IN1.$input-extension < IN > OUT
    parallelizable: yes
split-indomain-source
    in: lowercased-indomain-source SPLITTER:splitter-model
    out: indomain-source
    rerun-on-change: input-splitter
    default-name: mml/indomain-source.split
    pass-unless: input-splitter
    template: $input-splitter -model IN1.$input-extension < IN > OUT
tokenize-indomain-target
    in: raw-indomain-target
    out: tokenized-indomain-target
    default-name: mml/indomain-target.tok
    pass-unless: output-tokenizer
    template: $output-tokenizer < IN > OUT
    parallelizable: yes
factorize-indomain-target
    in: tokenized-indomain-target
    out: factorized-indomain-target
    rerun-on-change: TRAINING:output-factors
    default-name: mml/indomain-target.factored
    pass-unless: factors
    parallelizable: yes
    error: can't open
    error: incompatible number of words in factor
lowercase-indomain-target
    in: factorized-indomain-target
    out: lowercased-indomain-target
    default-name: mml/indomain-target.lowercased
    pass-unless: output-lowercaser
    ignore-if: output-truecaser
    only-factor-0: yes
    template: $output-lowercaser < IN > OUT
    parallelizable: yes
truecase-indomain-target
    in: factorized-indomain-target TRUECASER:truecase-model
    out: lowercased-indomain-target
    rerun-on-change: output-truecaser
    default-name: mml/indomain-target.truecased
    ignore-unless: output-truecaser
    only-factor-0: yes
    template: $output-truecaser -model IN1.$output-extension < IN > OUT
    parallelizable: yes
split-indomain-target
    in: lowercased-indomain-target SPLITTER:splitter-model
    out: indomain-target
    rerun-on-change: output-splitter
    default-name: mml/indomain-target.split
    pass-unless: output-splitter
    template: $output-splitter -model IN1.$output-extension < IN > OUT
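In these step definitions, IN, IN1, IN2, ... in a template stand for the entries of the step's in: list in order, and OUT for its out: file. For example, with the default names above, truecase-indomain-target would expand to roughly the following command (the truecase-model file name and locations relative to the working directory are illustrative):

    $output-truecaser -model truecase-model.$output-extension < mml/indomain-target.factored > mml/indomain-target.truecased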
train
    in: indomain-stem outdomain-stem
    out: model
    ignore-unless: settings
    default-name: model/mml
    ignore-unless: AND settings indomain-stem
    default-name: mml/model
    template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
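With the in: list above, IN is the in-domain parallel corpus stem and IN1 the out-of-domain one, so the step boils down to a call along these lines (language pair fr-en and file locations are chosen purely for illustration, using the example [MML] settings):

    $moses-script-dir/ems/support/mml-train.perl \
        -in-source corpus/toy.clean-split.fr -in-target corpus/toy.clean-split.en \
        -out-source corpus/giga.clean-split.fr -out-target corpus/giga.clean-split.en \
        -model mml/model \
        -lm-training "$srilm-dir/ngram-count" -order 5 \
        -lm-settings "-interpolate -kndiscount -unk" \
        -lm-binarizer $moses-src-dir/bin/build_binary \
        --line-count 1000000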
train-in-mono
    in: indomain-source indomain-target outdomain-stem
    out: model
    ignore-unless: settings
    ignore-if: indomain-stem
    default-name: mml/model
    template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
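The monolingual variant only changes where the in-domain text comes from: here IN is the preprocessed indomain-source, IN1 the indomain-target, and IN2 the out-of-domain stem, so -in-source and -in-target point at two separate monolingual files rather than the two sides of one stem, roughly (file names again illustrative):

    $moses-script-dir/ems/support/mml-train.perl -in-source mml/indomain-source.split -in-target mml/indomain-target.split -out-source corpus/giga.clean-split.fr -out-target corpus/giga.clean-split.en -model mml/model ...

The ignore-if: indomain-stem and ignore-unless: AND settings indomain-stem conditions appear to ensure that exactly one of the two train steps runs, depending on whether a parallel in-domain corpus was given.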
[TRAINING] single
consolidate

View File

@ -36,7 +36,7 @@ die("ERROR: model not specified (-model FILESTEM)") unless defined($model);
&train_lm($indomain_source,"in-source");
&train_lm($indomain_target,"in-target");
&train_lm($outdomain_source,"out-source");
&train_lm($outdomain_source,"out-target");
&train_lm($outdomain_target,"out-target");
sub train_lm {
my ($file,$type) = @_;
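The hunk ends here, but for context the body of train_lm presumably just shells out to the configured LM trainer, along the lines of the sketch below. This is an illustration of the call pattern, assuming SRILM's ngram-count interface as configured in the [MML] section; variable names other than $model are guesses, not the script's verbatim code.

    sub train_lm {
        my ($file,$type) = @_;
        # trains an n-gram LM on $file and writes it next to the model stem,
        # producing $model.in-source, $model.in-target, $model.out-source, $model.out-target
        my $cmd = "$lm_training -order $order $lm_settings -text $file -lm $model.$type";
        print STDERR "$cmd\n";
        system($cmd) == 0
            or die("ERROR: language model training failed: $cmd");
    }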