mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 05:55:02 +03:00
extensions to modified moore-lewis filtering, bug fixes
This commit is contained in:
parent ea610a0558
commit b5d08745a5
@@ -251,6 +251,35 @@ type = 8
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# MODIFIED MOORE LEWIS FILTERING

[MML] IGNORE

### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5

### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]

# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension

# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]

# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"

#################################################################
# TRANSLATION MODEL TRAINING

@@ -316,6 +345,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1

### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"

### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

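Background on the section added above: modified Moore-Lewis filtering (Axelrod et al., 2011) ranks each sentence pair of a corpus by the cross-entropy difference between in-domain and out-of-domain language models on both the source and the target side, which is why the [MML] section trains four language models (in-source, in-target, out-source, out-target). A minimal sketch of that score in Perl; the cross_entropy helper is a hypothetical placeholder for a real LM query (e.g. against the binarized models), not part of the Moses scripts:

#!/usr/bin/perl -w
use strict;

# Placeholder for querying one of the four [MML] language models
# (in-source, in-target, out-source, out-target) for the per-word
# cross-entropy of a sentence; a real implementation queries the LM here.
sub cross_entropy {
    my ($lm, $sentence) = @_;
    return 0;   # stub
}

# Modified Moore-Lewis score of one sentence pair: lower = more in-domain.
sub mml_score {
    my ($src, $tgt) = @_;
    return (cross_entropy("in-source", $src) - cross_entropy("out-source", $src))
         + (cross_entropy("in-target", $tgt) - cross_entropy("out-target", $tgt));
}

Sentence pairs with the lowest score look most in-domain and are the ones the filter keeps.
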
@@ -271,6 +271,35 @@ temp-dir = $working-dir/training/factor
mxpost = /home/pkoehn/bin/mxpost
factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.perl -mxpost $mxpost"

#################################################################
# MODIFIED MOORE LEWIS FILTERING

[MML] IGNORE

### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5

### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]

# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension

# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]

# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"

#################################################################
# TRANSLATION MODEL TRAINING

@@ -336,6 +365,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1

### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"

### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

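The mml-before-wa and mml-after-wa settings above pass a -proportion value to the filter, i.e. the fraction of the scored corpus to keep (0.9 keeps the best-scoring 90%, either before or after word alignment). A rough sketch of that selection step, assuming one modified Moore-Lewis score per corpus line has already been computed; this is an illustration, not the filter script shipped with Moses:

#!/usr/bin/perl -w
use strict;

# usage: keep-best.perl <proportion> < scores.txt
# scores.txt has one modified Moore-Lewis score per corpus line;
# prints the 0-based line numbers of the lines to keep.
my $proportion = shift;
$proportion = 0.9 unless defined($proportion);   # matches "-proportion 0.9"
my @scores;
while (<STDIN>) {
    chomp;
    push @scores, $_ + 0;
}
my $keep = int($proportion * scalar(@scores));
# lower score = more in-domain, so rank ascending and keep the first $keep lines
my @ranked = sort { $scores[$a] <=> $scores[$b] } (0 .. $#scores);
print "$_\n" foreach sort { $a <=> $b } @ranked[0 .. $keep - 1];
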
@@ -251,6 +251,35 @@ type = 8
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# MODIFIED MOORE LEWIS FILTERING

[MML] IGNORE

### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5

### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]

# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension

# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]

# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"

#################################################################
# TRANSLATION MODEL TRAINING

@@ -316,6 +345,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1

### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"

### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

@@ -255,6 +255,35 @@ type = 8
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# MODIFIED MOORE LEWIS FILTERING

[MML] IGNORE

### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5

### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]

# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension

# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]

# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"

#################################################################
# TRANSLATION MODEL TRAINING

@@ -320,6 +349,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1

### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"

### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

@@ -235,6 +235,35 @@ type = 8
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# MODIFIED MOORE LEWIS FILTERING

[MML] IGNORE

### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5

### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]

# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension

# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]

# settings: number of lines sampled from the corpora to train each language model on
# (typically a million or so)
#settings = "--line-count 1000000"

#################################################################
# TRANSLATION MODEL TRAINING

@@ -300,6 +329,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1

### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"

### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor

@@ -277,12 +277,101 @@ binarize
default-name: lm/interpolated-binlm
error: set kMaxOrder to at least this value
[MML] single
tokenize-indomain-source
in: raw-indomain-source
out: tokenized-indomain-source
default-name: mml/indomain-source.tok
pass-unless: input-tokenizer
template: $input-tokenizer < IN > OUT
parallelizable: yes
factorize-indomain-source
in: tokenized-indomain-source
out: factorized-indomain-source
rerun-on-change: TRAINING:input-factors
default-name: mml/indomain-source.factored
pass-unless: factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor
lowercase-indomain-source
in: factorized-indomain-source
out: lowercased-indomain-source
default-name: mml/indomain-source.lowercased
pass-unless: input-lowercaser
ignore-if: input-truecaser
only-factor-0: yes
template: $input-lowercaser < IN > OUT
parallelizable: yes
truecase-indomain-source
in: factorized-indomain-source TRUECASER:truecase-model
out: lowercased-indomain-source
rerun-on-change: input-truecaser
default-name: mml/indomain-source.truecased
ignore-unless: input-truecaser
only-factor-0: yes
template: $input-truecaser -model IN1.$input-extension < IN > OUT
parallelizable: yes
split-indomain-source
in: lowercased-indomain-source SPLITTER:splitter-model
out: indomain-source
rerun-on-change: input-splitter
default-name: mml/indomain-source.split
pass-unless: input-splitter
template: $input-splitter -model IN1.$input-extension < IN > OUT
tokenize-indomain-target
in: raw-indomain-target
out: tokenized-indomain-target
default-name: mml/indomain-target.tok
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
parallelizable: yes
factorize-indomain-target
in: tokenized-indomain-target
out: factorized-indomain-target
rerun-on-change: TRAINING:output-factors
default-name: mml/indomain-target.factored
pass-unless: factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor
lowercase-indomain-target
in: factorized-indomain-target
out: lowercased-indomain-target
default-name: mml/indomain-target.lowercased
pass-unless: output-lowercaser
ignore-if: output-truecaser
only-factor-0: yes
template: $output-lowercaser < IN > OUT
parallelizable: yes
truecase-indomain-target
in: factorized-indomain-target TRUECASER:truecase-model
out: lowercased-indomain-target
rerun-on-change: output-truecaser
default-name: mml/indomain-target.truecased
ignore-unless: output-truecaser
only-factor-0: yes
template: $output-truecaser -model IN1.$output-extension < IN > OUT
parallelizable: yes
split-indomain-target
in: lowercased-indomain-target SPLITTER:splitter-model
out: indomain-target
rerun-on-change: output-splitter
default-name: mml/indomain-target.split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
train
in: indomain-stem outdomain-stem
out: model
-ignore-unless: settings
-default-name: model/mml
+ignore-unless: AND settings indomain-stem
+default-name: mml/model
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
train-in-mono
in: indomain-source indomain-target outdomain-stem
out: model
ignore-unless: settings
ignore-if: indomain-stem
default-name: mml/model
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings

[TRAINING] single
consolidate

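In experiment.meta, each step's template is a command in which EMS substitutes OUT with the step's output file and IN, IN1, IN2, ... with its input files before execution; the real substitution logic lives in scripts/ems/experiment.perl. A simplified sketch of that expansion, applied to an abbreviated version of the [MML] train template above, with made-up file names:

#!/usr/bin/perl -w
use strict;

# Simplified illustration of EMS template instantiation; the real logic
# is in scripts/ems/experiment.perl and handles more cases than this.
sub instantiate {
    my ($template, $out, @in) = @_;
    my $cmd = $template;
    $cmd =~ s/\bOUT\b/$out/g;
    for (my $i = $#in; $i >= 1; $i--) {
        $cmd =~ s/\bIN$i\b/$in[$i]/g;
    }
    $cmd =~ s/\bIN\b/$in[0]/g;
    return $cmd;
}

# Abbreviated [MML] train template, with ".fr"/".en" standing in for
# $input-extension/$output-extension and made-up corpus names.
my $template = "mml-train.perl -in-source IN.fr -in-target IN.en "
             . "-out-source IN1.fr -out-target IN1.en -model OUT";
print instantiate($template, "mml/model", "corpus/indomain.clean", "corpus/outdomain.clean"), "\n";
# => mml-train.perl -in-source corpus/indomain.clean.fr -in-target corpus/indomain.clean.en
#    -out-source corpus/outdomain.clean.fr -out-target corpus/outdomain.clean.en -model mml/model
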
@@ -36,7 +36,7 @@ die("ERROR: model not specified (-model FILESTEM)") unless defined($model);
&train_lm($indomain_source,"in-source");
&train_lm($indomain_target,"in-target");
&train_lm($outdomain_source,"out-source");
-&train_lm($outdomain_source,"out-target");
+&train_lm($outdomain_target,"out-target");

sub train_lm {
my ($file,$type) = @_;

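The change above fixes the fourth call, which previously trained the out-of-domain target language model on the source-side file. The hunk ends inside sub train_lm, so its body is not shown; purely as a hypothetical sketch (not the actual implementation in mml-train.perl), a routine with this signature would call the configured LM trainer once per invocation, e.g. SRILM's ngram-count, writing one language model per domain/side:

use strict;

# Hypothetical sketch of a train_lm body; NOT the actual code in mml-train.perl.
# The variables would be filled from the script's command-line options
# (-lm-training, -lm-settings, -order, -model).
our ($lm_training, $lm_settings, $order, $model);

sub train_lm_sketch {
    my ($file, $type) = @_;
    # e.g. ngram-count -order 5 -interpolate -kndiscount -unk -text <file> -lm <model>.<type>.lm
    my $cmd = "$lm_training -order $order $lm_settings -text $file -lm $model.$type.lm";
    print STDERR "$cmd\n";
    system($cmd) == 0 or die("ERROR: LM training failed: $cmd\n");
}
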