2010-01-25 20:38:53 +03:00
# experiment.meta: now with comments.
[CORPUS] multiple
get-corpus
in: get-corpus-script
out: raw-stem
default-name: corpus/txt
rerun-on-change: input-extension output-extension
template: IN OUT $input-extension $output-extension
tokenize
in: raw-stem
out: tokenized-stem
default-name: corpus/tok
pass-unless: input-tokenizer output-tokenizer
template-if: input-tokenizer IN.$input-extension OUT.$input-extension
template-if: output-tokenizer IN.$output-extension OUT.$output-extension
parallelizable: yes
clean
in: tokenized-stem
out: clean-stem
default-name: corpus/clean
rerun-on-change: max-sentence-length $moses-script-dir/training/clean-corpus-n.perl
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 $max-sentence-length $working-dir/corpus/clean.lines-retained.VERSION
error: there is a blank factor
parse
in: clean-stem
out: parsed-stem
default-name: corpus/parsed
pass-unless: input-parser output-parser
template-if: input-parser IN.$input-extension OUT.$input-extension
template-if: output-parser IN.$output-extension OUT.$output-extension
parallelizable: yes
post-parse-clean
in: parsed-stem
out: clean-parsed-stem
default-name: corpus/parsed-clean
pass-unless: input-parser output-parser
2010-09-17 17:28:04 +04:00
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 10000 $working-dir/corpus/parsed-clean.lines-retained.VERSION --ignore-xml
2010-01-25 20:38:53 +03:00
error: there is a blank factor
factorize
in: clean-parsed-stem
out: factorized-stem
rerun-on-change: TRAINING:input-factors TRAINING:output-factors
default-name: corpus/factored
pass-unless: TRAINING:input-factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor
truecase
in: factorized-stem TRUECASER:truecase-model
out: truecased-stem
rerun-on-change: input-truecaser output-truecaser
default-name: corpus/truecased
pass-unless: input-truecaser output-truecaser
2012-04-12 15:29:12 +04:00
template-if: input-truecaser IN.$input-extension OUT.$input-extension -model IN1.$input-extension
template-if: output-truecaser IN.$output-extension OUT.$output-extension -model IN1.$output-extension
parallelizable: yes
2010-01-25 20:38:53 +03:00
lowercase
in: truecased-stem
out: lowercased-stem
default-name: corpus/lowercased
pass-unless: input-lowercaser output-lowercaser
template-if: input-lowercaser IN.$input-extension OUT.$input-extension
template-if: output-lowercaser IN.$output-extension OUT.$output-extension
split
in: lowercased-stem SPLITTER:splitter-model
out: split-stem
default-name: corpus/split
pass-unless: input-splitter output-splitter
template-if: input-splitter IN.$input-extension OUT.$input-extension -model IN1.$input-extension
template-if: output-splitter IN.$output-extension OUT.$output-extension -model IN1.$output-extension
post-split-clean
in: split-stem
out: clean-split-stem
default-name: corpus/split-clean
2010-09-17 17:28:04 +04:00
ignore-if: input-parser output-parser
2010-01-25 20:38:53 +03:00
pass-unless: input-splitter output-splitter
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 $max-sentence-length $working-dir/corpus/split-clean.lines-retained.VERSION
error: there is a blank factor
2010-09-17 17:28:04 +04:00
# Variant of post-split-clean that is declared with "ignore-unless" on the
# parser settings (post-split-clean carries the matching "ignore-if").
# It runs clean-corpus-n.perl with a fixed length limit of 10000 and --ignore-xml.
# The lines-retained file is named split-clean.* (as in post-split-clean) so it
# does not share a path with the parsed-clean.* file written by post-parse-clean.
post-split-clean-syntax
in: split-stem
out: clean-split-stem
default-name: corpus/split-clean
ignore-unless: input-parser output-parser
pass-unless: input-splitter output-splitter
template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 10000 $working-dir/corpus/split-clean.lines-retained.VERSION --ignore-xml
error: there is a blank factor
2010-01-25 20:38:53 +03:00
[RECASING] single
tokenize
in: raw
out: tokenized
default-name: recasing/cased
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
train
in: tokenized
out: recase-config
template: $moses-script-dir/recaser/train-recaser.perl -train-script $TRAINING:script -dir $working-dir/recasing/model.VERSION -corpus IN -scripts-root-dir $moses-script-dir -config OUT -ngram-count $lm-training
default-name: recasing/moses.ini
ignore-unless: EVALUATION:recaser
2010-05-05 03:04:10 +04:00
error: cannot execute binary file
2010-01-25 20:38:53 +03:00
[TRUECASER] single
consolidate
in: CORPUS:clean-parsed-stem
out: tokenized-stem
default-name: truecaser/corpus
2010-05-05 03:04:10 +04:00
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
2010-01-25 20:38:53 +03:00
train
2012-03-21 17:02:47 +04:00
in: tokenized-stem
2010-01-25 20:38:53 +03:00
out: truecase-model
rerun-on-change: trainer
default-name: truecaser/truecase-model
template: $trainer -model OUT.$input-extension -corpus IN.$input-extension ; $trainer -model OUT.$output-extension -corpus IN.$output-extension
[SPLITTER] single
consolidate
in: CORPUS:lowercased-stem
out: truecased-stem
default-name: splitter/corpus
ignore-unless: input-splitter output-splitter
2010-05-05 03:04:10 +04:00
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
2010-01-25 20:38:53 +03:00
train
in: truecased-stem
out: splitter-model
default-name: splitter/split-model
ignore-unless: input-splitter output-splitter
[LM] multiple
get-corpus
2011-06-17 03:43:29 +04:00
in: get-corpus-script
2010-01-25 20:38:53 +03:00
out: raw-corpus
2011-06-17 03:43:29 +04:00
pass-unless: get-corpus-script
2010-01-25 20:38:53 +03:00
default-name: lm/txt
2011-06-17 03:43:29 +04:00
template: $get-corpus-script > OUT
2010-01-25 20:38:53 +03:00
tokenize
in: raw-corpus
out: tokenized-corpus
default-name: lm/tok
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
parallelizable: yes
factorize
in: tokenized-corpus
out: factorized-corpus
rerun-on-change: TRAINING:output-factors
default-name: lm/factored
pass-unless: factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor
lowercase
in: factorized-corpus
out: lowercased-corpus
default-name: lm/lowercased
pass-unless: output-lowercaser
ignore-if: output-truecaser
only-factor-0: yes
template: $output-lowercaser < IN > OUT
2011-06-17 03:43:29 +04:00
parallelizable: yes
2010-01-25 20:38:53 +03:00
truecase
in: factorized-corpus TRUECASER:truecase-model
out: lowercased-corpus
rerun-on-change: output-truecaser
default-name: lm/truecased
ignore-unless: output-truecaser
only-factor-0: yes
2011-06-17 03:43:29 +04:00
template: $output-truecaser -model IN1.$output-extension < IN > OUT
parallelizable: yes
2010-01-25 20:38:53 +03:00
split
in: lowercased-corpus SPLITTER:splitter-model
out: split-corpus
rerun-on-change: output-splitter
default-name: lm/split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
train
in: split-corpus
out: lm
default-name: lm/lm
ignore-if: rlm-training
rerun-on-change: lm-training order settings
template: $lm-training -order $order $settings -text IN -lm OUT
2010-05-05 03:04:10 +04:00
error: cannot execute binary file
2010-01-25 20:38:53 +03:00
randomize
in: lm
out: rlm
default-name: lm/rlm
pass-unless: lm-randomizer
ignore-if: rlm-training
train-randomized
in: split-corpus
out: rlm
default-name: lm/rlm
ignore-unless: rlm-training
rerun-on-change: rlm-training order
quantize
in: rlm
out: qlm
pass-unless: lm-quantizer
default-name: lm/qlm
template: $lm-quantizer IN OUT
# Converts the qlm input into binlm by running "$lm-binarizer IN OUT".
# Two error patterns are listed for the order-limit failure: the message names
# the constant either KENLM_MAX_ORDER or kMaxOrder (the INTERPOLATED-LM binarize
# step in this file matches the latter).
# NOTE(review): presumably the wording depends on the KenLM version -- confirm.
binarize
in: qlm
out: binlm
pass-unless: lm-binarizer
rerun-on-change: lm
default-name: lm/binlm
template: $lm-binarizer IN OUT
2012-10-01 23:36:52 +04:00
error: set KENLM_MAX_ORDER to at least this value
error: set kMaxOrder to at least this value
2010-01-25 20:38:53 +03:00
[INTERPOLATED-LM] single
2010-05-05 03:04:10 +04:00
tuning-from-sgm
in: tuning-sgm
out: raw-tuning
default-name: lm/interpolate-tuning.txt
template: $moses-script-dir/ems/support/input-from-sgm.perl < IN > OUT
2010-01-25 20:38:53 +03:00
tokenize-tuning
in: raw-tuning
out: tokenized-tuning
default-name: lm/interpolate-tuning.tok
pass-unless: output-tokenizer
template: $output-tokenizer < IN > OUT
parallelizable: yes
factorize-tuning
in: tokenized-tuning
out: factorized-tuning
rerun-on-change: TRAINING:output-factors
default-name: lm/interpolate-tuning.factored
pass-unless: factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor
lowercase-tuning
in: factorized-tuning
out: lowercased-tuning
default-name: lm/interpolate-tuning.lowercased
pass-unless: output-lowercaser
ignore-if: output-truecaser
template: $output-lowercaser < IN > OUT
truecase-tuning
in: factorized-tuning TRUECASER:truecase-model
out: lowercased-tuning
rerun-on-change: output-truecaser
default-name: lm/interpolate-tuning.truecased
ignore-unless: output-truecaser
template: $output-truecaser -model IN1.$output-extension < IN > OUT
split-tuning
in: lowercased-tuning SPLITTER:splitter-model
out: split-tuning
rerun-on-change: output-splitter
default-name: lm/interpolate-tuning.split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
interpolate
2010-09-03 16:57:16 +04:00
in: script split-tuning LM:lm
2012-03-21 17:02:47 +04:00
rerun-on-change: srilm-dir group
2010-01-25 20:38:53 +03:00
out: lm
default-name: lm/interpolated-lm
randomize
in: lm
out: rlm
pass-unless: lm-randomizer
2012-03-21 17:02:47 +04:00
default-name: lm/interpolated-rlm
2010-01-25 20:38:53 +03:00
quantize
in: rlm
out: qlm
pass-unless: lm-quantizer
default-name: lm/interpolated-qlm
# Produces binlm from the interpolated qlm. No template is given in this stanza.
# NOTE(review): the command is presumably assembled by the EMS driver script -- confirm.
# Two error patterns are listed for the order-limit failure: the message names
# the constant either kMaxOrder or KENLM_MAX_ORDER (the LM binarize step in this
# file matches the latter).
binarize
in: qlm
out: binlm
pass-unless: lm-binarizer
2012-03-21 17:02:47 +04:00
ignore-unless: script
2010-01-25 20:38:53 +03:00
rerun-on-change: lm
default-name: lm/interpolated-binlm
2012-10-01 23:36:52 +04:00
error: set kMaxOrder to at least this value
error: set KENLM_MAX_ORDER to at least this value
2010-01-25 20:38:53 +03:00
[TRAINING] single
consolidate
in: CORPUS:clean-split-stem
out: corpus
default-name: corpus
2010-05-05 03:04:10 +04:00
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
2012-10-01 23:36:52 +04:00
build-domains
in: CORPUS:clean-split-stem
out: domains
default-name: model/domains
ignore-unless: domain-features
template: $moses-script-dir/ems/support/build-domain-file-from-subcorpora.perl $input-extension IN > OUT
2010-01-25 20:38:53 +03:00
prepare-data
in: corpus
out: prepared-data
2010-05-05 20:06:34 +04:00
rerun-on-change: alignment-factors training-options script
2010-01-25 20:38:53 +03:00
ignore-if: use-berkeley
default-name: prepared
run-giza
in: prepared-data
out: giza-alignment
ignore-if: use-berkeley
2010-05-05 20:06:34 +04:00
rerun-on-change: giza-settings training-options script
2010-01-25 20:38:53 +03:00
default-name: giza
2012-03-21 17:02:47 +04:00
error: not found
2010-01-25 20:38:53 +03:00
run-giza-inverse
in: prepared-data
out: giza-alignment-inverse
2010-05-05 20:06:34 +04:00
rerun-on-change: giza-settings training-options script
2010-01-25 20:38:53 +03:00
ignore-if: use-berkeley
default-name: giza-inverse
2012-03-21 17:02:47 +04:00
error: not found
2010-01-25 20:38:53 +03:00
run-berkeley
in: corpus
out: berkeley-alignment
ignore-unless: use-berkeley
rerun-on-change: berkeley-train berkeley-jar berkeley-training-options
default-name: berkeley
template: $berkeley-train " $berkeley-java-options " $berkeley-jar IN OUT $input-extension $output-extension $berkeley-training-options
not-error: 0 errors,
process-berkeley
in: corpus berkeley-alignment
out: word-alignment
default-name: model/aligned
rerun-on-change: berkeley-process berkeley-jar berkeley-posterior berkeley-process-options
ignore-unless: use-berkeley
template: $berkeley-process " $berkeley-java-options " $berkeley-jar IN IN1 OUT $input-extension $output-extension $alignment-symmetrization-method $berkeley-posterior $berkeley-process-options
not-error: 0 errors,
symmetrize-giza
in: giza-alignment giza-alignment-inverse
out: word-alignment
ignore-if: use-berkeley
2010-05-05 20:06:34 +04:00
rerun-on-change: alignment-symmetrization-method training-options script
2010-01-25 20:38:53 +03:00
default-name: model/aligned
error: skip=<[1-9]
2010-10-21 13:49:27 +04:00
build-biconcor
in: word-alignment corpus
out: biconcor-model
default-name: model/biconcor
ignore-unless: biconcor
error: usage
2012-10-01 23:36:52 +04:00
build-suffix-array
in: word-alignment corpus
out: phrase-translation-table
default-name: model/suffix-array
ignore-unless: suffix-array
error: usage
2010-01-25 20:38:53 +03:00
build-lex-trans
in: word-alignment corpus
out: lexical-translation-table
2010-05-05 20:06:34 +04:00
rerun-on-change: translation-factors training-options script
2010-01-25 20:38:53 +03:00
default-name: model/lex
parse-relax
in: corpus
out: parse-relaxed-corpus
default-name: model/parsed-relaxed
pass-unless: input-parse-relaxer output-parse-relaxer
template-if: input-parse-relaxer IN.$input-extension OUT.$input-extension
template-if: output-parse-relaxer IN.$output-extension OUT.$output-extension
2012-05-25 20:29:47 +04:00
pcfg-extract
in: parse-relaxed-corpus
out: pcfg
default-name: model/pcfg
ignore-unless: use-pcfg-feature
rerun-on-change: use-pcfg-feature
2012-10-03 22:57:51 +04:00
template: $moses-bin-dir/pcfg-extract < IN.$output-extension > OUT.$output-extension
2012-05-25 20:29:47 +04:00
pcfg-score
in: parse-relaxed-corpus pcfg
out: scored-corpus
default-name: model/scored-corpus
pass-unless: use-pcfg-feature
2012-10-03 22:57:51 +04:00
template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
2010-01-25 20:38:53 +03:00
extract-phrases
2012-05-25 20:29:47 +04:00
in: word-alignment scored-corpus
2010-01-25 20:38:53 +03:00
out: extracted-phrases
2012-10-01 23:36:52 +04:00
rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm domain-features
only-existence-matters: domain-features
2010-01-25 20:38:53 +03:00
default-name: model/extract
2012-10-01 23:36:52 +04:00
ignore-if: suffix-array
2010-01-25 20:38:53 +03:00
build-reordering
in: extracted-phrases
out: reordering-table
ignore-unless: lexicalized-reordering
rerun-on-change: lexicalized-reordering reordering-factors
default-name: model/reordering-table
build-ttable
2012-10-01 23:36:52 +04:00
in: extracted-phrases lexical-translation-table domains
2010-01-25 20:38:53 +03:00
out: phrase-translation-table
2012-10-01 23:36:52 +04:00
rerun-on-change: translation-factors hierarchical-rule-set score-settings training-options script EVALUATION:report-precision-by-coverage include-word-alignment-in-rules domain-features
2010-01-25 20:38:53 +03:00
default-name: model/phrase-table
2012-10-01 23:36:52 +04:00
ignore-if: suffix-array
2010-01-25 20:38:53 +03:00
build-generation
in: corpus
out: generation-table
2010-05-05 20:06:34 +04:00
rerun-on-change: generation-factors generation-type training-options script
2010-01-25 20:38:53 +03:00
ignore-unless: generation-factors
2012-10-01 23:36:52 +04:00
ignore-if: generation-corpus
2011-06-17 03:43:29 +04:00
default-name: model/generation-table
build-generation-custom
in: generation-corpus
out: generation-table
rerun-on-change: generation-factors generation-type training-options script generation-corpus
ignore-unless: AND generation-factors generation-corpus
2010-01-25 20:38:53 +03:00
default-name: model/generation-table
2012-10-01 23:36:52 +04:00
build-sparse-lexical
in: corpus
out: sparse-lexical
ignore-unless: sparse-lexical-features
rerun-on-change: sparse-lexical-features
default-name: model/most-frequent-words
template: $moses-script-dir/ems/support/build-sparse-lexical-features.perl IN $input-extension $output-extension OUT "$sparse-lexical-features"
2010-01-25 20:38:53 +03:00
create-config
2012-10-18 05:52:30 +04:00
in: reordering-table phrase-translation-table generation-table sparse-lexical domains INTERPOLATED-LM:binlm LM:binlm
2010-01-25 20:38:53 +03:00
out: config
ignore-if: use-hiero
2012-03-21 17:02:47 +04:00
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
2010-01-25 20:38:53 +03:00
default-name: model/moses.ini
error: Unknown option
binarize-config
in: config
out: bin-config
pass-unless: binarize-all
rerun-on-change: config
default-name: model/moses.bin.ini
template: $binarize-all $ttable-binarizer $rtable-binarizer OUT IN
hiero-compile-source-suffix-array
in: corpus
out: hiero-source-suffix-array
ignore-unless: use-hiero
default-name: hiero-model/f.sa.bin
template: $hiero-decode-dir/compile_bin.py -s IN.$input-extension OUT
hiero-compile-target
in: corpus
out: hiero-target-array
ignore-unless: use-hiero
default-name: hiero-model/e.bin
template: $hiero-decode-dir/compile_bin.py IN.$output-extension OUT
hiero-compile-alignment
in: word-alignment
out: hiero-alignment-array
ignore-unless: use-hiero
default-name: hiero-model/a.bin
template: $hiero-decode-dir/compile_bin.py -a IN.$alignment-symmetrization-method OUT
hiero-compile-lex
in: hiero-alignment-array hiero-source-suffix-array hiero-target-array
out: hiero-lex-array
ignore-unless: use-hiero
default-name: hiero-model/lex.bin
template: $hiero-decode-dir/compile_bin.py -x IN1 IN2 IN OUT
hiero-find-frequencies
in: hiero-source-suffix-array
out: hiero-topN
ignore-unless: use-hiero
default-name: hiero-model/f.topN
template: $hiero-decode-dir/lcp_ops.py -t 4 IN | sort -nr | head -100 > OUT
hiero-compile-precomputations
in: hiero-topN hiero-source-suffix-array
out: hiero-precomputation-array
ignore-unless: use-hiero
default-name: hiero-model/f.precomputations.bin
rerun-on-change: hiero-max-phrase-length hiero-max-nonterminals hiero-max-phrase-span hiero-min-gap-length hiero-freq-rank1 hiero-freq-rank2
template: $hiero-decode-dir/compile_bin.py -r max-len=$hiero-max-phrase-length max-nt=$hiero-max-nonterminals max-size=$hiero-max-phrase-span min-gap=$hiero-min-gap-length rank1=$hiero-freq-rank1 rank2=$hiero-freq-rank2 sa=IN1 IN OUT
hiero-create-config
in: hiero-source-suffix-array hiero-target-array hiero-alignment-array hiero-lex-array hiero-precomputation-array LM:lm
out: hiero-config
ignore-unless: use-hiero
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors
default-name: hiero-model/hiero.ini
template: $hiero-util-dir/generate-ini.pl IN IN1 IN2 IN3 IN4 IN5 $hiero-max-phrase-length $hiero-max-nonterminals $hiero-max-phrase-span $hiero-min-gap-length $hiero-freq-rank1 $hiero-freq-rank2 < $GENERAL:hiero-template-ini > OUT
[TUNING] single
2010-05-05 03:04:10 +04:00
input-from-sgm
in: input-sgm
out: raw-input
default-name: tuning/input.txt
template: $moses-script-dir/ems/support/input-from-sgm.perl < IN > OUT
2012-05-10 21:54:24 +04:00
input-devtest-from-sgm
in: input-devtest-sgm
out: raw-input-devtest
default-name: tuning/input.devtest.txt
ignore-unless: use-mira
template: $moses-script-dir/ems/support/input-from-sgm.perl < IN > OUT
2010-01-25 20:38:53 +03:00
tokenize-input
in: raw-input
out: tokenized-input
default-name: tuning/input.tok
pass-unless: input-tokenizer
template: $input-tokenizer < IN > OUT
2012-04-13 18:43:01 +04:00
tokenize-input-devtest
in: raw-input-devtest
out: tokenized-input-devtest
default-name: tuning/input.devtest.tok
pass-unless: input-tokenizer
2012-04-24 08:17:19 +04:00
ignore-unless: use-mira
2012-04-13 18:43:01 +04:00
template: $input-tokenizer < IN > OUT
2010-01-25 20:38:53 +03:00
parse-input
in: tokenized-input
out: parsed-input
default-name: tuning/input.parsed
pass-unless: input-parser
template: $input-parser < IN > OUT
2012-04-13 18:43:01 +04:00
parse-input-devtest
in: tokenized-input-devtest
out: parsed-input-devtest
default-name: tuning/input.devtest.parsed
pass-unless: input-parser
2012-04-24 08:17:19 +04:00
ignore-unless: use-mira
2012-04-13 18:43:01 +04:00
template: $input-parser < IN > OUT
2010-01-25 20:38:53 +03:00
parse-relax-input
in: parsed-input
out: parse-relaxed-input
default-name: tuning/input.parse-relaxed
pass-unless: input-parse-relaxer
template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
2012-04-13 18:43:01 +04:00
parse-relax-input-devtest
in: parsed-input-devtest
out: parse-relaxed-input-devtest
default-name: tuning/input.devtest.parse-relaxed
pass-unless: input-parse-relaxer
2012-04-24 08:17:19 +04:00
ignore-unless: use-mira
2012-04-13 18:43:01 +04:00
template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
2010-01-25 20:38:53 +03:00
factorize-input
in: parse-relaxed-input
out: factorized-input
default-name: tuning/input.factorized
rerun-on-change: TRAINING:input-factors
pass-unless: TRAINING:input-factors
error: can't open
error: incompatible number of words in factor
2012-04-13 18:43:01 +04:00
factorize-input-devtest
in: parse-relaxed-input-devtest
out: factorized-input-devtest
default-name: tuning/input.devtest.factorized
rerun-on-change: TRAINING:input-factors
pass-unless: TRAINING:input-factors
2012-04-24 08:17:19 +04:00
ignore-unless: use-mira
2012-04-13 18:43:01 +04:00
error: can't open
error: incompatible number of words in factor
2010-01-25 20:38:53 +03:00
lowercase-input
in: factorized-input
2012-05-10 21:54:24 +04:00
out: truecased-input
2010-01-25 20:38:53 +03:00
default-name: tuning/input.lc
pass-unless: input-lowercaser
ignore-if: input-truecaser
template: $input-lowercaser < IN > OUT
2012-04-13 18:43:01 +04:00
lowercase-input-devtest
in: factorized-input-devtest
2012-05-10 21:54:24 +04:00
out: truecased-input-devtest
2012-04-13 18:43:01 +04:00
default-name: tuning/input.devtest.lc
pass-unless: input-lowercaser
2012-04-24 08:17:19 +04:00
ignore-unless: use-mira
2012-04-13 18:43:01 +04:00
ignore-if: input-truecaser
template: $input-lowercaser < IN > OUT
2010-01-25 20:38:53 +03:00
truecase-input
in: factorized-input TRUECASER:truecase-model
2012-05-10 21:54:24 +04:00
out: truecased-input
2010-01-25 20:38:53 +03:00
rerun-on-change: input-truecaser
default-name: tuning/input.tc
ignore-unless: input-truecaser
template: $input-truecaser -model IN1.$input-extension < IN > OUT
2012-04-13 18:43:01 +04:00
truecase-input-devtest
in: factorized-input-devtest TRUECASER:truecase-model
2012-05-10 21:54:24 +04:00
out: truecased-input-devtest
2012-04-13 18:43:01 +04:00
rerun-on-change: input-truecaser
default-name: tuning/input.devtest.tc
2012-05-10 21:54:24 +04:00
ignore-unless: AND input-truecaser use-mira
2012-04-13 18:43:01 +04:00
template: $input-truecaser -model IN1.$input-extension < IN > OUT
2010-01-25 20:38:53 +03:00
# Runs $input-splitter on the truecased tuning input, using the splitter model
# for the input language (IN1.$input-extension).
# NOTE(review): this template passes $input-extension as an additional
# positional argument after the model, whereas the EVALUATION split-input
# template does not -- confirm which invocation the splitter script expects.
split-input
2012-05-10 21:54:24 +04:00
in: truecased-input SPLITTER:splitter-model
2010-01-25 20:38:53 +03:00
out: input
rerun-on-change: input-splitter
default-name: tuning/input.split
pass-unless: input-splitter
template: $input-splitter -model IN1.$input-extension $input-extension < IN > OUT
2012-04-13 18:43:01 +04:00
split-input-devtest
2012-05-10 21:54:24 +04:00
in: truecased-input-devtest SPLITTER:splitter-model
2012-04-13 18:43:01 +04:00
out: input-devtest
rerun-on-change: input-splitter
default-name: tuning/input.devtest.split
pass-unless: input-splitter
2012-04-24 08:17:19 +04:00
ignore-unless: use-mira
2012-04-13 18:43:01 +04:00
template: $input-splitter -model IN1.$input-extension $input-extension < IN > OUT
2010-05-05 03:04:10 +04:00
reference-from-sgm
in: reference-sgm input-sgm
out: raw-reference
default-name: tuning/reference.txt
template: $moses-script-dir/ems/support/reference-from-sgm.perl IN IN1 OUT
2012-05-10 21:54:24 +04:00
reference-devtest-from-sgm
in: reference-devtest-sgm input-devtest-sgm
out: raw-reference-devtest
default-name: tuning/reference.devtest.txt
ignore-unless: use-mira
template: $moses-script-dir/ems/support/reference-from-sgm.perl IN IN1 OUT
2010-01-25 20:38:53 +03:00
tokenize-reference
in: raw-reference
out: tokenized-reference
default-name: tuning/reference.tok
pass-unless: output-tokenizer
2010-05-18 21:39:16 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
2010-01-25 20:38:53 +03:00
template: $output-tokenizer < IN > OUT
2012-04-13 18:43:01 +04:00
tokenize-reference-devtest
in: raw-reference-devtest
out: tokenized-reference-devtest
default-name: tuning/reference.devtest.tok
pass-unless: output-tokenizer
2012-04-24 08:17:19 +04:00
ignore-unless: use-mira
2012-04-13 18:43:01 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
2010-01-25 20:38:53 +03:00
lowercase-reference
in: tokenized-reference
2012-05-10 21:54:24 +04:00
out: truecased-reference
2010-01-25 20:38:53 +03:00
default-name: tuning/reference.lc
pass-unless: output-lowercaser
ignore-if: output-truecaser
2010-05-18 21:39:16 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
2010-01-25 20:38:53 +03:00
template: $output-lowercaser < IN > OUT
2012-04-13 18:43:01 +04:00
lowercase-reference-devtest
in: tokenized-reference-devtest
2012-05-10 21:54:24 +04:00
out: truecased-reference-devtest
2012-04-13 18:43:01 +04:00
default-name: tuning/reference.devtest.lc
pass-unless: output-lowercaser
ignore-if: output-truecaser
2012-04-24 08:17:19 +04:00
ignore-unless: use-mira
2012-04-13 18:43:01 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
2010-01-25 20:38:53 +03:00
truecase-reference
in: tokenized-reference TRUECASER:truecase-model
2012-05-10 21:54:24 +04:00
out: truecased-reference
2010-01-25 20:38:53 +03:00
rerun-on-change: output-truecaser
default-name: tuning/reference.tc
ignore-unless: output-truecaser
2010-05-18 21:39:16 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
2010-01-25 20:38:53 +03:00
template: $output-truecaser -model IN1.$output-extension < IN > OUT
2012-04-13 18:43:01 +04:00
truecase-reference-devtest
in: tokenized-reference-devtest TRUECASER:truecase-model
2012-05-10 21:54:24 +04:00
out: truecased-reference-devtest
2012-04-13 18:43:01 +04:00
rerun-on-change: output-truecaser
default-name: tuning/reference.devtest.tc
2012-05-10 21:54:24 +04:00
ignore-unless: AND output-truecaser use-mira
2012-04-13 18:43:01 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-truecaser -model IN1.$output-extension < IN > OUT
2010-01-25 20:38:53 +03:00
split-reference
2012-05-10 21:54:24 +04:00
in: truecased-reference SPLITTER:splitter-model
2010-01-25 20:38:53 +03:00
out: reference
default-name: tuning/reference.split
pass-unless: output-splitter
2010-05-18 21:39:16 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
2010-01-25 20:38:53 +03:00
template: $output-splitter -model IN1.$output-extension < IN > OUT
2012-04-13 18:43:01 +04:00
split-reference-devtest
2012-05-10 21:54:24 +04:00
in: truecased-reference-devtest SPLITTER:splitter-model
2012-04-13 18:43:01 +04:00
out: reference-devtest
default-name: tuning/reference.devtest.split
pass-unless: output-splitter
2012-04-24 08:17:19 +04:00
ignore-unless: use-mira
2012-04-13 18:43:01 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
2011-06-17 03:43:29 +04:00
filter
2012-10-01 23:36:52 +04:00
in: input TRAINING:phrase-translation-table TRAINING:reordering-table TRAINING:domains
2012-03-22 04:13:02 +04:00
out: filtered-dir
default-name: tuning/filtered
rerun-on-change: filter-settings
pass-if: TRAINING:binarize-all
ignore-if: use-hiero
error: already exists. Please delete
2012-06-07 14:16:52 +04:00
filter-devtest
in: input-devtest TRAINING:phrase-translation-table TRAINING:reordering-table
out: filtered-dir-devtest
2012-10-03 21:53:55 +04:00
default-name: tuning/filtered.devtest
2012-06-07 14:16:52 +04:00
rerun-on-change: filter-settings
pass-if: TRAINING:binarize-all
ignore-unless: use-mira
error: already exists. Please delete
2012-03-22 04:13:02 +04:00
apply-filter
in: TRAINING:config filtered-dir
2011-06-17 03:43:29 +04:00
out: filtered-config
default-name: tuning/moses.filtered.ini
2012-04-07 00:01:59 +04:00
pass-if: TRAINING:binarize-all
2011-06-17 03:43:29 +04:00
ignore-if: use-hiero
2012-03-22 04:13:02 +04:00
template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
2012-06-07 14:16:52 +04:00
apply-filter-devtest
in: TRAINING:config filtered-dir-devtest
2012-04-07 00:01:59 +04:00
out: filtered-config-devtest
default-name: tuning/moses.filtered.devtest.ini
pass-if: TRAINING:binarize-all
2012-10-01 23:36:52 +04:00
ignore-unless: use-mira
2012-06-07 14:16:52 +04:00
template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
2010-01-25 20:38:53 +03:00
tune
2012-04-07 00:01:59 +04:00
in: filtered-config input reference filtered-config-devtest input-devtest reference-devtest
2010-01-25 20:38:53 +03:00
out: weight-config
ignore-if: use-hiero
qsub-script: yes
default-name: tuning/moses.ini
rerun-on-change: decoder-settings tuning-settings nbest lambda async
not-error: trans: No such file or directory
apply-weights
in: TRAINING:config weight-config
out: config-with-reused-weights
ignore-if: use-hiero
2012-10-01 23:36:52 +04:00
default-name: tuning/moses.tuned.ini
2010-05-05 03:04:10 +04:00
template: $moses-script-dir/ems/support/reuse-weights.perl IN1 < IN > OUT
2012-03-21 17:02:47 +04:00
error: cannot open
2010-01-25 20:38:53 +03:00
hiero-tune
in: TRAINING:hiero-config input reference
out: hiero-weight-config
ignore-unless: use-hiero
qsub-script: yes
default-name: hiero-tuning/mert
rerun-on-change: nbest
template: $hiero-mert --nbest $nbest --decoder $hiero-decoder --workdir OUT IN --source-file IN1 --ref-files "IN2*" --no-test
hiero-apply-weights
in: hiero-weight-config TRAINING:hiero-config
out: hiero-config-with-reused-weights
default-name: hiero-tuning/hiero.weight-reused.ini
ignore-unless: use-hiero
template: $hiero-util-dir/apply-weights.pl IN/best.weights < IN1 > OUT
2010-05-05 03:04:10 +04:00
[EVALUATION] multiple
input-from-sgm
in: input-sgm
out: raw-input
ignore-unless: input-sgm
default-name: evaluation/input.txt
template: $moses-script-dir/ems/support/input-from-sgm.perl < IN > OUT
get-input
2010-01-25 20:38:53 +03:00
in: get-corpus-script
out: raw-input
2010-05-05 03:04:10 +04:00
ignore-if: input-sgm
default-name: evaluation/input.txt
2010-01-25 20:38:53 +03:00
template: IN OUT
2010-05-05 03:04:10 +04:00
tokenize-input
2010-01-25 20:38:53 +03:00
in: raw-input
out: tokenized-input
default-name: evaluation/input.tok
pass-unless: input-tokenizer
template: $input-tokenizer < IN > OUT
2010-05-05 03:04:10 +04:00
parse-input
2010-01-25 20:38:53 +03:00
in: tokenized-input
out: parsed-input
default-name: evaluation/input.parsed
pass-unless: input-parser
template: $input-parser < IN > OUT
2010-05-05 03:04:10 +04:00
# Runs $input-parse-relaxer over the parsed evaluation input.
# default-name must stay under evaluation/: the TUNING parse-relax-input step
# already uses tuning/input.parse-relaxed as its default-name, so reusing that
# name here would make both steps default to the same file name.
parse-relax-input
2010-01-25 20:38:53 +03:00
in: parsed-input
out: parse-relaxed-input
default-name: evaluation/input.parse-relaxed
pass-unless: input-parse-relaxer
template: $input-parse-relaxer < IN.$input-extension > OUT.$input-extension
2010-05-05 03:04:10 +04:00
factorize-input
2010-01-25 20:38:53 +03:00
in: parse-relaxed-input
out: factorized-input
default-name: evaluation/input.factorized
rerun-on-change: TRAINING:input-factors
pass-unless: TRAINING:input-factors
error: can't open
error: incompatible number of words in factor
2010-05-05 03:04:10 +04:00
lowercase-input
2010-01-25 20:38:53 +03:00
in: factorized-input
2012-05-10 21:54:24 +04:00
out: truecased-input
2010-01-25 20:38:53 +03:00
default-name: evaluation/input.lc
pass-unless: input-lowercaser
ignore-if: input-truecaser
template: $input-lowercaser < IN > OUT
2010-05-05 03:04:10 +04:00
truecase-input
2010-01-25 20:38:53 +03:00
in: factorized-input TRUECASER:truecase-model
2012-05-10 21:54:24 +04:00
out: truecased-input
2010-01-25 20:38:53 +03:00
default-name: evaluation/input.tc
rerun-on-change: input-truecaser
ignore-unless: input-truecaser
template: $input-truecaser -model IN1.$input-extension < IN > OUT
2010-05-05 03:04:10 +04:00
split-input
2012-05-10 21:54:24 +04:00
in: truecased-input SPLITTER:splitter-model
2010-01-25 20:38:53 +03:00
out: input
default-name: evaluation/input.split
pass-unless: input-splitter
template: $input-splitter -model IN1.$input-extension < IN > OUT
2011-06-17 03:43:29 +04:00
filter
2012-10-01 23:36:52 +04:00
in: input TRAINING:phrase-translation-table TRAINING:reordering-table TRAINING:domains
2012-03-22 04:13:02 +04:00
out: filtered-dir
default-name: evaluation/filtered
rerun-on-change: filter-settings report-precision-by-coverage
pass-if: TRAINING:binarize-all
ignore-if: use-hiero
error: already exists. Please delete
apply-filter
in: TUNING:config-with-reused-weights filtered-dir
2011-06-17 03:43:29 +04:00
out: filtered-config
default-name: evaluation/filtered.ini
pass-if: TRAINING:binarize-all
2012-03-22 04:13:02 +04:00
ignore-if: use-hiero
template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
2011-06-17 03:43:29 +04:00
# decode: consumes filtered-config and input, produces system-output.
# hiero-decode declares the same out/default-name with ignore-unless: use-hiero,
# the opposite of this step's ignore-if: use-hiero.
# NOTE(review): there is no template line here (hiero-decode has one), so the
# command is presumably assembled by the driver script -- confirm.
decode
in: filtered-config input
2010-01-25 20:38:53 +03:00
out: system-output
default-name: evaluation/output
qsub-script: yes
ignore-if: use-hiero
2012-10-01 23:36:52 +04:00
rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph
2010-01-25 20:38:53 +03:00
error: Translation was not performed correctly
not-error: trans: No such file or directory
hiero-decode
in: TUNING:hiero-config-with-reused-weights input
out: system-output
default-name: evaluation/output
qsub-script: yes
ignore-unless: use-hiero
template: $hiero-parallelizer -e OUT.edir -r -- $hiero-decoder -c IN < IN1 > OUT
rerun-on-change: hiero-decoder
2010-05-07 15:28:55 +04:00
remove-markup
in: system-output
out: cleaned-output
default-name: evaluation/cleaned
2010-10-21 13:49:27 +04:00
pass-if: TRAINING:hierarchical-rule-set
2010-05-07 15:28:55 +04:00
pass-unless: report-segmentation
template: $moses-script-dir/ems/support/remove-segmenation-markup.perl < IN > OUT
2010-05-05 03:04:10 +04:00
# recase-output: runs $recaser on cleaned-output (IN) with the model from
# RECASING:recase-config (IN1) to produce recased-output.
# detruecase-output declares the same out: recased-output with
# ignore-unless: output-truecaser, the opposite of this step's
# ignore-if: output-truecaser.
recase-output
2010-05-07 15:28:55 +04:00
in: cleaned-output RECASING:recase-config
2010-01-25 20:38:53 +03:00
out: recased-output
default-name: evaluation/recased
pass-unless: recaser
ignore-if: output-truecaser
template: $recaser -moses $RECASING:decoder -in IN -model IN1 > OUT
2010-05-05 03:04:10 +04:00
detruecase-output
2010-05-07 15:28:55 +04:00
in: cleaned-output
2010-01-25 20:38:53 +03:00
out: recased-output
default-name: evaluation/truecased
ignore-unless: output-truecaser
template: $detruecaser < IN > OUT
2010-05-05 03:04:10 +04:00
detokenize-output
2010-01-25 20:38:53 +03:00
in: recased-output
out: detokenized-output
default-name: evaluation/detokenized
pass-unless: detokenizer
template: $detokenizer < IN > OUT
wrap
in: detokenized-output
out: wrapped-output
default-name: evaluation/detokenized.sgm
rerun-on-change: wrapping-frame use-hiero
template: $wrapping-script $wrapping-frame < IN > OUT
error: Use of uninitialized value in pattern match
2010-05-05 03:04:10 +04:00
reference-from-sgm
in: reference-sgm input-sgm
out: raw-reference
default-name: evaluation/reference.txt
template: $moses-script-dir/ems/support/reference-from-sgm.perl IN IN1 OUT
2010-01-25 20:38:53 +03:00
tokenize-reference
in: raw-reference
out: tokenized-reference
2010-05-05 03:04:10 +04:00
default-name: evaluation/reference.tok
2010-01-25 20:38:53 +03:00
pass-unless: output-tokenizer
2010-05-18 21:39:16 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
2010-01-25 20:38:53 +03:00
template: $output-tokenizer < IN > OUT
lowercase-reference
in: tokenized-reference
out: reference
2010-05-05 03:04:10 +04:00
default-name: evaluation/reference
2010-01-25 20:38:53 +03:00
pass-unless: output-lowercaser
2010-05-18 21:39:16 +04:00
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
2010-01-25 20:38:53 +03:00
template: $output-lowercaser < IN > OUT
nist-bleu
in: wrapped-output reference-sgm
out: nist-bleu-score
default-name: evaluation/nist-bleu
ignore-unless: nist-bleu
rerun-on-change: nist-bleu
error: Illegal division by zero
template: $nist-bleu -s $input-sgm -r IN1 -t IN > OUT
nist-bleu-c
in: wrapped-output reference-sgm
out: nist-bleu-c-score
default-name: evaluation/nist-bleu-c
ignore-unless: nist-bleu-c
rerun-on-change: nist-bleu-c
error: Illegal division by zero
template: $nist-bleu-c -c -s $input-sgm -r IN1 -t IN > OUT
ibm-bleu
in: wrapped-output reference-sgm
out: ibm-bleu-score
default-name: evaluation/ibm-bleu
ignore-unless: ibm-bleu
rerun-on-change: ibm-bleu
template: $ibm-bleu -ci -s $input-sgm -r IN1 -t IN > OUT
ibm-bleu-c
in: wrapped-output reference-sgm
out: ibm-bleu-c-score
default-name: evaluation/ibm-bleu-c
ignore-unless: ibm-bleu-c
rerun-on-change: ibm-bleu-c
template: $ibm-bleu-c -s $input-sgm -r IN1 -t IN > OUT
2012-05-26 03:06:34 +04:00
bolt-bleu
in: detokenized-output
out: bolt-bleu-score
default-name: evaluation/bolt-bleu
ignore-unless: bolt-bleu
rerun-on-change: bolt-bleu
template: $bolt-bleu IN > OUT
bolt-bleu-c
in: detokenized-output
out: bolt-bleu-c-score
default-name: evaluation/bolt-bleu-c
ignore-unless: bolt-bleu-c
rerun-on-change: bolt-bleu-c
template: $bolt-bleu-c IN > OUT
2010-01-25 20:38:53 +03:00
multi-bleu
2011-01-13 07:53:09 +03:00
in: cleaned-output reference
2010-01-25 20:38:53 +03:00
out: multi-bleu-score
default-name: evaluation/multi-bleu
ignore-unless: multi-bleu
rerun-on-change: multi-bleu
template: $multi-bleu IN1 < IN > OUT
2011-01-13 07:53:09 +03:00
multi-bleu-c
in: recased-output tokenized-reference
out: multi-bleu-c-score
default-name: evaluation/multi-bleu-c
ignore-unless: multi-bleu-c
rerun-on-change: multi-bleu-c
template: $multi-bleu-c IN1 < IN > OUT
2010-01-25 20:38:53 +03:00
ter
in: wrapped-output reference-sgm
out: ter-score
default-name: evaluation/detokenized.sgm.TER
ignore-unless: ter
rerun-on-change: ter
wer
in: recased-output reference
out: wer-score
default-name: evaluation/wer
ignore-unless: wer
rerun-on-change: wer
template: $wer IN IN1 > OUT
meteor
2012-03-21 17:02:47 +04:00
in: cleaned-output reference
2010-01-25 20:38:53 +03:00
out: meteor-score
2012-03-21 17:02:47 +04:00
default-name: evaluation/meteor
2010-01-25 20:38:53 +03:00
ignore-unless: meteor
rerun-on-change: meteor
2012-03-21 17:02:47 +04:00
template: $meteor IN IN1 $meteor-params > OUT
2010-05-05 03:04:10 +04:00
analysis
2010-05-07 15:28:55 +04:00
in: recased-output reference input
2010-05-05 03:04:10 +04:00
out: analysis
default-name: evaluation/analysis
2011-02-23 13:27:54 +03:00
ignore-if: report-precision-by-coverage
2010-05-05 03:04:10 +04:00
ignore-unless: analysis
2012-10-01 23:36:52 +04:00
rerun-on-change: analyze-search-graph
2010-05-07 15:28:55 +04:00
analysis-coverage
in: input TRAINING:corpus TRAINING:phrase-translation-table
out: analysis-coverage
default-name: evaluation/analysis
2010-05-18 21:39:16 +04:00
ignore-unless: AND analysis analyze-coverage
2011-02-23 13:27:54 +03:00
rerun-on-change: score-settings
analysis-precision
2011-06-17 03:43:29 +04:00
in: recased-output reference input TRAINING:corpus TRAINING:phrase-translation-table analysis-coverage
2011-02-23 13:27:54 +03:00
out: analysis
default-name: evaluation/analysis
ignore-unless: AND analysis analyze-coverage report-precision-by-coverage
2011-06-17 03:43:29 +04:00
rerun-on-change: precision-by-coverage-base
2010-01-25 20:38:53 +03:00
2010-05-05 03:04:10 +04:00
[REPORTING] single
2010-01-25 20:38:53 +03:00
report
2012-10-01 23:36:52 +04:00
in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model
2010-01-25 20:38:53 +03:00
out: report
default-name: evaluation/report