mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-19 15:17:10 +03:00
updates to EMS: mteval-v13a.pl, parallel preparation, better paths and defaults in examples
This commit is contained in:
parent
3ab37ca321
commit
b95c372e3a
@ -18,25 +18,34 @@ pair-extension = fr-en
|
||||
# moses
|
||||
moses-src-dir = /home/pkoehn/moses
|
||||
#
|
||||
# moses binaries
|
||||
moses-bin-dir = $moses-src-dir/dist/bin
|
||||
#
|
||||
# moses scripts
|
||||
moses-script-dir = /home/pkoehn/moses/scripts
|
||||
moses-script-dir = $moses-src-dir/scripts
|
||||
#
|
||||
# srilm
|
||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||
#
|
||||
# irstlm
|
||||
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||
#
|
||||
# randlm
|
||||
randlm-dir = $moses-src-dir/randlm/bin
|
||||
#
|
||||
# data
|
||||
wmt10-data = $working-dir/data
|
||||
|
||||
### basic tools
|
||||
#
|
||||
# moses decoder
|
||||
decoder = $moses-src-dir/dist/bin/moses
|
||||
decoder = $moses-bin-dir/moses
|
||||
|
||||
# conversion of phrase table into binary on-disk format
|
||||
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
|
||||
ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||
|
||||
# conversion of rule table into binary on-disk format
|
||||
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
||||
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||
|
||||
# tokenizers - comment out if all your data is already tokenized
|
||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||
@ -138,27 +147,21 @@ order = 5
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#type = 8
|
||||
|
||||
#
|
||||
# if binarized, set type (default srilm; if binarized: irstlm)
|
||||
#
|
||||
# set to 8 when using kenlm
|
||||
#type = 8
|
||||
lm-binarizer = $moses-bin-dir/build_binary
|
||||
type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
### each language model to be used has its own section here
|
||||
|
||||
@ -219,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#type = 8
|
||||
lm-binarizer = $moses-bin-dir/build_binary
|
||||
type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
#################################################################
|
||||
# TRANSLATION MODEL TRAINING
|
||||
@ -261,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
|
||||
#generation-factors = "word -> pos"
|
||||
#decoding-steps = "t0, g0"
|
||||
|
||||
### parallelization of data preparation step
|
||||
# the two directions of the data preparation can be run in parallel
|
||||
# comment out if not needed
|
||||
#
|
||||
parallel = yes
|
||||
|
||||
### pre-computation for giza++
|
||||
# giza++ has a more efficient data structure that needs to be
|
||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||
# memory requirements. set here the number of parts
|
||||
#
|
||||
run-giza-in-parts = 5
|
||||
#run-giza-in-parts = 5
|
||||
|
||||
### symmetrization method to obtain word alignments from giza output
|
||||
# (commonly used: grow-diag-final-and)
|
||||
@ -355,7 +364,7 @@ score-settings = "--GoodTuring"
|
||||
### tuning script to be used
|
||||
#
|
||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
||||
tuning-settings = "-mertdir $moses-bin-dir"
|
||||
|
||||
### specify the corpus used for tuning
|
||||
# it should contain 1000s of sentences
|
||||
@ -394,14 +403,14 @@ decoder-settings = ""
|
||||
# and also point to a configuration file that contains
|
||||
# pointers to all relevant model files
|
||||
#
|
||||
#config =
|
||||
#config-with-reused-weights =
|
||||
|
||||
#########################################################
|
||||
## RECASER: restore case, this part only trains the model
|
||||
|
||||
[RECASING]
|
||||
|
||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
||||
#decoder = $moses-bin-dir/moses
|
||||
|
||||
### training data
|
||||
# raw input still needs to be tokenized,
|
||||
@ -448,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
|
||||
### additional decoder settings
|
||||
# switches for the Moses decoder
|
||||
# common choices:
|
||||
# "-threads N" for multi-threading
|
||||
# "-mbr" for MBR decoding
|
||||
# "-drop-unknown" for dropping unknown source words
|
||||
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||
#
|
||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||
|
||||
@ -470,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
||||
|
||||
### BLEU
|
||||
#
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||
#ibm-bleu =
|
||||
|
||||
|
@ -18,25 +18,34 @@ pair-extension = fr-en
|
||||
# moses
|
||||
moses-src-dir = /home/pkoehn/moses
|
||||
#
|
||||
# moses binaries
|
||||
moses-bin-dir = $moses-src-dir/dist/bin
|
||||
#
|
||||
# moses scripts
|
||||
moses-script-dir = /home/pkoehn/moses/scripts
|
||||
moses-script-dir = $moses-src-dir/scripts
|
||||
#
|
||||
# srilm
|
||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||
#
|
||||
# irstlm
|
||||
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||
#
|
||||
# randlm
|
||||
randlm-dir = $moses-src-dir/randlm/bin
|
||||
#
|
||||
# data
|
||||
wmt10-data = $working-dir/data
|
||||
|
||||
### basic tools
|
||||
#
|
||||
# moses decoder
|
||||
decoder = $moses-src-dir/dist/bin/moses
|
||||
decoder = $moses-bin-dir/moses
|
||||
|
||||
# conversion of phrase table into binary on-disk format
|
||||
ttable-binarizer = $moses-src-dir/misc/processPhraseTable
|
||||
ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||
|
||||
# conversion of rule table into binary on-disk format
|
||||
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
||||
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||
|
||||
# tokenizers - comment out if all your data is already tokenized
|
||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||
@ -132,27 +141,27 @@ order = 5
|
||||
### tool to be used for training randomized language model from scratch
|
||||
# (more commonly, a SRILM is trained)
|
||||
#
|
||||
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
### script to use for binary table format for irstlm or kenlm
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#lm-binarizer = $moses-bin-dir/build_binary
|
||||
#type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
### each language model to be used has its own section here
|
||||
|
||||
@ -218,21 +227,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#lm-binarizer = $moses-bin-dir/build_binary
|
||||
#type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
#################################################################
|
||||
# FACTOR DEFINITION
|
||||
@ -275,12 +284,18 @@ reordering-factors = "word -> word"
|
||||
#generation-factors =
|
||||
decoding-steps = "t0"
|
||||
|
||||
### parallelization of data preparation step
|
||||
# the two directions of the data preparation can be run in parallel
|
||||
# comment out if not needed
|
||||
#
|
||||
parallel = yes
|
||||
|
||||
### pre-computation for giza++
|
||||
# giza++ has a more efficient data structure that needs to be
|
||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||
# memory requirements. set here the number of parts
|
||||
#
|
||||
run-giza-in-parts = 5
|
||||
#run-giza-in-parts = 5
|
||||
|
||||
### symmetrization method to obtain word alignments from giza output
|
||||
# (commonly used: grow-diag-final-and)
|
||||
@ -354,7 +369,7 @@ score-settings = "--GoodTuring"
|
||||
# point to a configuration file that contains
|
||||
# pointers to all relevant model files
|
||||
#
|
||||
#config =
|
||||
#config-with-reused-weights =
|
||||
|
||||
#####################################################
|
||||
### TUNING: finding good weights for model components
|
||||
@ -369,7 +384,7 @@ score-settings = "--GoodTuring"
|
||||
### tuning script to be used
|
||||
#
|
||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
||||
tuning-settings = "-mertdir $moses-bin-dir"
|
||||
|
||||
### specify the corpus used for tuning
|
||||
# it should contain 1000s of sentences
|
||||
@ -415,7 +430,7 @@ decoder-settings = ""
|
||||
|
||||
[RECASING]
|
||||
|
||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
||||
#decoder = $moses-bin-dir/moses
|
||||
|
||||
### training data
|
||||
# raw input still needs to be tokenized,
|
||||
@ -462,6 +477,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
|
||||
### additional decoder settings
|
||||
# switches for the Moses decoder
|
||||
# common choices:
|
||||
# "-threads N" for multi-threading
|
||||
# "-mbr" for MBR decoding
|
||||
# "-drop-unknown" for dropping unknown source words
|
||||
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||
#
|
||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||
|
||||
@ -484,8 +504,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
||||
|
||||
### BLEU
|
||||
#
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||
#ibm-bleu =
|
||||
|
||||
|
@ -18,25 +18,34 @@ pair-extension = fr-en
|
||||
# moses
|
||||
moses-src-dir = /home/pkoehn/moses
|
||||
#
|
||||
# moses binaries
|
||||
moses-bin-dir = $moses-src-dir/dist/bin
|
||||
#
|
||||
# moses scripts
|
||||
moses-script-dir = /home/pkoehn/moses/scripts
|
||||
moses-script-dir = $moses-src-dir/scripts
|
||||
#
|
||||
# srilm
|
||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||
#
|
||||
# irstlm
|
||||
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||
#
|
||||
# randlm
|
||||
randlm-dir = $moses-src-dir/randlm/bin
|
||||
#
|
||||
# data
|
||||
wmt10-data = $working-dir/data
|
||||
|
||||
### basic tools
|
||||
#
|
||||
# moses decoder
|
||||
decoder = $moses-src-dir/dist/bin/moses_chart
|
||||
decoder = $moses-bin-dir/moses_chart
|
||||
|
||||
# conversion of phrase table into binary on-disk format
|
||||
#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
|
||||
#ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||
|
||||
# conversion of rule table into binary on-disk format
|
||||
ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
||||
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||
|
||||
# tokenizers - comment out if all your data is already tokenized
|
||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||
@ -132,27 +141,27 @@ order = 5
|
||||
### tool to be used for training randomized language model from scratch
|
||||
# (more commonly, a SRILM is trained)
|
||||
#
|
||||
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
### script to use for binary table format for irstlm or kenlm
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#type = 8
|
||||
lm-binarizer = $moses-bin-dir/build_binary
|
||||
type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
### each language model to be used has its own section here
|
||||
|
||||
@ -213,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#type = 8
|
||||
lm-binarizer = $moses-bin-dir/build_binary
|
||||
type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
#################################################################
|
||||
# TRANSLATION MODEL TRAINING
|
||||
@ -255,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
|
||||
#generation-factors = "word -> pos"
|
||||
#decoding-steps = "t0, g0"
|
||||
|
||||
### parallelization of data preparation step
|
||||
# the two directions of the data preparation can be run in parallel
|
||||
# comment out if not needed
|
||||
#
|
||||
parallel = yes
|
||||
|
||||
### pre-computation for giza++
|
||||
# giza++ has a more efficient data structure that needs to be
|
||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||
# memory requirements. set here the number of parts
|
||||
#
|
||||
run-giza-in-parts = 5
|
||||
#run-giza-in-parts = 5
|
||||
|
||||
### symmetrization method to obtain word alignments from giza output
|
||||
# (commonly used: grow-diag-final-and)
|
||||
@ -334,7 +349,7 @@ score-settings = "--GoodTuring"
|
||||
# point to a configuration file that contains
|
||||
# pointers to all relevant model files
|
||||
#
|
||||
#config =
|
||||
#config-with-reused-weights =
|
||||
|
||||
#####################################################
|
||||
### TUNING: finding good weights for model components
|
||||
@ -349,7 +364,7 @@ score-settings = "--GoodTuring"
|
||||
### tuning script to be used
|
||||
#
|
||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
||||
tuning-settings = "-mertdir $moses-bin-dir"
|
||||
|
||||
### specify the corpus used for tuning
|
||||
# it should contain 1000s of sentences
|
||||
@ -395,7 +410,7 @@ decoder-settings = ""
|
||||
|
||||
[RECASING]
|
||||
|
||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
||||
#decoder = $moses-bin-dir/moses
|
||||
|
||||
### training data
|
||||
# raw input still needs to be tokenized,
|
||||
@ -442,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
|
||||
### additional decoder settings
|
||||
# switches for the Moses decoder
|
||||
# common choices:
|
||||
# "-threads N" for multi-threading
|
||||
# "-mbr" for MBR decoding
|
||||
# "-drop-unknown" for dropping unknown source words
|
||||
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||
#
|
||||
#decoder-settings = ""
|
||||
|
||||
@ -464,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
||||
|
||||
### BLEU
|
||||
#
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||
#ibm-bleu =
|
||||
|
||||
|
@ -18,25 +18,34 @@ pair-extension = fr-en
|
||||
# moses
|
||||
moses-src-dir = /home/pkoehn/moses
|
||||
#
|
||||
# moses binaries
|
||||
moses-bin-dir = $moses-src-dir/dist/bin
|
||||
#
|
||||
# moses scripts
|
||||
moses-script-dir = /home/pkoehn/moses/scripts
|
||||
moses-script-dir = $moses-src-dir/scripts
|
||||
#
|
||||
# srilm
|
||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||
#
|
||||
# irstlm
|
||||
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||
#
|
||||
# randlm
|
||||
randlm-dir = $moses-src-dir/randlm/bin
|
||||
#
|
||||
# data
|
||||
wmt10-data = $working-dir/data
|
||||
|
||||
### basic tools
|
||||
#
|
||||
# moses decoder
|
||||
decoder = $moses-src-dir/dist/bin/moses_chart
|
||||
decoder = $moses-bin-dir/moses_chart
|
||||
|
||||
# conversion of phrase table into binary on-disk format
|
||||
#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
|
||||
#ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||
|
||||
# conversion of rule table into binary on-disk format
|
||||
ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
||||
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||
|
||||
# tokenizers - comment out if all your data is already tokenized
|
||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||
@ -136,27 +145,27 @@ order = 5
|
||||
### tool to be used for training randomized language model from scratch
|
||||
# (more commonly, a SRILM is trained)
|
||||
#
|
||||
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
### script to use for binary table format for irstlm or kenlm
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#type = 8
|
||||
lm-binarizer = $moses-bin-dir/build_binary
|
||||
type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
### each language model to be used has its own section here
|
||||
|
||||
@ -217,21 +226,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#type = 8
|
||||
lm-binarizer = $moses-bin-dir/build_binary
|
||||
type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
#################################################################
|
||||
# TRANSLATION MODEL TRAINING
|
||||
@ -259,12 +268,18 @@ script = $moses-script-dir/training/train-model.perl
|
||||
#generation-factors = "word -> pos"
|
||||
#decoding-steps = "t0, g0"
|
||||
|
||||
### parallelization of data preparation step
|
||||
# the two directions of the data preparation can be run in parallel
|
||||
# comment out if not needed
|
||||
#
|
||||
parallel = yes
|
||||
|
||||
### pre-computation for giza++
|
||||
# giza++ has a more efficient data structure that needs to be
|
||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||
# memory requirements. set here the number of parts
|
||||
#
|
||||
run-giza-in-parts = 5
|
||||
#run-giza-in-parts = 5
|
||||
|
||||
### symmetrization method to obtain word alignments from giza output
|
||||
# (commonly used: grow-diag-final-and)
|
||||
@ -338,7 +353,7 @@ score-settings = "--GoodTuring"
|
||||
# point to a configuration file that contains
|
||||
# pointers to all relevant model files
|
||||
#
|
||||
#config =
|
||||
#config-with-reused-weights =
|
||||
|
||||
#####################################################
|
||||
### TUNING: finding good weights for model components
|
||||
@ -353,7 +368,7 @@ score-settings = "--GoodTuring"
|
||||
### tuning script to be used
|
||||
#
|
||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
||||
tuning-settings = "-mertdir $moses-bin-dir"
|
||||
|
||||
### specify the corpus used for tuning
|
||||
# it should contain 1000s of sentences
|
||||
@ -399,7 +414,7 @@ decoder-settings = ""
|
||||
|
||||
[RECASING]
|
||||
|
||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
||||
#decoder = $moses-bin-dir/moses
|
||||
|
||||
### training data
|
||||
# raw input still needs to be tokenized,
|
||||
@ -446,6 +461,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
|
||||
### additional decoder settings
|
||||
# switches for the Moses decoder
|
||||
# common choices:
|
||||
# "-threads N" for multi-threading
|
||||
# "-mbr" for MBR decoding
|
||||
# "-drop-unknown" for dropping unknown source words
|
||||
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||
#
|
||||
#decoder-settings = ""
|
||||
|
||||
@ -468,8 +488,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
||||
|
||||
### BLEU
|
||||
#
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||
#ibm-bleu =
|
||||
|
||||
|
@ -18,25 +18,34 @@ pair-extension = fr-en
|
||||
# moses
|
||||
moses-src-dir = /home/pkoehn/moses
|
||||
#
|
||||
# moses binaries
|
||||
moses-bin-dir = $moses-src-dir/dist/bin
|
||||
#
|
||||
# moses scripts
|
||||
moses-script-dir = /home/pkoehn/moses/scripts
|
||||
moses-script-dir = $moses-src-dir/scripts
|
||||
#
|
||||
# srilm
|
||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||
#
|
||||
# irstlm
|
||||
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||
#
|
||||
# randlm
|
||||
randlm-dir = $moses-src-dir/randlm/bin
|
||||
#
|
||||
# data
|
||||
toy-data = $moses-script-dir/ems/example/data
|
||||
|
||||
### basic tools
|
||||
#
|
||||
# moses decoder
|
||||
decoder = $moses-src-dir/dist/bin/moses
|
||||
decoder = $moses-bin-dir/moses
|
||||
|
||||
# conversion of phrase table into binary on-disk format
|
||||
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
|
||||
ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||
|
||||
# conversion of rule table into binary on-disk format
|
||||
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
||||
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||
|
||||
# tokenizers - comment out if all your data is already tokenized
|
||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||
@ -126,27 +135,27 @@ order = 5
|
||||
### tool to be used for training randomized language model from scratch
|
||||
# (more commonly, a SRILM is trained)
|
||||
#
|
||||
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
### script to use for binary table format for irstlm or kenlm
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#type = 8
|
||||
lm-binarizer = $moses-bin-dir/build_binary
|
||||
type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
### each language model to be used has its own section here
|
||||
|
||||
@ -197,21 +206,21 @@ raw-corpus = $toy-data/nc-5k.$output-extension
|
||||
# (default: no binarization)
|
||||
|
||||
# irstlm
|
||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
||||
#lm-binarizer = $irstlm-dir/compile-lm
|
||||
|
||||
# kenlm, also set type to 8
|
||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
||||
#type = 8
|
||||
lm-binarizer = $moses-bin-dir/build_binary
|
||||
type = 8
|
||||
|
||||
### script to create quantized language model format (irstlm)
|
||||
# (default: no quantization)
|
||||
#
|
||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
||||
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||
|
||||
### script to use for converting into randomized table format
|
||||
# (default: no randomization)
|
||||
#
|
||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
||||
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||
|
||||
#################################################################
|
||||
# TRANSLATION MODEL TRAINING
|
||||
@ -239,12 +248,18 @@ script = $moses-script-dir/training/train-model.perl
|
||||
#generation-factors = "word -> pos"
|
||||
#decoding-steps = "t0, g0"
|
||||
|
||||
### parallelization of data preparation step
|
||||
# the two directions of the data preparation can be run in parallel
|
||||
# comment out if not needed
|
||||
#
|
||||
parallel = yes
|
||||
|
||||
### pre-computation for giza++
|
||||
# giza++ has a more efficient data structure that needs to be
|
||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||
# memory requirements. set here the number of parts
|
||||
#
|
||||
run-giza-in-parts = 5
|
||||
#run-giza-in-parts = 5
|
||||
|
||||
### symmetrization method to obtain word alignments from giza output
|
||||
# (commonly used: grow-diag-final-and)
|
||||
@ -318,7 +333,7 @@ score-settings = "--GoodTuring"
|
||||
# point to a configuration file that contains
|
||||
# pointers to all relevant model files
|
||||
#
|
||||
#config =
|
||||
#config-with-reused-weights =
|
||||
|
||||
#####################################################
|
||||
### TUNING: finding good weights for model components
|
||||
@ -333,7 +348,7 @@ weight-config = $toy-data/weight.ini
|
||||
### tuning script to be used
|
||||
#
|
||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
||||
tuning-settings = "-mertdir $moses-bin-dir"
|
||||
|
||||
### specify the corpus used for tuning
|
||||
# it should contain 1000s of sentences
|
||||
@ -379,7 +394,7 @@ decoder-settings = ""
|
||||
|
||||
[RECASING]
|
||||
|
||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
||||
#decoder = $moses-bin-dir/moses
|
||||
|
||||
### training data
|
||||
# raw input still needs to be tokenized,
|
||||
@ -422,6 +437,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
||||
|
||||
### additional decoder settings
|
||||
# switches for the Moses decoder
|
||||
# common choices:
|
||||
# "-threads N" for multi-threading
|
||||
# "-mbr" for MBR decoding
|
||||
# "-drop-unknown" for dropping unknown source words
|
||||
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||
#
|
||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||
|
||||
@ -444,8 +464,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
||||
|
||||
### BLEU
|
||||
#
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
||||
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||
#ibm-bleu =
|
||||
|
||||
|
@ -1275,7 +1275,8 @@ sub check_if_crashed {
|
||||
foreach my $pattern (@{$ERROR{&defined_step_id($i)}},
|
||||
'error','killed','core dumped','can\'t read',
|
||||
'no such file or directory','unknown option',
|
||||
'died at','exit code','permission denied') {
|
||||
'died at','exit code','permission denied',
|
||||
"Can't locate") {
|
||||
if (/$pattern/i) {
|
||||
my $not_error = 0;
|
||||
if (defined($NOT_ERROR{&defined_step_id($i)})) {
|
||||
|
@ -553,6 +553,7 @@ sub bleu_score {
|
||||
my $score = 0;
|
||||
my $iscore = 0;
|
||||
my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
|
||||
print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";
|
||||
|
||||
for (my $j=1; $j<=$max_Ngram; $j++) {
|
||||
if ($matching_ngrams->[$j] == 0) {
|
||||
|
1168
scripts/generic/mteval-v13a.pl
Executable file
1168
scripts/generic/mteval-v13a.pl
Executable file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user