updates to EMS: mteval-v13a.pl, parallel preparation, better paths and defaults in examples

Philipp Koehn 2011-12-21 04:26:27 +00:00
parent 3ab37ca321
commit b95c372e3a
8 changed files with 1373 additions and 109 deletions

View File

@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 wmt10-data = $working-dir/data
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses
 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable
 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -138,27 +147,21 @@ order = 5
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
-#
-# if binarized, set type (default srilm; if binarized: irstlm)
-#
-# set to 8 when using kenlm
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 ### each language model to be used has its own section here
@@ -219,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -261,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -355,7 +364,7 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
@@ -394,14 +403,14 @@ decoder-settings = ""
 # and also point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 #########################################################
 ## RECASER: restore case, this part only trains the model
 [RECASING]
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 ### training data
 # raw input needs to be still tokenized,
@@ -448,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@@ -470,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =
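
The first hunk above introduces moses-bin-dir, irstlm-dir and randlm-dir so that the remaining tool paths in the example configuration are derived from a single moses-src-dir setting. The sketch below illustrates how such $name references compose once expanded; the expand() helper is a hypothetical illustration of the substitution idea, not the actual resolution code in experiment.perl.

# Hypothetical sketch of EMS-style "$name" expansion. The %setting values are
# taken from the example config above; expand() itself is illustrative only.
use strict;
use warnings;

my %setting = (
    'moses-src-dir' => '/home/pkoehn/moses',
    'moses-bin-dir' => '$moses-src-dir/dist/bin',
    'decoder'       => '$moses-bin-dir/moses',
);

sub expand {
    my ($value) = @_;
    while ($value =~ /\$([a-z0-9-]+)/i) {
        my $name = $1;
        last unless exists $setting{$name};     # leave unknown names untouched
        my $expanded = expand($setting{$name}); # resolve nested references first
        $value =~ s/\$\Q$name\E/$expanded/;
    }
    return $value;
}

print expand($setting{'decoder'}), "\n";   # prints /home/pkoehn/moses/dist/bin/moses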

View File

@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 wmt10-data = $working-dir/data
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses
 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/misc/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable
 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -132,27 +141,27 @@ order = 5
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
+#lm-binarizer = $moses-bin-dir/build_binary
 #type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 ### each language model to be used has its own section here
@@ -218,21 +227,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
+#lm-binarizer = $moses-bin-dir/build_binary
 #type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 #################################################################
 # FACTOR DEFINITION
@@ -275,12 +284,18 @@ reordering-factors = "word -> word"
 #generation-factors =
 decoding-steps = "t0"
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -354,7 +369,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 #####################################################
 ### TUNING: finding good weights for model components
@@ -369,7 +384,7 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
@@ -415,7 +430,7 @@ decoder-settings = ""
 [RECASING]
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 ### training data
 # raw input needs to be still tokenized,
@@ -462,6 +477,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@@ -484,8 +504,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =

View File

@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 wmt10-data = $working-dir/data
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses_chart
+decoder = $moses-bin-dir/moses_chart
 # conversion of phrase table into binary on-disk format
-#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+#ttable-binarizer = $moses-bin-dir/processPhraseTable
 # conversion of rule table into binary on-disk format
-ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -132,27 +141,27 @@ order = 5
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 ### each language model to be used has its own section here
@@ -213,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -255,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -334,7 +349,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 #####################################################
 ### TUNING: finding good weights for model components
@@ -349,7 +364,7 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
@@ -395,7 +410,7 @@ decoder-settings = ""
 [RECASING]
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 ### training data
 # raw input needs to be still tokenized,
@@ -442,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 #decoder-settings = ""
@@ -464,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =

View File

@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 wmt10-data = $working-dir/data
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses_chart
+decoder = $moses-bin-dir/moses_chart
 # conversion of phrase table into binary on-disk format
-#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+#ttable-binarizer = $moses-bin-dir/processPhraseTable
 # conversion of rule table into binary on-disk format
-ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -136,27 +145,27 @@ order = 5
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 ### each language model to be used has its own section here
@@ -217,21 +226,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -259,12 +268,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -338,7 +353,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 #####################################################
 ### TUNING: finding good weights for model components
@@ -353,7 +368,7 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
@@ -399,7 +414,7 @@ decoder-settings = ""
 [RECASING]
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 ### training data
 # raw input needs to be still tokenized,
@@ -446,6 +461,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 #decoder-settings = ""
@@ -468,8 +488,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =

View File

@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 toy-data = $moses-script-dir/ems/example/data
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses
 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable
 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -126,27 +135,27 @@ order = 5
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 ### each language model to be used has its own section here
@@ -197,21 +206,21 @@ raw-corpus = $toy-data/nc-5k.$output-extension
 # (default: no binarization)
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -239,12 +248,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -318,7 +333,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 #####################################################
 ### TUNING: finding good weights for model components
@@ -333,7 +348,7 @@ weight-config = $toy-data/weight.ini
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
@@ -379,7 +394,7 @@ decoder-settings = ""
 [RECASING]
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 ### training data
 # raw input needs to be still tokenized,
@@ -422,6 +437,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@@ -444,8 +464,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =

View File

@@ -1275,7 +1275,8 @@ sub check_if_crashed {
     foreach my $pattern (@{$ERROR{&defined_step_id($i)}},
                          'error','killed','core dumped','can\'t read',
                          'no such file or directory','unknown option',
-                         'died at','exit code','permission denied') {
+                         'died at','exit code','permission denied',
+                         "Can't locate") {
       if (/$pattern/i) {
         my $not_error = 0;
         if (defined($NOT_ERROR{&defined_step_id($i)})) {
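
The hunk above extends the crash detection in experiment.perl: "Can't locate", the message Perl emits when a required module or script cannot be found, joins the patterns that mark a step's log as failed. Below is a minimal standalone sketch of this kind of case-insensitive pattern scan over a log file; it is a hypothetical helper for illustration, not the check_if_crashed() code itself.

# Minimal sketch of pattern-based crash detection over a step's log file.
# The pattern list mirrors the diff above; the script is illustrative only.
use strict;
use warnings;

my @error_patterns = ('error', 'killed', 'core dumped', 'can\'t read',
                      'no such file or directory', 'unknown option',
                      'died at', 'exit code', 'permission denied',
                      "Can't locate");

sub log_looks_crashed {
    my ($logfile) = @_;
    open(my $fh, '<', $logfile) or die "cannot open $logfile: $!";
    while (my $line = <$fh>) {
        foreach my $pattern (@error_patterns) {
            return 1 if $line =~ /$pattern/i;   # case-insensitive, as in experiment.perl
        }
    }
    close($fh);
    return 0;
}

my $logfile = shift @ARGV or die "usage: $0 <step-logfile>\n";
my $status = log_looks_crashed($logfile) ? "crashed" : "ok";
print "$status\n";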

View File

@@ -553,6 +553,7 @@ sub bleu_score {
     my $score = 0;
     my $iscore = 0;
     my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
+    print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";
     for (my $j=1; $j<=$max_Ngram; $j++) {
         if ($matching_ngrams->[$j] == 0) {
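
The line added to bleu_score prints the hypothesis-to-reference length ratio together with the log of the BLEU brevity penalty, min(0, 1 - r/c), where c is the total hypothesis length ($tst_ngrams->[1]) and r the shortest reference length ($shortest_ref_length); the final BLEU score scales the geometric mean of the n-gram precisions by exp of that value. The sketch below recomputes the printed quantities for invented lengths; it is not output from the script.

# Sketch of the diagnostic added above: length ratio c/r and log brevity
# penalty min(0, 1 - r/c). The lengths are made up for illustration; in
# mteval-v13a.pl they come from $tst_ngrams->[1] and $shortest_ref_length.
use strict;
use warnings;
use List::Util qw(min);

my $tst_len = 95;    # hypothetical total hypothesis length (c)
my $ref_len = 100;   # hypothetical shortest reference length (r)

my $ratio     = $tst_len / $ref_len;               # length ratio, as printed
my $len_score = min(0, 1 - $ref_len / $tst_len);   # log brevity penalty, as printed
printf "length ratio: %.4f (%d/%d), penalty (log): %.4f, BP = %.4f\n",
       $ratio, $tst_len, $ref_len, $len_score, exp($len_score);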

scripts/generic/mteval-v13a.pl — new executable file, 1168 additions

File diff suppressed because it is too large.