updates to EMS: mteval-v13a.pl, parallel preparation, better paths and defaults in examples

2024-09-19 15:17:10 +03:00 · 2011-12-21 04:26:27 +00:00 · 2011-12-21 04:26:27 +00:00 · b95c372e3a
commit b95c372e3a
parent 3ab37ca321
8 changed files with 1373 additions and 109 deletions
--- a/scripts/ems/example/config.basic
+++ b/scripts/ems/example/config.basic
@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 wmt10-data = $working-dir/data

 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses

 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable

 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"

 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -138,27 +147,21 @@ order = 5
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
-
-#
-# if binarized, set type (default srilm; if binarized: irstlm)
-#
-# set to 8 when using kenlm
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 ### each language model to be used has its own section here

@ -219,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 #################################################################
 # TRANSLATION MODEL TRAINING
@ -261,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"

+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduce
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5

 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@ -355,7 +364,7 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"

 ### specify the corpus used for tuning 
 # it should contain 1000s of sentences
@ -394,14 +403,14 @@ decoder-settings = ""
 # and also point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config = 
+#config-with-reused-weights = 

 #########################################################
 ## RECASER: restore case, this part only trains the model

 [RECASING]

-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses

 ### training data
 # raw input needs to be still tokenized,
@ -448,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl

 ### additional decoder settings
 # switches for the Moses decoder
+# common choices: 
+#   "-threads N" for multi-threading
+#   "-mbr" for MBR decoding
+#   "-drop-unknown" for dropping unknown source words
+#   "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"

@ -470,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension

 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =

--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 wmt10-data = $working-dir/data

 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses

 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/misc/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable

 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"

 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -132,27 +141,27 @@ order = 5
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"

 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
+#lm-binarizer = $moses-bin-dir/build_binary
 #type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 ### each language model to be used has its own section here

@ -218,21 +227,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
+#lm-binarizer = $moses-bin-dir/build_binary
 #type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 #################################################################
 # FACTOR DEFINITION
@ -275,12 +284,18 @@ reordering-factors = "word -> word"
 #generation-factors = 
 decoding-steps = "t0"

+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduce
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5

 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@ -354,7 +369,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config = 
+#config-with-reused-weights = 

 #####################################################
 ### TUNING: finding good weights for model components
@ -369,7 +384,7 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"

 ### specify the corpus used for tuning 
 # it should contain 1000s of sentences
@ -415,7 +430,7 @@ decoder-settings = ""

 [RECASING]

-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses

 ### training data
 # raw input needs to be still tokenized,
@ -462,6 +477,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl

 ### additional decoder settings
 # switches for the Moses decoder
+# common choices: 
+#   "-threads N" for multi-threading
+#   "-mbr" for MBR decoding
+#   "-drop-unknown" for dropping unknown source words
+#   "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"

@ -484,8 +504,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension

 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =

--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 wmt10-data = $working-dir/data

 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses_chart
+decoder = $moses-bin-dir/moses_chart

 # conversion of phrase table into binary on-disk format
-#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+#ttable-binarizer = $moses-bin-dir/processPhraseTable

 # conversion of rule table into binary on-disk format
-ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"

 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -132,27 +141,27 @@ order = 5
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"

 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 ### each language model to be used has its own section here

@ -213,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 #################################################################
 # TRANSLATION MODEL TRAINING
@ -255,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"

+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduce
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5

 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@ -334,7 +349,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config = 
+#config-with-reused-weights = 

 #####################################################
 ### TUNING: finding good weights for model components
@ -349,7 +364,7 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"

 ### specify the corpus used for tuning 
 # it should contain 1000s of sentences
@ -395,7 +410,7 @@ decoder-settings = ""

 [RECASING]

-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses

 ### training data
 # raw input needs to be still tokenized,
@ -442,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl

 ### additional decoder settings
 # switches for the Moses decoder
+# common choices: 
+#   "-threads N" for multi-threading
+#   "-mbr" for MBR decoding
+#   "-drop-unknown" for dropping unknown source words
+#   "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 #decoder-settings = ""

@ -464,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension

 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =

--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 wmt10-data = $working-dir/data

 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses_chart
+decoder = $moses-bin-dir/moses_chart

 # conversion of phrase table into binary on-disk format
-#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+#ttable-binarizer = $moses-bin-dir/processPhraseTable

 # conversion of rule table into binary on-disk format
-ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"

 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -136,27 +145,27 @@ order = 5
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"

 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 ### each language model to be used has its own section here

@ -217,21 +226,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 #################################################################
 # TRANSLATION MODEL TRAINING
@ -259,12 +268,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"

+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduce
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5

 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@ -338,7 +353,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config = 
+#config-with-reused-weights = 

 #####################################################
 ### TUNING: finding good weights for model components
@ -353,7 +368,7 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"

 ### specify the corpus used for tuning 
 # it should contain 1000s of sentences
@ -399,7 +414,7 @@ decoder-settings = ""

 [RECASING]

-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses

 ### training data
 # raw input needs to be still tokenized,
@ -446,6 +461,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl

 ### additional decoder settings
 # switches for the Moses decoder
+# common choices: 
+#   "-threads N" for multi-threading
+#   "-mbr" for MBR decoding
+#   "-drop-unknown" for dropping unknown source words
+#   "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 #decoder-settings = ""

@ -468,8 +488,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension

 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =

--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
 toy-data = $moses-script-dir/ems/example/data

 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses

 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable

 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"

 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -126,27 +135,27 @@ order = 5
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"

 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 ### each language model to be used has its own section here

@ -197,21 +206,21 @@ raw-corpus = $toy-data/nc-5k.$output-extension
 # (default: no binarization)

 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm

 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8

 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 # 
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm

 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 #################################################################
 # TRANSLATION MODEL TRAINING
@ -239,12 +248,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"

+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduce
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5

 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@ -318,7 +333,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config = 
+#config-with-reused-weights = 

 #####################################################
 ### TUNING: finding good weights for model components
@ -333,7 +348,7 @@ weight-config = $toy-data/weight.ini
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"

 ### specify the corpus used for tuning 
 # it should contain 1000s of sentences
@ -379,7 +394,7 @@ decoder-settings = ""

 [RECASING]

-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses

 ### training data
 # raw input needs to be still tokenized,
@ -422,6 +437,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl

 ### additional decoder settings
 # switches for the Moses decoder
+# common choices: 
+#   "-threads N" for multi-threading
+#   "-mbr" for MBR decoding
+#   "-drop-unknown" for dropping unknown source words
+#   "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"

@ -444,8 +464,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension

 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =

--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@ -1275,7 +1275,8 @@ sub check_if_crashed {
 	foreach my $pattern (@{$ERROR{&defined_step_id($i)}},
 			     'error','killed','core dumped','can\'t read',
 			     'no such file or directory','unknown option',
-			     'died at','exit code','permission denied') {
+			     'died at','exit code','permission denied',
+           "Can't locate") {
 	    if (/$pattern/i) {
 		my $not_error = 0;
 		if (defined($NOT_ERROR{&defined_step_id($i)})) {
--- a/scripts/generic/mteval-v12.pl
+++ b/scripts/generic/mteval-v12.pl
@ -553,6 +553,7 @@ sub bleu_score {
    my $score = 0;
    my $iscore = 0;
    my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
+    print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";

    for (my $j=1; $j<=$max_Ngram; $j++) {
        if ($matching_ngrams->[$j] == 0) {
--- a/scripts/generic/mteval-v13a.pl
+++ b/scripts/generic/mteval-v13a.pl