updates to EMS: mteval-v13a.pl, parallel preparation, better paths and defaults in examples

This commit is contained in:
Philipp Koehn 2011-12-21 04:26:27 +00:00
parent 3ab37ca321
commit b95c372e3a
8 changed files with 1373 additions and 109 deletions

View File

@ -18,25 +18,34 @@ pair-extension = fr-en
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/dist/bin
#
# moses scripts
moses-script-dir = /home/pkoehn/moses/scripts
moses-script-dir = $moses-src-dir/scripts
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
### basic tools
#
# moses decoder
decoder = $moses-src-dir/dist/bin/moses
decoder = $moses-bin-dir/moses
# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
ttable-binarizer = $moses-bin-dir/processPhraseTable
# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -138,27 +147,21 @@ order = 5
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
#
# if binarized, set type (default srilm; if binarized: irstlm)
#
# set to 8 when using kenlm
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
### each language model to be used has its own section here
@ -219,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
# TRANSLATION MODEL TRAINING
@ -261,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"
### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes
### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5
#run-giza-in-parts = 5
### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
@ -355,7 +364,7 @@ score-settings = "--GoodTuring"
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"
tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
@ -394,14 +403,14 @@ decoder-settings = ""
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#config-with-reused-weights =
#########################################################
## RECASER: restore case, this part only trains the model
[RECASING]
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
#decoder = $moses-bin-dir/moses
### training data
# raw input needs to be still tokenized,
@ -448,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@ -470,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

View File

@ -18,25 +18,34 @@ pair-extension = fr-en
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/dist/bin
#
# moses scripts
moses-script-dir = /home/pkoehn/moses/scripts
moses-script-dir = $moses-src-dir/scripts
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
### basic tools
#
# moses decoder
decoder = $moses-src-dir/dist/bin/moses
decoder = $moses-bin-dir/moses
# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/misc/processPhraseTable
ttable-binarizer = $moses-bin-dir/processPhraseTable
# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -132,27 +141,27 @@ order = 5
### tool to be used for training randomized language model from scratch
# (more commonly, a SRILM is trained)
#
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#lm-binarizer = $moses-bin-dir/build_binary
#type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
### each language model to be used has its own section here
@ -218,21 +227,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#lm-binarizer = $moses-bin-dir/build_binary
#type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
# FACTOR DEFINITION
@ -275,12 +284,18 @@ reordering-factors = "word -> word"
#generation-factors =
decoding-steps = "t0"
### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes
### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5
#run-giza-in-parts = 5
### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
@ -354,7 +369,7 @@ score-settings = "--GoodTuring"
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#config-with-reused-weights =
#####################################################
### TUNING: finding good weights for model components
@ -369,7 +384,7 @@ score-settings = "--GoodTuring"
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"
tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
@ -415,7 +430,7 @@ decoder-settings = ""
[RECASING]
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
#decoder = $moses-bin-dir/moses
### training data
# raw input needs to be still tokenized,
@ -462,6 +477,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@ -484,8 +504,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

View File

@ -18,25 +18,34 @@ pair-extension = fr-en
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/dist/bin
#
# moses scripts
moses-script-dir = /home/pkoehn/moses/scripts
moses-script-dir = $moses-src-dir/scripts
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
### basic tools
#
# moses decoder
decoder = $moses-src-dir/dist/bin/moses_chart
decoder = $moses-bin-dir/moses_chart
# conversion of phrase table into binary on-disk format
#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
#ttable-binarizer = $moses-bin-dir/processPhraseTable
# conversion of rule table into binary on-disk format
ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -132,27 +141,27 @@ order = 5
### tool to be used for training randomized language model from scratch
# (more commonly, a SRILM is trained)
#
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
### each language model to be used has its own section here
@ -213,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
# TRANSLATION MODEL TRAINING
@ -255,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"
### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes
### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5
#run-giza-in-parts = 5
### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
@ -334,7 +349,7 @@ score-settings = "--GoodTuring"
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#config-with-reused-weights =
#####################################################
### TUNING: finding good weights for model components
@ -349,7 +364,7 @@ score-settings = "--GoodTuring"
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"
tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
@ -395,7 +410,7 @@ decoder-settings = ""
[RECASING]
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
#decoder = $moses-bin-dir/moses
### training data
# raw input needs to be still tokenized,
@ -442,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
#decoder-settings = ""
@ -464,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

View File

@ -18,25 +18,34 @@ pair-extension = fr-en
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/dist/bin
#
# moses scripts
moses-script-dir = /home/pkoehn/moses/scripts
moses-script-dir = $moses-src-dir/scripts
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
wmt10-data = $working-dir/data
### basic tools
#
# moses decoder
decoder = $moses-src-dir/dist/bin/moses_chart
decoder = $moses-bin-dir/moses_chart
# conversion of phrase table into binary on-disk format
#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
#ttable-binarizer = $moses-bin-dir/processPhraseTable
# conversion of rule table into binary on-disk format
ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -136,27 +145,27 @@ order = 5
### tool to be used for training randomized language model from scratch
# (more commonly, a SRILM is trained)
#
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
### each language model to be used has its own section here
@ -217,21 +226,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
# TRANSLATION MODEL TRAINING
@ -259,12 +268,18 @@ script = $moses-script-dir/training/train-model.perl
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"
### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes
### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5
#run-giza-in-parts = 5
### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
@ -338,7 +353,7 @@ score-settings = "--GoodTuring"
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#config-with-reused-weights =
#####################################################
### TUNING: finding good weights for model components
@ -353,7 +368,7 @@ score-settings = "--GoodTuring"
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"
tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
@ -399,7 +414,7 @@ decoder-settings = ""
[RECASING]
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
#decoder = $moses-bin-dir/moses
### training data
# raw input needs to be still tokenized,
@ -446,6 +461,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
#decoder-settings = ""
@ -468,8 +488,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

View File

@ -18,25 +18,34 @@ pair-extension = fr-en
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/dist/bin
#
# moses scripts
moses-script-dir = /home/pkoehn/moses/scripts
moses-script-dir = $moses-src-dir/scripts
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
toy-data = $moses-script-dir/ems/example/data
### basic tools
#
# moses decoder
decoder = $moses-src-dir/dist/bin/moses
decoder = $moses-bin-dir/moses
# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
ttable-binarizer = $moses-bin-dir/processPhraseTable
# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -126,27 +135,27 @@ order = 5
### tool to be used for training randomized language model from scratch
# (more commonly, a SRILM is trained)
#
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
### each language model to be used has its own section here
@ -197,21 +206,21 @@ raw-corpus = $toy-data/nc-5k.$output-extension
# (default: no binarization)
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm
### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
# TRANSLATION MODEL TRAINING
@ -239,12 +248,18 @@ script = $moses-script-dir/training/train-model.perl
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"
### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes
### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5
#run-giza-in-parts = 5
### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
@ -318,7 +333,7 @@ score-settings = "--GoodTuring"
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#config-with-reused-weights =
#####################################################
### TUNING: finding good weights for model components
@ -333,7 +348,7 @@ weight-config = $toy-data/weight.ini
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"
tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain 1000s of sentences
@ -379,7 +394,7 @@ decoder-settings = ""
[RECASING]
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
#decoder = $moses-bin-dir/moses
### training data
# raw input needs to be still tokenized,
@ -422,6 +437,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@ -444,8 +464,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

View File

@ -1275,7 +1275,8 @@ sub check_if_crashed {
foreach my $pattern (@{$ERROR{&defined_step_id($i)}},
'error','killed','core dumped','can\'t read',
'no such file or directory','unknown option',
'died at','exit code','permission denied') {
'died at','exit code','permission denied',
"Can't locate") {
if (/$pattern/i) {
my $not_error = 0;
if (defined($NOT_ERROR{&defined_step_id($i)})) {

View File

@ -553,6 +553,7 @@ sub bleu_score {
my $score = 0;
my $iscore = 0;
my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";
for (my $j=1; $j<=$max_Ngram; $j++) {
if ($matching_ngrams->[$j] == 0) {

1168
scripts/generic/mteval-v13a.pl Executable file

File diff suppressed because it is too large Load Diff