mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-11-10 00:47:31 +03:00
updates to EMS: mteval-v13a.pl, parallel preparation, better paths and defaults in examples
This commit is contained in:
parent
3ab37ca321
commit
b95c372e3a
@ -18,25 +18,34 @@ pair-extension = fr-en
|
|||||||
# moses
|
# moses
|
||||||
moses-src-dir = /home/pkoehn/moses
|
moses-src-dir = /home/pkoehn/moses
|
||||||
#
|
#
|
||||||
|
# moses binaries
|
||||||
|
moses-bin-dir = $moses-src-dir/dist/bin
|
||||||
|
#
|
||||||
# moses scripts
|
# moses scripts
|
||||||
moses-script-dir = /home/pkoehn/moses/scripts
|
moses-script-dir = $moses-src-dir/scripts
|
||||||
#
|
#
|
||||||
# srilm
|
# srilm
|
||||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||||
#
|
#
|
||||||
|
# irstlm
|
||||||
|
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||||
|
#
|
||||||
|
# randlm
|
||||||
|
randlm-dir = $moses-src-dir/randlm/bin
|
||||||
|
#
|
||||||
# data
|
# data
|
||||||
wmt10-data = $working-dir/data
|
wmt10-data = $working-dir/data
|
||||||
|
|
||||||
### basic tools
|
### basic tools
|
||||||
#
|
#
|
||||||
# moses decoder
|
# moses decoder
|
||||||
decoder = $moses-src-dir/dist/bin/moses
|
decoder = $moses-bin-dir/moses
|
||||||
|
|
||||||
# conversion of phrase table into binary on-disk format
|
# conversion of phrase table into binary on-disk format
|
||||||
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
|
ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||||
|
|
||||||
# conversion of rule table into binary on-disk format
|
# conversion of rule table into binary on-disk format
|
||||||
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||||
|
|
||||||
# tokenizers - comment out if all your data is already tokenized
|
# tokenizers - comment out if all your data is already tokenized
|
||||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||||
@ -138,27 +147,21 @@ order = 5
|
|||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
type = 8
|
||||||
|
|
||||||
#
|
|
||||||
# if binarized, set type (default srilm; if binarized: irstlm)
|
|
||||||
#
|
|
||||||
# set to 8 when using kenlm
|
|
||||||
#type = 8
|
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
### each language model to be used has its own section here
|
### each language model to be used has its own section here
|
||||||
|
|
||||||
@ -219,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
|
|||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
type = 8
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
#################################################################
|
#################################################################
|
||||||
# TRANSLATION MODEL TRAINING
|
# TRANSLATION MODEL TRAINING
|
||||||
@ -261,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
|
|||||||
#generation-factors = "word -> pos"
|
#generation-factors = "word -> pos"
|
||||||
#decoding-steps = "t0, g0"
|
#decoding-steps = "t0, g0"
|
||||||
|
|
||||||
|
### parallelization of data preparation step
|
||||||
|
# the two directions of the data preparation can be run in parallel
|
||||||
|
# comment out if not needed
|
||||||
|
#
|
||||||
|
parallel = yes
|
||||||
|
|
||||||
### pre-computation for giza++
|
### pre-computation for giza++
|
||||||
# giza++ has a more efficient data structure that needs to be
|
# giza++ has a more efficient data structure that needs to be
|
||||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||||
# memory requirements. set here the number of parts
|
# memory requirements. set here the number of parts
|
||||||
#
|
#
|
||||||
run-giza-in-parts = 5
|
#run-giza-in-parts = 5
|
||||||
|
|
||||||
### symmetrization method to obtain word alignments from giza output
|
### symmetrization method to obtain word alignments from giza output
|
||||||
# (commonly used: grow-diag-final-and)
|
# (commonly used: grow-diag-final-and)
|
||||||
@ -355,7 +364,7 @@ score-settings = "--GoodTuring"
|
|||||||
### tuning script to be used
|
### tuning script to be used
|
||||||
#
|
#
|
||||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
tuning-settings = "-mertdir $moses-bin-dir"
|
||||||
|
|
||||||
### specify the corpus used for tuning
|
### specify the corpus used for tuning
|
||||||
# it should contain 1000s of sentences
|
# it should contain 1000s of sentences
|
||||||
@ -394,14 +403,14 @@ decoder-settings = ""
|
|||||||
# and also point to a configuration file that contains
|
# and also point to a configuration file that contains
|
||||||
# pointers to all relevant model files
|
# pointers to all relevant model files
|
||||||
#
|
#
|
||||||
#config =
|
#config-with-reused-weights =
|
||||||
|
|
||||||
#########################################################
|
#########################################################
|
||||||
## RECASER: restore case, this part only trains the model
|
## RECASER: restore case, this part only trains the model
|
||||||
|
|
||||||
[RECASING]
|
[RECASING]
|
||||||
|
|
||||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
#decoder = $moses-bin-dir/moses
|
||||||
|
|
||||||
### training data
|
### training data
|
||||||
# raw input needs to be still tokenized,
|
# raw input needs to be still tokenized,
|
||||||
@ -448,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
|||||||
|
|
||||||
### additional decoder settings
|
### additional decoder settings
|
||||||
# switches for the Moses decoder
|
# switches for the Moses decoder
|
||||||
|
# common choices:
|
||||||
|
# "-threads N" for multi-threading
|
||||||
|
# "-mbr" for MBR decoding
|
||||||
|
# "-drop-unknown" for dropping unknown source words
|
||||||
|
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||||
#
|
#
|
||||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||||
|
|
||||||
@ -470,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
|||||||
|
|
||||||
### BLEU
|
### BLEU
|
||||||
#
|
#
|
||||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||||
#ibm-bleu =
|
#ibm-bleu =
|
||||||
|
|
||||||
|
@ -18,25 +18,34 @@ pair-extension = fr-en
|
|||||||
# moses
|
# moses
|
||||||
moses-src-dir = /home/pkoehn/moses
|
moses-src-dir = /home/pkoehn/moses
|
||||||
#
|
#
|
||||||
|
# moses binaries
|
||||||
|
moses-bin-dir = $moses-src-dir/dist/bin
|
||||||
|
#
|
||||||
# moses scripts
|
# moses scripts
|
||||||
moses-script-dir = /home/pkoehn/moses/scripts
|
moses-script-dir = $moses-src-dir/scripts
|
||||||
#
|
#
|
||||||
# srilm
|
# srilm
|
||||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||||
#
|
#
|
||||||
|
# irstlm
|
||||||
|
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||||
|
#
|
||||||
|
# randlm
|
||||||
|
randlm-dir = $moses-src-dir/randlm/bin
|
||||||
|
#
|
||||||
# data
|
# data
|
||||||
wmt10-data = $working-dir/data
|
wmt10-data = $working-dir/data
|
||||||
|
|
||||||
### basic tools
|
### basic tools
|
||||||
#
|
#
|
||||||
# moses decoder
|
# moses decoder
|
||||||
decoder = $moses-src-dir/dist/bin/moses
|
decoder = $moses-bin-dir/moses
|
||||||
|
|
||||||
# conversion of phrase table into binary on-disk format
|
# conversion of phrase table into binary on-disk format
|
||||||
ttable-binarizer = $moses-src-dir/misc/processPhraseTable
|
ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||||
|
|
||||||
# conversion of rule table into binary on-disk format
|
# conversion of rule table into binary on-disk format
|
||||||
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||||
|
|
||||||
# tokenizers - comment out if all your data is already tokenized
|
# tokenizers - comment out if all your data is already tokenized
|
||||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||||
@ -132,27 +141,27 @@ order = 5
|
|||||||
### tool to be used for training randomized language model from scratch
|
### tool to be used for training randomized language model from scratch
|
||||||
# (more commonly, a SRILM is trained)
|
# (more commonly, a SRILM is trained)
|
||||||
#
|
#
|
||||||
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
### script to use for binary table format for irstlm or kenlm
|
### script to use for binary table format for irstlm or kenlm
|
||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
#lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
#type = 8
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
### each language model to be used has its own section here
|
### each language model to be used has its own section here
|
||||||
|
|
||||||
@ -218,21 +227,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
|
|||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
#lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
#type = 8
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
#################################################################
|
#################################################################
|
||||||
# FACTOR DEFINITION
|
# FACTOR DEFINITION
|
||||||
@ -275,12 +284,18 @@ reordering-factors = "word -> word"
|
|||||||
#generation-factors =
|
#generation-factors =
|
||||||
decoding-steps = "t0"
|
decoding-steps = "t0"
|
||||||
|
|
||||||
|
### parallelization of data preparation step
|
||||||
|
# the two directions of the data preparation can be run in parallel
|
||||||
|
# comment out if not needed
|
||||||
|
#
|
||||||
|
parallel = yes
|
||||||
|
|
||||||
### pre-computation for giza++
|
### pre-computation for giza++
|
||||||
# giza++ has a more efficient data structure that needs to be
|
# giza++ has a more efficient data structure that needs to be
|
||||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||||
# memory requirements. set here the number of parts
|
# memory requirements. set here the number of parts
|
||||||
#
|
#
|
||||||
run-giza-in-parts = 5
|
#run-giza-in-parts = 5
|
||||||
|
|
||||||
### symmetrization method to obtain word alignments from giza output
|
### symmetrization method to obtain word alignments from giza output
|
||||||
# (commonly used: grow-diag-final-and)
|
# (commonly used: grow-diag-final-and)
|
||||||
@ -354,7 +369,7 @@ score-settings = "--GoodTuring"
|
|||||||
# point to a configuration file that contains
|
# point to a configuration file that contains
|
||||||
# pointers to all relevant model files
|
# pointers to all relevant model files
|
||||||
#
|
#
|
||||||
#config =
|
#config-with-reused-weights =
|
||||||
|
|
||||||
#####################################################
|
#####################################################
|
||||||
### TUNING: finding good weights for model components
|
### TUNING: finding good weights for model components
|
||||||
@ -369,7 +384,7 @@ score-settings = "--GoodTuring"
|
|||||||
### tuning script to be used
|
### tuning script to be used
|
||||||
#
|
#
|
||||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
tuning-settings = "-mertdir $moses-bin-dir"
|
||||||
|
|
||||||
### specify the corpus used for tuning
|
### specify the corpus used for tuning
|
||||||
# it should contain 1000s of sentences
|
# it should contain 1000s of sentences
|
||||||
@ -415,7 +430,7 @@ decoder-settings = ""
|
|||||||
|
|
||||||
[RECASING]
|
[RECASING]
|
||||||
|
|
||||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
#decoder = $moses-bin-dir/moses
|
||||||
|
|
||||||
### training data
|
### training data
|
||||||
# raw input needs to be still tokenized,
|
# raw input needs to be still tokenized,
|
||||||
@ -462,6 +477,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
|||||||
|
|
||||||
### additional decoder settings
|
### additional decoder settings
|
||||||
# switches for the Moses decoder
|
# switches for the Moses decoder
|
||||||
|
# common choices:
|
||||||
|
# "-threads N" for multi-threading
|
||||||
|
# "-mbr" for MBR decoding
|
||||||
|
# "-drop-unknown" for dropping unknown source words
|
||||||
|
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||||
#
|
#
|
||||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||||
|
|
||||||
@ -484,8 +504,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
|||||||
|
|
||||||
### BLEU
|
### BLEU
|
||||||
#
|
#
|
||||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||||
#ibm-bleu =
|
#ibm-bleu =
|
||||||
|
|
||||||
|
@ -18,25 +18,34 @@ pair-extension = fr-en
|
|||||||
# moses
|
# moses
|
||||||
moses-src-dir = /home/pkoehn/moses
|
moses-src-dir = /home/pkoehn/moses
|
||||||
#
|
#
|
||||||
|
# moses binaries
|
||||||
|
moses-bin-dir = $moses-src-dir/dist/bin
|
||||||
|
#
|
||||||
# moses scripts
|
# moses scripts
|
||||||
moses-script-dir = /home/pkoehn/moses/scripts
|
moses-script-dir = $moses-src-dir/scripts
|
||||||
#
|
#
|
||||||
# srilm
|
# srilm
|
||||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||||
#
|
#
|
||||||
|
# irstlm
|
||||||
|
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||||
|
#
|
||||||
|
# randlm
|
||||||
|
randlm-dir = $moses-src-dir/randlm/bin
|
||||||
|
#
|
||||||
# data
|
# data
|
||||||
wmt10-data = $working-dir/data
|
wmt10-data = $working-dir/data
|
||||||
|
|
||||||
### basic tools
|
### basic tools
|
||||||
#
|
#
|
||||||
# moses decoder
|
# moses decoder
|
||||||
decoder = $moses-src-dir/dist/bin/moses_chart
|
decoder = $moses-bin-dir/moses_chart
|
||||||
|
|
||||||
# conversion of phrase table into binary on-disk format
|
# conversion of phrase table into binary on-disk format
|
||||||
#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
|
#ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||||
|
|
||||||
# conversion of rule table into binary on-disk format
|
# conversion of rule table into binary on-disk format
|
||||||
ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||||
|
|
||||||
# tokenizers - comment out if all your data is already tokenized
|
# tokenizers - comment out if all your data is already tokenized
|
||||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||||
@ -132,27 +141,27 @@ order = 5
|
|||||||
### tool to be used for training randomized language model from scratch
|
### tool to be used for training randomized language model from scratch
|
||||||
# (more commonly, a SRILM is trained)
|
# (more commonly, a SRILM is trained)
|
||||||
#
|
#
|
||||||
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
### script to use for binary table format for irstlm or kenlm
|
### script to use for binary table format for irstlm or kenlm
|
||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
type = 8
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
### each language model to be used has its own section here
|
### each language model to be used has its own section here
|
||||||
|
|
||||||
@ -213,21 +222,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
|
|||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
type = 8
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
#################################################################
|
#################################################################
|
||||||
# TRANSLATION MODEL TRAINING
|
# TRANSLATION MODEL TRAINING
|
||||||
@ -255,12 +264,18 @@ script = $moses-script-dir/training/train-model.perl
|
|||||||
#generation-factors = "word -> pos"
|
#generation-factors = "word -> pos"
|
||||||
#decoding-steps = "t0, g0"
|
#decoding-steps = "t0, g0"
|
||||||
|
|
||||||
|
### parallelization of data preparation step
|
||||||
|
# the two directions of the data preparation can be run in parallel
|
||||||
|
# comment out if not needed
|
||||||
|
#
|
||||||
|
parallel = yes
|
||||||
|
|
||||||
### pre-computation for giza++
|
### pre-computation for giza++
|
||||||
# giza++ has a more efficient data structure that needs to be
|
# giza++ has a more efficient data structure that needs to be
|
||||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||||
# memory requirements. set here the number of parts
|
# memory requirements. set here the number of parts
|
||||||
#
|
#
|
||||||
run-giza-in-parts = 5
|
#run-giza-in-parts = 5
|
||||||
|
|
||||||
### symmetrization method to obtain word alignments from giza output
|
### symmetrization method to obtain word alignments from giza output
|
||||||
# (commonly used: grow-diag-final-and)
|
# (commonly used: grow-diag-final-and)
|
||||||
@ -334,7 +349,7 @@ score-settings = "--GoodTuring"
|
|||||||
# point to a configuration file that contains
|
# point to a configuration file that contains
|
||||||
# pointers to all relevant model files
|
# pointers to all relevant model files
|
||||||
#
|
#
|
||||||
#config =
|
#config-with-reused-weights =
|
||||||
|
|
||||||
#####################################################
|
#####################################################
|
||||||
### TUNING: finding good weights for model components
|
### TUNING: finding good weights for model components
|
||||||
@ -349,7 +364,7 @@ score-settings = "--GoodTuring"
|
|||||||
### tuning script to be used
|
### tuning script to be used
|
||||||
#
|
#
|
||||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
tuning-settings = "-mertdir $moses-bin-dir"
|
||||||
|
|
||||||
### specify the corpus used for tuning
|
### specify the corpus used for tuning
|
||||||
# it should contain 1000s of sentences
|
# it should contain 1000s of sentences
|
||||||
@ -395,7 +410,7 @@ decoder-settings = ""
|
|||||||
|
|
||||||
[RECASING]
|
[RECASING]
|
||||||
|
|
||||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
#decoder = $moses-bin-dir/moses
|
||||||
|
|
||||||
### training data
|
### training data
|
||||||
# raw input needs to be still tokenized,
|
# raw input needs to be still tokenized,
|
||||||
@ -442,6 +457,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
|||||||
|
|
||||||
### additional decoder settings
|
### additional decoder settings
|
||||||
# switches for the Moses decoder
|
# switches for the Moses decoder
|
||||||
|
# common choices:
|
||||||
|
# "-threads N" for multi-threading
|
||||||
|
# "-mbr" for MBR decoding
|
||||||
|
# "-drop-unknown" for dropping unknown source words
|
||||||
|
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||||
#
|
#
|
||||||
#decoder-settings = ""
|
#decoder-settings = ""
|
||||||
|
|
||||||
@ -464,8 +484,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
|||||||
|
|
||||||
### BLEU
|
### BLEU
|
||||||
#
|
#
|
||||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||||
#ibm-bleu =
|
#ibm-bleu =
|
||||||
|
|
||||||
|
@ -18,25 +18,34 @@ pair-extension = fr-en
|
|||||||
# moses
|
# moses
|
||||||
moses-src-dir = /home/pkoehn/moses
|
moses-src-dir = /home/pkoehn/moses
|
||||||
#
|
#
|
||||||
|
# moses binaries
|
||||||
|
moses-bin-dir = $moses-src-dir/dist/bin
|
||||||
|
#
|
||||||
# moses scripts
|
# moses scripts
|
||||||
moses-script-dir = /home/pkoehn/moses/scripts
|
moses-script-dir = $moses-src-dir/scripts
|
||||||
#
|
#
|
||||||
# srilm
|
# srilm
|
||||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||||
#
|
#
|
||||||
|
# irstlm
|
||||||
|
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||||
|
#
|
||||||
|
# randlm
|
||||||
|
randlm-dir = $moses-src-dir/randlm/bin
|
||||||
|
#
|
||||||
# data
|
# data
|
||||||
wmt10-data = $working-dir/data
|
wmt10-data = $working-dir/data
|
||||||
|
|
||||||
### basic tools
|
### basic tools
|
||||||
#
|
#
|
||||||
# moses decoder
|
# moses decoder
|
||||||
decoder = $moses-src-dir/dist/bin/moses_chart
|
decoder = $moses-bin-dir/moses_chart
|
||||||
|
|
||||||
# conversion of phrase table into binary on-disk format
|
# conversion of phrase table into binary on-disk format
|
||||||
#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
|
#ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||||
|
|
||||||
# conversion of rule table into binary on-disk format
|
# conversion of rule table into binary on-disk format
|
||||||
ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||||
|
|
||||||
# tokenizers - comment out if all your data is already tokenized
|
# tokenizers - comment out if all your data is already tokenized
|
||||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||||
@ -136,27 +145,27 @@ order = 5
|
|||||||
### tool to be used for training randomized language model from scratch
|
### tool to be used for training randomized language model from scratch
|
||||||
# (more commonly, a SRILM is trained)
|
# (more commonly, a SRILM is trained)
|
||||||
#
|
#
|
||||||
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
### script to use for binary table format for irstlm or kenlm
|
### script to use for binary table format for irstlm or kenlm
|
||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
type = 8
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
### each language model to be used has its own section here
|
### each language model to be used has its own section here
|
||||||
|
|
||||||
@ -217,21 +226,21 @@ tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
|
|||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
type = 8
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
#################################################################
|
#################################################################
|
||||||
# TRANSLATION MODEL TRAINING
|
# TRANSLATION MODEL TRAINING
|
||||||
@ -259,12 +268,18 @@ script = $moses-script-dir/training/train-model.perl
|
|||||||
#generation-factors = "word -> pos"
|
#generation-factors = "word -> pos"
|
||||||
#decoding-steps = "t0, g0"
|
#decoding-steps = "t0, g0"
|
||||||
|
|
||||||
|
### parallelization of data preparation step
|
||||||
|
# the two directions of the data preparation can be run in parallel
|
||||||
|
# comment out if not needed
|
||||||
|
#
|
||||||
|
parallel = yes
|
||||||
|
|
||||||
### pre-computation for giza++
|
### pre-computation for giza++
|
||||||
# giza++ has a more efficient data structure that needs to be
|
# giza++ has a more efficient data structure that needs to be
|
||||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||||
# memory requirements. set here the number of parts
|
# memory requirements. set here the number of parts
|
||||||
#
|
#
|
||||||
run-giza-in-parts = 5
|
#run-giza-in-parts = 5
|
||||||
|
|
||||||
### symmetrization method to obtain word alignments from giza output
|
### symmetrization method to obtain word alignments from giza output
|
||||||
# (commonly used: grow-diag-final-and)
|
# (commonly used: grow-diag-final-and)
|
||||||
@ -338,7 +353,7 @@ score-settings = "--GoodTuring"
|
|||||||
# point to a configuration file that contains
|
# point to a configuration file that contains
|
||||||
# pointers to all relevant model files
|
# pointers to all relevant model files
|
||||||
#
|
#
|
||||||
#config =
|
#config-with-reused-weights =
|
||||||
|
|
||||||
#####################################################
|
#####################################################
|
||||||
### TUNING: finding good weights for model components
|
### TUNING: finding good weights for model components
|
||||||
@ -353,7 +368,7 @@ score-settings = "--GoodTuring"
|
|||||||
### tuning script to be used
|
### tuning script to be used
|
||||||
#
|
#
|
||||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
tuning-settings = "-mertdir $moses-bin-dir"
|
||||||
|
|
||||||
### specify the corpus used for tuning
|
### specify the corpus used for tuning
|
||||||
# it should contain 1000s of sentences
|
# it should contain 1000s of sentences
|
||||||
@ -399,7 +414,7 @@ decoder-settings = ""
|
|||||||
|
|
||||||
[RECASING]
|
[RECASING]
|
||||||
|
|
||||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
#decoder = $moses-bin-dir/moses
|
||||||
|
|
||||||
### training data
|
### training data
|
||||||
# raw input still needs to be tokenized,
|
# raw input still needs to be tokenized,
|
||||||
@ -446,6 +461,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
|||||||
|
|
||||||
### additional decoder settings
|
### additional decoder settings
|
||||||
# switches for the Moses decoder
|
# switches for the Moses decoder
|
||||||
|
# common choices:
|
||||||
|
# "-threads N" for multi-threading
|
||||||
|
# "-mbr" for MBR decoding
|
||||||
|
# "-drop-unknown" for dropping unknown source words
|
||||||
|
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||||
#
|
#
|
||||||
#decoder-settings = ""
|
#decoder-settings = ""
|
||||||
|
|
||||||
@ -468,8 +488,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
|||||||
|
|
||||||
### BLEU
|
### BLEU
|
||||||
#
|
#
|
||||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||||
#ibm-bleu =
|
#ibm-bleu =
|
||||||
|
|
||||||
|
@ -18,25 +18,34 @@ pair-extension = fr-en
|
|||||||
# moses
|
# moses
|
||||||
moses-src-dir = /home/pkoehn/moses
|
moses-src-dir = /home/pkoehn/moses
|
||||||
#
|
#
|
||||||
|
# moses binaries
|
||||||
|
moses-bin-dir = $moses-src-dir/dist/bin
|
||||||
|
#
|
||||||
# moses scripts
|
# moses scripts
|
||||||
moses-script-dir = /home/pkoehn/moses/scripts
|
moses-script-dir = $moses-src-dir/scripts
|
||||||
#
|
#
|
||||||
# srilm
|
# srilm
|
||||||
srilm-dir = $moses-src-dir/srilm/bin/i686
|
srilm-dir = $moses-src-dir/srilm/bin/i686
|
||||||
#
|
#
|
||||||
|
# irstlm
|
||||||
|
irstlm-dir = $moses-src-dir/irstlm/bin
|
||||||
|
#
|
||||||
|
# randlm
|
||||||
|
randlm-dir = $moses-src-dir/randlm/bin
|
||||||
|
#
|
||||||
# data
|
# data
|
||||||
toy-data = $moses-script-dir/ems/example/data
|
toy-data = $moses-script-dir/ems/example/data
|
||||||
|
|
||||||
### basic tools
|
### basic tools
|
||||||
#
|
#
|
||||||
# moses decoder
|
# moses decoder
|
||||||
decoder = $moses-src-dir/dist/bin/moses
|
decoder = $moses-bin-dir/moses
|
||||||
|
|
||||||
# conversion of phrase table into binary on-disk format
|
# conversion of phrase table into binary on-disk format
|
||||||
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
|
ttable-binarizer = $moses-bin-dir/processPhraseTable
|
||||||
|
|
||||||
# conversion of rule table into binary on-disk format
|
# conversion of rule table into binary on-disk format
|
||||||
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
|
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
|
||||||
|
|
||||||
# tokenizers - comment out if all your data is already tokenized
|
# tokenizers - comment out if all your data is already tokenized
|
||||||
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
|
||||||
@ -126,27 +135,27 @@ order = 5
|
|||||||
### tool to be used for training randomized language model from scratch
|
### tool to be used for training randomized language model from scratch
|
||||||
# (more commonly, a SRILM is trained)
|
# (more commonly, a SRILM is trained)
|
||||||
#
|
#
|
||||||
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
### script to use for binary table format for irstlm or kenlm
|
### script to use for binary table format for irstlm or kenlm
|
||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
type = 8
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
### each language model to be used has its own section here
|
### each language model to be used has its own section here
|
||||||
|
|
||||||
@ -197,21 +206,21 @@ raw-corpus = $toy-data/nc-5k.$output-extension
|
|||||||
# (default: no binarization)
|
# (default: no binarization)
|
||||||
|
|
||||||
# irstlm
|
# irstlm
|
||||||
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
|
#lm-binarizer = $irstlm-dir/compile-lm
|
||||||
|
|
||||||
# kenlm, also set type to 8
|
# kenlm, also set type to 8
|
||||||
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
|
lm-binarizer = $moses-bin-dir/build_binary
|
||||||
#type = 8
|
type = 8
|
||||||
|
|
||||||
### script to create quantized language model format (irstlm)
|
### script to create quantized language model format (irstlm)
|
||||||
# (default: no quantization)
|
# (default: no quantization)
|
||||||
#
|
#
|
||||||
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
|
#lm-quantizer = $irstlm-dir/quantize-lm
|
||||||
|
|
||||||
### script to use for converting into randomized table format
|
### script to use for converting into randomized table format
|
||||||
# (default: no randomization)
|
# (default: no randomization)
|
||||||
#
|
#
|
||||||
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
|
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
|
||||||
|
|
||||||
#################################################################
|
#################################################################
|
||||||
# TRANSLATION MODEL TRAINING
|
# TRANSLATION MODEL TRAINING
|
||||||
@ -239,12 +248,18 @@ script = $moses-script-dir/training/train-model.perl
|
|||||||
#generation-factors = "word -> pos"
|
#generation-factors = "word -> pos"
|
||||||
#decoding-steps = "t0, g0"
|
#decoding-steps = "t0, g0"
|
||||||
|
|
||||||
|
### parallelization of data preparation step
|
||||||
|
# the two directions of the data preparation can be run in parallel
|
||||||
|
# comment out if not needed
|
||||||
|
#
|
||||||
|
parallel = yes
|
||||||
|
|
||||||
### pre-computation for giza++
|
### pre-computation for giza++
|
||||||
# giza++ has a more efficient data structure that needs to be
|
# giza++ has a more efficient data structure that needs to be
|
||||||
# initialized with snt2cooc. if run in parallel, this may reduce
|
# initialized with snt2cooc. if run in parallel, this may reduce
|
||||||
# memory requirements. set here the number of parts
|
# memory requirements. set here the number of parts
|
||||||
#
|
#
|
||||||
run-giza-in-parts = 5
|
#run-giza-in-parts = 5
|
||||||
|
|
||||||
### symmetrization method to obtain word alignments from giza output
|
### symmetrization method to obtain word alignments from giza output
|
||||||
# (commonly used: grow-diag-final-and)
|
# (commonly used: grow-diag-final-and)
|
||||||
@ -318,7 +333,7 @@ score-settings = "--GoodTuring"
|
|||||||
# point to a configuration file that contains
|
# point to a configuration file that contains
|
||||||
# pointers to all relevant model files
|
# pointers to all relevant model files
|
||||||
#
|
#
|
||||||
#config =
|
#config-with-reused-weights =
|
||||||
|
|
||||||
#####################################################
|
#####################################################
|
||||||
### TUNING: finding good weights for model components
|
### TUNING: finding good weights for model components
|
||||||
@ -333,7 +348,7 @@ weight-config = $toy-data/weight.ini
|
|||||||
### tuning script to be used
|
### tuning script to be used
|
||||||
#
|
#
|
||||||
tuning-script = $moses-script-dir/training/mert-moses.pl
|
tuning-script = $moses-script-dir/training/mert-moses.pl
|
||||||
tuning-settings = "-mertdir $moses-src-dir/mert"
|
tuning-settings = "-mertdir $moses-bin-dir"
|
||||||
|
|
||||||
### specify the corpus used for tuning
|
### specify the corpus used for tuning
|
||||||
# it should contain 1000s of sentences
|
# it should contain 1000s of sentences
|
||||||
@ -379,7 +394,7 @@ decoder-settings = ""
|
|||||||
|
|
||||||
[RECASING]
|
[RECASING]
|
||||||
|
|
||||||
#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
#decoder = $moses-bin-dir/moses
|
||||||
|
|
||||||
### training data
|
### training data
|
||||||
# raw input still needs to be tokenized,
|
# raw input still needs to be tokenized,
|
||||||
@ -422,6 +437,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
|
|||||||
|
|
||||||
### additional decoder settings
|
### additional decoder settings
|
||||||
# switches for the Moses decoder
|
# switches for the Moses decoder
|
||||||
|
# common choices:
|
||||||
|
# "-threads N" for multi-threading
|
||||||
|
# "-mbr" for MBR decoding
|
||||||
|
# "-drop-unknown" for dropping unknown source words
|
||||||
|
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
|
||||||
#
|
#
|
||||||
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
|
||||||
|
|
||||||
@ -444,8 +464,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
|
|||||||
|
|
||||||
### BLEU
|
### BLEU
|
||||||
#
|
#
|
||||||
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
|
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
|
||||||
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
|
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
|
||||||
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
|
||||||
#ibm-bleu =
|
#ibm-bleu =
|
||||||
|
|
||||||
|
@ -1275,7 +1275,8 @@ sub check_if_crashed {
|
|||||||
foreach my $pattern (@{$ERROR{&defined_step_id($i)}},
|
foreach my $pattern (@{$ERROR{&defined_step_id($i)}},
|
||||||
'error','killed','core dumped','can\'t read',
|
'error','killed','core dumped','can\'t read',
|
||||||
'no such file or directory','unknown option',
|
'no such file or directory','unknown option',
|
||||||
'died at','exit code','permission denied') {
|
'died at','exit code','permission denied',
|
||||||
|
"Can't locate") {
|
||||||
if (/$pattern/i) {
|
if (/$pattern/i) {
|
||||||
my $not_error = 0;
|
my $not_error = 0;
|
||||||
if (defined($NOT_ERROR{&defined_step_id($i)})) {
|
if (defined($NOT_ERROR{&defined_step_id($i)})) {
|
||||||
|
@ -553,6 +553,7 @@ sub bleu_score {
|
|||||||
my $score = 0;
|
my $score = 0;
|
||||||
my $iscore = 0;
|
my $iscore = 0;
|
||||||
my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
|
my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
|
||||||
|
print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";
|
||||||
|
|
||||||
for (my $j=1; $j<=$max_Ngram; $j++) {
|
for (my $j=1; $j<=$max_Ngram; $j++) {
|
||||||
if ($matching_ngrams->[$j] == 0) {
|
if ($matching_ngrams->[$j] == 0) {
|
||||||
|
1168
scripts/generic/mteval-v13a.pl
Executable file
1168
scripts/generic/mteval-v13a.pl
Executable file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user