diff --git a/Makefile.config b/Makefile.config index f3b27da0..5e0fc355 100644 --- a/Makefile.config +++ b/Makefile.config @@ -49,6 +49,9 @@ else TRGEXT = ${TRG} endif +## set additional argument options for opus_read (if it is used) +## e.g. OPUSREAD_ARGS = -a certainty -tr 0.3 +OPUSREAD_ARGS = ## all of OPUS (NEW: don't require MOSES format) # OPUSCORPORA = ${patsubst %/latest/moses/${LANGPAIR}.txt.zip,%,\ diff --git a/Makefile.data b/Makefile.data index e0234f22..63784267 100644 --- a/Makefile.data +++ b/Makefile.data @@ -243,7 +243,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \ rm -f $@.zip ${@:.${SRCEXT}.raw=.xml} ${@:.${SRCEXT}.raw=.ids} ${dir $@}/README ${dir $@}/LICENSE; \ elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \ echo "extract $$c (${LANGPAIR}) from OPUS"; \ - opus_read -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} -wm moses -p raw > $@.tmp; \ + opus_read ${OPUSREAD_ARGS} -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} -wm moses -p raw > $@.tmp; \ cut -f1 $@.tmp > $@; \ cut -f2 $@.tmp > ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \ rm -f $@.tmp; \ diff --git a/Makefile.dist b/Makefile.dist index 690e28ef..e311b3b9 100644 --- a/Makefile.dist +++ b/Makefile.dist @@ -97,7 +97,7 @@ best_dist: echo "------------------------------------------------"; \ echo "search best model for ${LANGPAIRSTR}"; \ for d in ${ALT_MODEL_DIR}; do \ - e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \ + e=`ls work-$$d/${LANGPAIRSTR}/test/*.trg | tail -1 | xargs basename | sed 's/\.trg//'`; \ echo "evaldata = $$e"; \ if [ "$$e" != "GNOME" ]; then \ I=`find work-$$d/${LANGPAIRSTR}/ -maxdepth 1 -name "$$e.*.eval" -printf "%f\n"`; \ diff --git a/finetune/Makefile b/finetune/Makefile index 414967f5..21131187 100644 --- a/finetune/Makefile +++ b/finetune/Makefile @@ -70,6 +70,7 @@ MARIAN_EARLY_STOPPING = 5 + .PHONY: all all: model-index.txt ${MAKE} ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare @@ -142,6 +143,24 @@ tmx-tune: +## awful 
hack to fix a problem with the pre-processing script for the target language + +.PHONY: en-simplify +en-simplify: + ${MAKE} SRC=en1 TRG=en2 \ + MODEL=simplewiki_v1 \ + SRCPRE_PARA="en en en1-en2/opus-2020-03-02/source.spm" \ + TRGPRE_PARA="en en en1-en2/opus-2020-03-02/target.spm | sed 's/^>>en<< //'" \ + BASEMODELHOME=../models/en+el+es+fi-en+el+es+fi \ + BASEMODELZIP=opus-2020-03-02.zip \ + TRAIN_SRC=en1-en2/simplewiki_v1/train/simplewiki_v1-training.en-en.clean.en1 \ + TRAIN_TRG=en1-en2/simplewiki_v1/train/simplewiki_v1-training.en-en.clean.en2 \ + DEV_SRC=en1-en2/simplewiki_v1/dev/simplewiki_v1-tuning.en-en.clean.en1 \ + DEV_TRG=en1-en2/simplewiki_v1/dev/simplewiki_v1-tuning.en-en.clean.en2 \ + TEST_SRC=en1-en2/simplewiki_v1/test/simplewiki_v1-testing.en-en.clean.en1 \ + TEST_TRG=en1-en2/simplewiki_v1/test/simplewiki_v1-testing.en-en.clean.en2 \ + all + .PHONY: news-enfi news-enfi: @@ -256,9 +275,13 @@ endif .PHONY: data data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz +.PHONY: basemodel +basemodel: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml ${LANGPAIR}/${BASEMODELNAME}/decoder.yml: mkdir -p ${dir $@} -ifneq (${BASEMODELZIP},) +ifneq (${wildcard ${BASEMODELHOME}/${BASEMODELZIP}},) + unzip -u -d ${dir $@} ${BASEMODELHOME}/${BASEMODELZIP} +else ifneq (${BASEMODELZIP},) wget -nv -O ${dir $@}/${BASEMODELZIP} ${BASEMODELHOME}/${BASEMODELZIP} cd ${dir $@} && unzip -u ${BASEMODELZIP} else @@ -266,16 +289,19 @@ else endif +SRCPRE_PARA = ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm +TRGPRE_PARA = ${TRG} ${LANGPAIR}/${BASEMODELNAME}/target.spm + .INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml zcat $< |\ - ${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\ + ${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRCPRE_PARA} |\ gzip -c > $@ .INTERMEDIATE: 
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml zcat $< |\ - ${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\ + ${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${TRGPRE_PARA} |\ gzip -c > $@ diff --git a/models/en+el+es+fi-en+el+es+fi/README.md b/models/en+el+es+fi-en+el+es+fi/README.md new file mode 100644 index 00000000..9b6d63c9 --- /dev/null +++ b/models/en+el+es+fi-en+el+es+fi/README.md @@ -0,0 +1,36 @@ +# opus-2020-03-02.zip + +* dataset: opus +* model: transformer +* pre-processing: normalization + SentencePiece +* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID) +* download: [opus-2020-03-02.zip](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.zip) +* test set translations: [opus-2020-03-02.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.test.txt) +* test set scores: [opus-2020-03-02.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| newsdev2015-enfi.en.fi | 16.0 | 0.498 | +| newssyscomb2009.en.es | 29.9 | 0.570 | +| newssyscomb2009.es.en | 29.7 | 0.569 | +| news-test2008.en.es | 27.3 | 0.549 | +| news-test2008.es.en | 27.3 | 0.548 | +| newstest2009.en.es | 28.4 | 0.564 | +| newstest2009.es.en | 28.4 | 0.564 | +| newstest2010.en.es | 34.0 | 0.599 | +| newstest2010.es.en | 34.0 | 0.599 | +| newstest2011.en.es | 35.1 | 0.600 | +| newstest2012.en.es | 35.4 | 0.602 | +| newstest2013.en.es | 31.9 | 0.576 | +| newstest2015-enfi.en.fi | 17.8 | 0.509 | +| newstest2016-enfi.en.fi | 19.0 | 0.521 | +| newstest2017-enfi.en.fi | 21.2 | 0.539 | +| newstest2018-enfi.en.fi | 13.9 | 0.478 | +| newstest2019-enfi.en.fi | 18.8 | 0.503 | +| newstestB2016-enfi.en.fi | 14.9 | 0.491 | 
+| newstestB2017-enfi.en.fi | 16.9 | 0.503 | +| simplification.en.en | 63.0 | 0.798 | +| Tatoeba.en.fi | 56.7 | 0.719 | + diff --git a/models/en-ml/README.md b/models/en-ml/README.md index 7caaedfd..3ab2ae65 100644 --- a/models/en-ml/README.md +++ b/models/en-ml/README.md @@ -11,5 +11,20 @@ | testset | BLEU | chr-F | |-----------------------|-------|-------| -| Tatoeba.en.ml | 50.4 | 0.598 | +| Bible.en.ml | 50.4 | 0.598 | + +# opus+bt-2020-03-02.zip + +* dataset: opus+bt +* model: transformer-align +* pre-processing: normalization + SentencePiece +* download: [opus+bt-2020-03-02.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.zip) +* test set translations: [opus+bt-2020-03-02.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.test.txt) +* test set scores: [opus+bt-2020-03-02.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| Tatoeba.en.ml | 17.0 | 0.507 | diff --git a/models/ml-en/README.md b/models/ml-en/README.md index 7612909a..e24c88f4 100644 --- a/models/ml-en/README.md +++ b/models/ml-en/README.md @@ -43,3 +43,18 @@ |-----------------------|-------|-------| | Tatoeba.ml.en | 42.6 | 0.591 | +# opus-2020-03-01.zip + +* dataset: opus +* model: transformer-align +* pre-processing: normalization + SentencePiece +* download: [opus-2020-03-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.zip) +* test set translations: [opus-2020-03-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.test.txt) +* test set scores: [opus-2020-03-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| Tatoeba.ml.en | 40.5 | 0.576 | + diff --git a/preprocess-bpe-multi-target.sh b/preprocess-bpe-multi-target.sh index ae986dc1..5b354075 
100755 --- a/preprocess-bpe-multi-target.sh +++ b/preprocess-bpe-multi-target.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# USAGE preprocess.sh source-langid target-langid bpecodes < input > output +# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output # # # replace MOSESHOME and SNMTPATH with your own setup! @@ -25,11 +25,20 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer THREADS=4 -${TOKENIZER}/replace-unicode-punctuation.perl | -${TOKENIZER}/remove-non-printing-char.perl | -${TOKENIZER}/normalize-punctuation.perl -l $1 | -${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 | -sed 's/ */ /g;s/^ *//g;s/ *$//g' | -python3 ${SNMTPATH}/apply_bpe.py -c $3 | -sed "s/^/>>$2<< /" +if [ "$4" == "noflags" ]; then + ${TOKENIZER}/replace-unicode-punctuation.perl | + ${TOKENIZER}/remove-non-printing-char.perl | + ${TOKENIZER}/normalize-punctuation.perl -l $1 | + ${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 | + sed 's/ */ /g;s/^ *//g;s/ *$//g' | + python3 ${SNMTPATH}/apply_bpe.py -c $3 +else + ${TOKENIZER}/replace-unicode-punctuation.perl | + ${TOKENIZER}/remove-non-printing-char.perl | + ${TOKENIZER}/normalize-punctuation.perl -l $1 | + ${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 | + sed 's/ */ /g;s/^ *//g;s/ *$//g' | + python3 ${SNMTPATH}/apply_bpe.py -c $3 | + sed "s/^/>>$2<< /" +fi diff --git a/preprocess-spm-multi-target.sh b/preprocess-spm-multi-target.sh index 47e98709..a6a005af 100755 --- a/preprocess-spm-multi-target.sh +++ b/preprocess-spm-multi-target.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# USAGE preprocess.sh source-langid target-langid bpecodes < input > output +# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output # # # replace MOSESHOME and SPMENCODE with your own setup! 
@@ -21,11 +21,17 @@ fi MOSESSCRIPTS=${MOSESHOME}/scripts TOKENIZER=${MOSESSCRIPTS}/tokenizer - -${TOKENIZER}/replace-unicode-punctuation.perl | -${TOKENIZER}/remove-non-printing-char.perl | -sed 's/ */ /g;s/^ *//g;s/ *$//g' | -${SPMENCODE} --model $3 | -sed "s/^/>>$2<< /" +if [ "$4" == "noflags" ]; then + ${TOKENIZER}/replace-unicode-punctuation.perl | + ${TOKENIZER}/remove-non-printing-char.perl | + sed 's/ */ /g;s/^ *//g;s/ *$//g' | + ${SPMENCODE} --model $3 +else + ${TOKENIZER}/replace-unicode-punctuation.perl | + ${TOKENIZER}/remove-non-printing-char.perl | + sed 's/ */ /g;s/^ *//g;s/ *$//g' | + ${SPMENCODE} --model $3 | + sed "s/^/>>$2<< /" +fi # ${TOKENIZER}/normalize-punctuation.perl -l $1 |