new models

This commit is contained in:
Joerg Tiedemann 2020-03-02 16:59:31 +02:00
parent 233083a8b8
commit c573551713
9 changed files with 131 additions and 21 deletions

View File

@ -49,6 +49,9 @@ else
TRGEXT = ${TRG}
endif
## extra command-line options that are passed on to opus_read
## (only relevant if opus_read is used to extract a corpus from OPUS);
## empty by default, i.e. no additional options
## e.g. OPUSREAD_ARGS = -a certainty -tr 0.3
OPUSREAD_ARGS =
## all of OPUS (NEW: don't require MOSES format)
# OPUSCORPORA = ${patsubst %/latest/moses/${LANGPAIR}.txt.zip,%,\

View File

@ -243,7 +243,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
rm -f $@.zip ${@:.${SRCEXT}.raw=.xml} ${@:.${SRCEXT}.raw=.ids} ${dir $@}/README ${dir $@}/LICENSE; \
elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \
echo "extract $$c (${LANGPAIR}) from OPUS"; \
opus_read -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} -wm moses -p raw > $@.tmp; \
opus_read ${OPUSREAD_ARGS} -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} -wm moses -p raw > $@.tmp; \
cut -f1 $@.tmp > $@; \
cut -f2 $@.tmp > ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
rm -f $@.tmp; \

View File

@ -97,7 +97,7 @@ best_dist:
echo "------------------------------------------------"; \
echo "search best model for ${LANGPAIRSTR}"; \
for d in ${ALT_MODEL_DIR}; do \
e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
e=`ls work-$$d/${LANGPAIRSTR}/test/*.trg | tail -1 | xargs basename | sed 's/\.trg//'`; \
echo "evaldata = $$e"; \
if [ "$$e" != "GNOME" ]; then \
I=`find work-$$d/${LANGPAIRSTR}/ -maxdepth 1 -name "$$e.*.eval" -printf "%f\n"`; \

View File

@ -70,6 +70,7 @@ MARIAN_EARLY_STOPPING = 5
.PHONY: all
all: model-index.txt
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
@ -142,6 +143,24 @@ tmx-tune:
## en-simplify: recursive make call that builds the goal `all` for the
## simplewiki_v1 data with SRC=en1 and TRG=en2
##  - the base model is BASEMODELZIP in BASEMODELHOME
##  - SRCPRE_PARA and TRGPRE_PARA override the pre-processing arguments;
##    both use the language ID `en`
##  - awful hack to fix a problem with the pre-processing script for the
##    target language: TRGPRE_PARA ends with `| sed 's/^>>en<< //'`,
##    which removes a leading `>>en<< ` from every line
##    (NOTE: presumably the script adds that token on the target side as
##    well, where it is not wanted -- TODO confirm)
##  - NOTE: the .spm paths are hard-coded to en1-en2/opus-2020-03-02/
.PHONY: en-simplify
en-simplify:
${MAKE} SRC=en1 TRG=en2 \
MODEL=simplewiki_v1 \
SRCPRE_PARA="en en en1-en2/opus-2020-03-02/source.spm" \
TRGPRE_PARA="en en en1-en2/opus-2020-03-02/target.spm | sed 's/^>>en<< //'" \
BASEMODELHOME=../models/en+el+es+fi-en+el+es+fi \
BASEMODELZIP=opus-2020-03-02.zip \
TRAIN_SRC=en1-en2/simplewiki_v1/train/simplewiki_v1-training.en-en.clean.en1 \
TRAIN_TRG=en1-en2/simplewiki_v1/train/simplewiki_v1-training.en-en.clean.en2 \
DEV_SRC=en1-en2/simplewiki_v1/dev/simplewiki_v1-tuning.en-en.clean.en1 \
DEV_TRG=en1-en2/simplewiki_v1/dev/simplewiki_v1-tuning.en-en.clean.en2 \
TEST_SRC=en1-en2/simplewiki_v1/test/simplewiki_v1-testing.en-en.clean.en1 \
TEST_TRG=en1-en2/simplewiki_v1/test/simplewiki_v1-testing.en-en.clean.en2 \
all
.PHONY: news-enfi
news-enfi:
@ -256,9 +275,13 @@ endif
## data: pre-processed (.pre.gz) training and development data,
## source and target side; test data is not part of this goal
.PHONY: data
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
## basemodel: alias for the decoder.yml of the base model;
## the rule for that file unpacks (or downloads) the model archive
.PHONY: basemodel
basemodel: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
mkdir -p ${dir $@}
ifneq (${BASEMODELZIP},)
ifneq (${wildcard ${BASEMODELHOME}/${BASEMODELZIP}},)
unzip -u -d ${dir $@} ${BASEMODELHOME}/${BASEMODELZIP}
else ifneq (${BASEMODELZIP},)
wget -nv -O ${dir $@}/${BASEMODELZIP} ${BASEMODELHOME}/${BASEMODELZIP}
cd ${dir $@} && unzip -u ${BASEMODELZIP}
else
@ -266,16 +289,19 @@ else
endif
## arguments for the preprocess.sh script of the base model:
## language ID + .spm model file for the source and the target side
## (recursively expanded, so they follow SRC, TRG, LANGPAIR and
## BASEMODELNAME; they can be overridden on the make command line)
SRCPRE_PARA = ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm
TRGPRE_PARA = ${TRG} ${LANGPAIR}/${BASEMODELNAME}/target.spm
## source side: pipe the unpacked training, development and test data
## through the preprocess.sh script of the base model and compress the
## result again; the base model (decoder.yml) has to be in place first;
## the .pre.gz files are declared as intermediate files
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
zcat $< |\
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRCPRE_PARA} |\
gzip -c > $@
## target side: same procedure as for the source side, but only for
## training and development data (no test data is listed here);
## the .pre.gz files are declared as intermediate files
.INTERMEDIATE: ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
zcat $< |\
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${TRGPRE_PARA} |\
gzip -c > $@

View File

@ -0,0 +1,36 @@
# opus-2020-03-02.zip
* dataset: opus
* model: transformer
* pre-processing: normalization + SentencePiece
* a sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID)
* download: [opus-2020-03-02.zip](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.zip)
* test set translations: [opus-2020-03-02.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.test.txt)
* test set scores: [opus-2020-03-02.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| newsdev2015-enfi.en.fi | 16.0 | 0.498 |
| newssyscomb2009.en.es | 29.9 | 0.570 |
| newssyscomb2009.es.en | 29.7 | 0.569 |
| news-test2008.en.es | 27.3 | 0.549 |
| news-test2008.es.en | 27.3 | 0.548 |
| newstest2009.en.es | 28.4 | 0.564 |
| newstest2009.es.en | 28.4 | 0.564 |
| newstest2010.en.es | 34.0 | 0.599 |
| newstest2010.es.en | 34.0 | 0.599 |
| newstest2011.en.es | 35.1 | 0.600 |
| newstest2012.en.es | 35.4 | 0.602 |
| newstest2013.en.es | 31.9 | 0.576 |
| newstest2015-enfi.en.fi | 17.8 | 0.509 |
| newstest2016-enfi.en.fi | 19.0 | 0.521 |
| newstest2017-enfi.en.fi | 21.2 | 0.539 |
| newstest2018-enfi.en.fi | 13.9 | 0.478 |
| newstest2019-enfi.en.fi | 18.8 | 0.503 |
| newstestB2016-enfi.en.fi | 14.9 | 0.491 |
| newstestB2017-enfi.en.fi | 16.9 | 0.503 |
| simplification.en.en | 63.0 | 0.798 |
| Tatoeba.en.fi | 56.7 | 0.719 |

View File

@ -11,5 +11,20 @@
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.en.ml | 50.4 | 0.598 |
| Bible.en.ml | 50.4 | 0.598 |
# opus+bt-2020-03-02.zip
* dataset: opus+bt
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus+bt-2020-03-02.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.zip)
* test set translations: [opus+bt-2020-03-02.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.test.txt)
* test set scores: [opus+bt-2020-03-02.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.en.ml | 17.0 | 0.507 |

View File

@ -43,3 +43,18 @@
|-----------------------|-------|-------|
| Tatoeba.ml.en | 42.6 | 0.591 |
# opus-2020-03-01.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-03-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.zip)
* test set translations: [opus-2020-03-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.test.txt)
* test set scores: [opus-2020-03-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.ml.en | 40.5 | 0.576 |

View File

@ -1,6 +1,6 @@
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes < input > output
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
#
#
# replace MOSESHOME and SNMTPATH with your own setup!
@ -25,11 +25,20 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
THREADS=4
${TOKENIZER}/replace-unicode-punctuation.perl |
${TOKENIZER}/remove-non-printing-char.perl |
${TOKENIZER}/normalize-punctuation.perl -l $1 |
${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
python3 ${SNMTPATH}/apply_bpe.py -c $3 |
sed "s/^/>>$2<< /"
# Normalise, tokenise and BPE-segment the text read from stdin and write
# the result to stdout.
#   $1 = source language ID (passed to the Moses scripts with -l)
#   $2 = target language ID (used for the sentence-initial >>id<< token)
#   $3 = BPE codes file
#   $4 = optional; "noflags" switches off the >>id<< token
#
# $4 is normally not given, so it has to be quoted: an unquoted empty
# value turns the test into `[ == "noflags" ]`, which is a syntax error
# ("unary operator expected") that is printed on every call.
# The first sed expression needs TWO blanks in front of `*` (squeeze runs
# of blanks); with a single blank the pattern also matches the empty
# string and a blank is inserted between all characters.
if [ "${4:-}" = "noflags" ]; then
    ${TOKENIZER}/replace-unicode-punctuation.perl |
    ${TOKENIZER}/remove-non-printing-char.perl |
    ${TOKENIZER}/normalize-punctuation.perl -l "$1" |
    ${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l "$1" |
    sed 's/  */ /g;s/^ *//g;s/ *$//g' |
    python3 ${SNMTPATH}/apply_bpe.py -c "$3"
else
    ${TOKENIZER}/replace-unicode-punctuation.perl |
    ${TOKENIZER}/remove-non-printing-char.perl |
    ${TOKENIZER}/normalize-punctuation.perl -l "$1" |
    ${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l "$1" |
    sed 's/  */ /g;s/^ *//g;s/ *$//g' |
    python3 ${SNMTPATH}/apply_bpe.py -c "$3" |
    sed "s/^/>>$2<< /"
fi

View File

@ -1,6 +1,6 @@
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes < input > output
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
#
#
# replace MOSESHOME and SPMENCODE with your own setup!
@ -21,11 +21,17 @@ fi
MOSESSCRIPTS=${MOSESHOME}/scripts
TOKENIZER=${MOSESSCRIPTS}/tokenizer
${TOKENIZER}/replace-unicode-punctuation.perl |
${TOKENIZER}/remove-non-printing-char.perl |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3 |
sed "s/^/>>$2<< /"
# Normalise the text read from stdin, segment it with SentencePiece and
# write the result to stdout.
#   $2 = target language ID (used for the sentence-initial >>id<< token)
#   $3 = SentencePiece model file
#   $4 = optional; "noflags" switches off the >>id<< token
#
# $4 is normally not given, so it has to be quoted: an unquoted empty
# value turns the test into `[ == "noflags" ]`, which is a syntax error
# ("unary operator expected") that is printed on every call.
# The first sed expression needs TWO blanks in front of `*` (squeeze runs
# of blanks); with a single blank the pattern also matches the empty
# string and a blank is inserted between all characters.
if [ "${4:-}" = "noflags" ]; then
    ${TOKENIZER}/replace-unicode-punctuation.perl |
    ${TOKENIZER}/remove-non-printing-char.perl |
    sed 's/  */ /g;s/^ *//g;s/ *$//g' |
    ${SPMENCODE} --model "$3"
else
    ${TOKENIZER}/replace-unicode-punctuation.perl |
    ${TOKENIZER}/remove-non-printing-char.perl |
    sed 's/  */ /g;s/^ *//g;s/ *$//g' |
    ${SPMENCODE} --model "$3" |
    sed "s/^/>>$2<< /"
fi
# ${TOKENIZER}/normalize-punctuation.perl -l $1 |