mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
new models
This commit is contained in:
parent
233083a8b8
commit
c573551713
@ -49,6 +49,9 @@ else
|
||||
TRGEXT = ${TRG}
|
||||
endif
|
||||
|
||||
## set additional argument options for opus_read (if it is used)
|
||||
## e.g. OPUSREAD_ARGS = -a certainty -tr 0.3
|
||||
OPUSREAD_ARGS =
|
||||
|
||||
## all of OPUS (NEW: don't require MOSES format)
|
||||
# OPUSCORPORA = ${patsubst %/latest/moses/${LANGPAIR}.txt.zip,%,\
|
||||
|
@ -243,7 +243,7 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
||||
rm -f $@.zip ${@:.${SRCEXT}.raw=.xml} ${@:.${SRCEXT}.raw=.ids} ${dir $@}/README ${dir $@}/LICENSE; \
|
||||
elif [ -e ${OPUSHOME}/$$c/latest/xml/${LANGPAIR}.xml.gz ]; then \
|
||||
echo "extract $$c (${LANGPAIR}) from OPUS"; \
|
||||
opus_read -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} -wm moses -p raw > $@.tmp; \
|
||||
opus_read ${OPUSREAD_ARGS} -rd ${OPUSHOME} -d $$c -s ${SRC} -t ${TRG} -wm moses -p raw > $@.tmp; \
|
||||
cut -f1 $@.tmp > $@; \
|
||||
cut -f2 $@.tmp > ${@:.${SRCEXT}.raw=.${TRGEXT}.raw}; \
|
||||
rm -f $@.tmp; \
|
||||
|
@ -97,7 +97,7 @@ best_dist:
|
||||
echo "------------------------------------------------"; \
|
||||
echo "search best model for ${LANGPAIRSTR}"; \
|
||||
for d in ${ALT_MODEL_DIR}; do \
|
||||
e=`ls work-$$d/${LANGPAIRSTR}/val/*.trg | xargs basename | sed 's/\.trg//'`; \
|
||||
e=`ls work-$$d/${LANGPAIRSTR}/test/*.trg | tail -1 | xargs basename | sed 's/\.trg//'`; \
|
||||
echo "evaldata = $$e"; \
|
||||
if [ "$$e" != "GNOME" ]; then \
|
||||
I=`find work-$$d/${LANGPAIRSTR}/ -maxdepth 1 -name "$$e.*.eval" -printf "%f\n"`; \
|
||||
|
@ -70,6 +70,7 @@ MARIAN_EARLY_STOPPING = 5
|
||||
|
||||
|
||||
|
||||
|
||||
.PHONY: all
|
||||
all: model-index.txt
|
||||
${MAKE} ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
|
||||
@ -142,6 +143,24 @@ tmx-tune:
|
||||
|
||||
|
||||
|
||||
## awful hack to fix a problem with the pre-processing script for the target language
|
||||
|
||||
## en-simplify: recursive ${MAKE} call of the `all` goal for an
## English-to-English setup (SRC=en1, TRG=en2, MODEL=simplewiki_v1).
##   - BASEMODELHOME/BASEMODELZIP select opus-2020-03-02.zip from
##     ../models/en+el+es+fi-en+el+es+fi as the base model
##   - SRCPRE_PARA/TRGPRE_PARA override the argument strings that are handed
##     to preprocess.sh; both use "en en" plus the source.spm/target.spm file
##     in en1-en2/opus-2020-03-02/
##   - TRGPRE_PARA also carries "| sed 's/^>>en<< //'", i.e. an extra pipeline
##     stage that removes a leading ">>en<< " from every target-side line
##   - TRAIN_*/DEV_*/TEST_* point to the simplewiki_v1 training/tuning/testing
##     files below en1-en2/simplewiki_v1/
.PHONY: en-simplify
|
||||
en-simplify:
|
||||
${MAKE} SRC=en1 TRG=en2 \
|
||||
MODEL=simplewiki_v1 \
|
||||
SRCPRE_PARA="en en en1-en2/opus-2020-03-02/source.spm" \
|
||||
TRGPRE_PARA="en en en1-en2/opus-2020-03-02/target.spm | sed 's/^>>en<< //'" \
|
||||
BASEMODELHOME=../models/en+el+es+fi-en+el+es+fi \
|
||||
BASEMODELZIP=opus-2020-03-02.zip \
|
||||
TRAIN_SRC=en1-en2/simplewiki_v1/train/simplewiki_v1-training.en-en.clean.en1 \
|
||||
TRAIN_TRG=en1-en2/simplewiki_v1/train/simplewiki_v1-training.en-en.clean.en2 \
|
||||
DEV_SRC=en1-en2/simplewiki_v1/dev/simplewiki_v1-tuning.en-en.clean.en1 \
|
||||
DEV_TRG=en1-en2/simplewiki_v1/dev/simplewiki_v1-tuning.en-en.clean.en2 \
|
||||
TEST_SRC=en1-en2/simplewiki_v1/test/simplewiki_v1-testing.en-en.clean.en1 \
|
||||
TEST_TRG=en1-en2/simplewiki_v1/test/simplewiki_v1-testing.en-en.clean.en2 \
|
||||
all
|
||||
|
||||
|
||||
.PHONY: news-enfi
|
||||
news-enfi:
|
||||
@ -256,9 +275,13 @@ endif
|
||||
## data: phony aggregate target that requests the pre-processed (.pre.gz)
## training and development files for both the source and the target side
.PHONY: data
|
||||
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
|
||||
|
||||
.PHONY: basemodel
|
||||
basemodel: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
|
||||
mkdir -p ${dir $@}
|
||||
ifneq (${BASEMODELZIP},)
|
||||
ifneq (${wildcard ${BASEMODELHOME}/${BASEMODELZIP}},)
|
||||
unzip -u -d ${dir $@} ${BASEMODELHOME}/${BASEMODELZIP}
|
||||
else ifneq (${BASEMODELZIP},)
|
||||
wget -nv -O ${dir $@}/${BASEMODELZIP} ${BASEMODELHOME}/${BASEMODELZIP}
|
||||
cd ${dir $@} && unzip -u ${BASEMODELZIP}
|
||||
else
|
||||
@ -266,16 +289,19 @@ else
|
||||
endif
|
||||
|
||||
|
||||
## default argument strings for ${LANGPAIR}/${BASEMODELNAME}/preprocess.sh:
## the language ID followed by the base model's source.spm / target.spm file.
## Recursive (=) assignments, so ${SRC}/${TRG}/${LANGPAIR} are expanded when
## the variables are used; both can be replaced on the make command line.
SRCPRE_PARA = ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm
|
||||
TRGPRE_PARA = ${TRG} ${LANGPAIR}/${BASEMODELNAME}/target.spm
|
||||
|
||||
## source side: static pattern rule that turns %.gz into %.pre.gz for the
## train/dev/test source files by decompressing, piping through the base
## model's preprocess.sh and compressing again. decoder.yml is a prerequisite,
## so the base model directory is set up before preprocess.sh is called.
## The resulting files are declared .INTERMEDIATE.
## NOTE(review): two preprocess.sh stages are listed below (explicit
## "${SRC} .../source.spm" arguments and ${SRCPRE_PARA}) -- presumably the
## old and the new version of the same line; confirm that only one remains.
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
|
||||
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||
zcat $< |\
|
||||
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
|
||||
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRCPRE_PARA} |\
|
||||
gzip -c > $@
|
||||
|
||||
## target side: static pattern rule that turns %.gz into %.pre.gz for the
## train/dev target files by decompressing, piping through the base model's
## preprocess.sh and compressing again. decoder.yml is a prerequisite, so the
## base model directory is set up before preprocess.sh is called.
## The resulting files are declared .INTERMEDIATE.
## NOTE(review): two preprocess.sh stages are listed below (explicit
## "${SRC} .../target.spm" arguments and ${TRGPRE_PARA}) -- presumably the
## old and the new version of the same line; confirm that only one remains.
.INTERMEDIATE: ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz
|
||||
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||
zcat $< |\
|
||||
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
|
||||
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${TRGPRE_PARA} |\
|
||||
gzip -c > $@
|
||||
|
||||
|
||||
|
36
models/en+el+es+fi-en+el+es+fi/README.md
Normal file
36
models/en+el+es+fi-en+el+es+fi/README.md
Normal file
@ -0,0 +1,36 @@
|
||||
# opus-2020-03-02.zip
|
||||
|
||||
* dataset: opus
|
||||
* model: transformer
|
||||
* pre-processing: normalization + SentencePiece
|
||||
* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)
|
||||
* download: [opus-2020-03-02.zip](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.zip)
|
||||
* test set translations: [opus-2020-03-02.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.test.txt)
|
||||
* test set scores: [opus-2020-03-02.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en+el+es+fi-en+el+es+fi/opus-2020-03-02.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| newsdev2015-enfi.en.fi | 16.0 | 0.498 |
|
||||
| newssyscomb2009.en.es | 29.9 | 0.570 |
|
||||
| newssyscomb2009.es.en | 29.7 | 0.569 |
|
||||
| news-test2008.en.es | 27.3 | 0.549 |
|
||||
| news-test2008.es.en | 27.3 | 0.548 |
|
||||
| newstest2009.en.es | 28.4 | 0.564 |
|
||||
| newstest2009.es.en | 28.4 | 0.564 |
|
||||
| newstest2010.en.es | 34.0 | 0.599 |
|
||||
| newstest2010.es.en | 34.0 | 0.599 |
|
||||
| newstest2011.en.es | 35.1 | 0.600 |
|
||||
| newstest2012.en.es | 35.4 | 0.602 |
|
||||
| newstest2013.en.es | 31.9 | 0.576 |
|
||||
| newstest2015-enfi.en.fi | 17.8 | 0.509 |
|
||||
| newstest2016-enfi.en.fi | 19.0 | 0.521 |
|
||||
| newstest2017-enfi.en.fi | 21.2 | 0.539 |
|
||||
| newstest2018-enfi.en.fi | 13.9 | 0.478 |
|
||||
| newstest2019-enfi.en.fi | 18.8 | 0.503 |
|
||||
| newstestB2016-enfi.en.fi | 14.9 | 0.491 |
|
||||
| newstestB2017-enfi.en.fi | 16.9 | 0.503 |
|
||||
| simplification.en.en | 63.0 | 0.798 |
|
||||
| Tatoeba.en.fi | 56.7 | 0.719 |
|
||||
|
@ -11,5 +11,20 @@
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| Tatoeba.en.ml | 50.4 | 0.598 |
|
||||
| Bible.en.ml | 50.4 | 0.598 |
|
||||
|
||||
# opus+bt-2020-03-02.zip
|
||||
|
||||
* dataset: opus+bt
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece
|
||||
* download: [opus+bt-2020-03-02.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.zip)
|
||||
* test set translations: [opus+bt-2020-03-02.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.test.txt)
|
||||
* test set scores: [opus+bt-2020-03-02.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ml/opus+bt-2020-03-02.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| Tatoeba.en.ml | 17.0 | 0.507 |
|
||||
|
||||
|
@ -43,3 +43,18 @@
|
||||
|-----------------------|-------|-------|
|
||||
| Tatoeba.ml.en | 42.6 | 0.591 |
|
||||
|
||||
# opus-2020-03-01.zip
|
||||
|
||||
* dataset: opus
|
||||
* model: transformer-align
|
||||
* pre-processing: normalization + SentencePiece
|
||||
* download: [opus-2020-03-01.zip](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.zip)
|
||||
* test set translations: [opus-2020-03-01.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.test.txt)
|
||||
* test set scores: [opus-2020-03-01.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-03-01.eval.txt)
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| testset | BLEU | chr-F |
|
||||
|-----------------------|-------|-------|
|
||||
| Tatoeba.ml.en | 40.5 | 0.576 |
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# USAGE preprocess.sh source-langid target-langid bpecodes < input > output
|
||||
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
|
||||
#
|
||||
#
|
||||
# replace MOSESHOME and SNMTPATH with your own setup!
|
||||
@ -25,11 +25,20 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
|
||||
|
||||
THREADS=4
|
||||
|
||||
# Read raw text from stdin: normalise punctuation, tokenise for language $1,
# squeeze whitespace, apply the BPE codes in $3 and prefix every line with
# the target-language token ">>$2<< ".
${TOKENIZER}/replace-unicode-punctuation.perl |
${TOKENIZER}/remove-non-printing-char.perl |
${TOKENIZER}/normalize-punctuation.perl -l $1 |
${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 |
# two spaces before '*': squeeze runs of spaces only; 's/ */ /g' would also
# match the empty string and put a space between every character
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
python3 ${SNMTPATH}/apply_bpe.py -c $3 |
sed "s/^/>>$2<< /"
|
||||
# Read raw text from stdin: normalise punctuation, tokenise for language $1,
# squeeze whitespace and apply the BPE codes in $3.
# Optional 4th argument "noflags": emit the segmented text as is; otherwise
# every line is prefixed with the target-language token ">>$2<< ".
# "${4:-}" is quoted with an empty default so that a call with only three
# arguments does not make `[` abort with "unary operator expected".
if [ "${4:-}" == "noflags" ]; then
    ${TOKENIZER}/replace-unicode-punctuation.perl |
    ${TOKENIZER}/remove-non-printing-char.perl |
    ${TOKENIZER}/normalize-punctuation.perl -l "$1" |
    ${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l "$1" |
    # two spaces before '*': squeeze runs of spaces only; 's/ */ /g' would
    # also match the empty string and put a space between every character
    sed 's/  */ /g;s/^ *//g;s/ *$//g' |
    python3 ${SNMTPATH}/apply_bpe.py -c "$3"
else
    ${TOKENIZER}/replace-unicode-punctuation.perl |
    ${TOKENIZER}/remove-non-printing-char.perl |
    ${TOKENIZER}/normalize-punctuation.perl -l "$1" |
    ${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l "$1" |
    sed 's/  */ /g;s/^ *//g;s/ *$//g' |
    python3 ${SNMTPATH}/apply_bpe.py -c "$3" |
    sed "s/^/>>$2<< /"
fi
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# USAGE preprocess.sh source-langid target-langid bpecodes < input > output
|
||||
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
|
||||
#
|
||||
#
|
||||
# replace MOSESHOME and SPMENCODE with your own setup!
|
||||
@ -21,11 +21,17 @@ fi
|
||||
MOSESSCRIPTS=${MOSESHOME}/scripts
|
||||
TOKENIZER=${MOSESSCRIPTS}/tokenizer
|
||||
|
||||
|
||||
# Read raw text from stdin: normalise unicode punctuation, drop non-printing
# characters, squeeze whitespace, segment with the model in $3 and prefix
# every line with the target-language token ">>$2<< ".
${TOKENIZER}/replace-unicode-punctuation.perl |
${TOKENIZER}/remove-non-printing-char.perl |
# two spaces before '*': squeeze runs of spaces only; 's/ */ /g' would also
# match the empty string and put a space between every character
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3 |
sed "s/^/>>$2<< /"
|
||||
# Read raw text from stdin: normalise unicode punctuation, drop non-printing
# characters, squeeze whitespace and segment with the model in $3.
# Optional 4th argument "noflags": emit the segmented text as is; otherwise
# every line is prefixed with the target-language token ">>$2<< ".
# "${4:-}" is quoted with an empty default so that a call with only three
# arguments does not make `[` abort with "unary operator expected".
if [ "${4:-}" == "noflags" ]; then
    ${TOKENIZER}/replace-unicode-punctuation.perl |
    ${TOKENIZER}/remove-non-printing-char.perl |
    # two spaces before '*': squeeze runs of spaces only; 's/ */ /g' would
    # also match the empty string and put a space between every character
    sed 's/  */ /g;s/^ *//g;s/ *$//g' |
    ${SPMENCODE} --model "$3"
else
    ${TOKENIZER}/replace-unicode-punctuation.perl |
    ${TOKENIZER}/remove-non-printing-char.perl |
    sed 's/  */ /g;s/^ *//g;s/ *$//g' |
    ${SPMENCODE} --model "$3" |
    sed "s/^/>>$2<< /"
fi
|
||||
|
||||
# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
||||
|
Loading…
Reference in New Issue
Block a user