updated model list

This commit is contained in:
Joerg Tiedemann 2020-11-26 12:57:46 +02:00
parent 3a66dc6fd4
commit 7bd502edcc
28 changed files with 915 additions and 12 deletions

View File

@ -63,6 +63,10 @@ SKIP_LANGPAIRS ?= "nothing"
## ##
# SHUFFLE_DATA = 1 # SHUFFLE_DATA = 1
## devtest data is shuffled by default
SHUFFLE_DEVDATA = 1
## set FIT_DATA_SIZE to a specific value to fit the training data ## set FIT_DATA_SIZE to a specific value to fit the training data
## to a certain number of lines for each language pair in the collection ## to a certain number of lines for each language pair in the collection
## --> especially useful for multilingual models for balancing the ## --> especially useful for multilingual models for balancing the

View File

@ -454,7 +454,11 @@ ${DEV_SRC}.shuffled.gz:
fi \ fi \
done \ done \
done done
ifeq (${SHUFFLE_DEVDATA},0)
paste ${DEV_SRC} ${DEV_TRG} | ${GZIP} -c > $@
else
paste ${DEV_SRC} ${DEV_TRG} | ${UNIQ} | ${SHUFFLE} | ${GZIP} -c > $@ paste ${DEV_SRC} ${DEV_TRG} | ${UNIQ} | ${SHUFFLE} | ${GZIP} -c > $@
endif
echo -n "* total size of shuffled dev data: " >> ${dir ${DEV_SRC}}README.md echo -n "* total size of shuffled dev data: " >> ${dir ${DEV_SRC}}README.md
${GZIP} -cd < $@ | wc -l >> ${dir ${DEV_SRC}}README.md ${GZIP} -cd < $@ | wc -l >> ${dir ${DEV_SRC}}README.md

View File

@ -306,11 +306,11 @@ endif
## document level models ## document level models
## devtest data should not be shuffled
%-doc: %-doc:
${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \ ${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \
PRE=norm SUBWORDS=spm \
PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \
PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \ PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \
SHUFFLE_DEVDATA=0 \
${@:-doc=} ${@:-doc=}

View File

@ -88,11 +88,21 @@
## general parameters for Tatoeba models ## general parameters for Tatoeba models
TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge
TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master ## NEW: release
TATOEBA_WORK ?= ${PWD}/work-tatoeba TATOEBA_VERSION ?= v2020-07-28
TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE}
TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge
# TATOEBA_TEST_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION}
# TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION}
# TATOEBA_MONO_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION}
TATOEBA_TEST_URL := ${TATOEBA_DATAURL}
TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL}
TATOEBA_MONO_URL := ${TATOEBA_DATAURL}
TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_WORK ?= ${PWD}/work-tatoeba
TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE}
TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono
TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models
@ -109,6 +119,7 @@ TATOEBA_PARAMS := TRAINSET=Tatoeba-train \
DEVMINSIZE=200 \ DEVMINSIZE=200 \
WORKHOME=${TATOEBA_WORK} \ WORKHOME=${TATOEBA_WORK} \
MODELSHOME=${PWD}/models-tatoeba \ MODELSHOME=${PWD}/models-tatoeba \
RELEASEDIR=${PWD}/models-tatoeba \
MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \ MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \
MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \ MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
ALT_MODEL_DIR=tatoeba \ ALT_MODEL_DIR=tatoeba \
@ -807,8 +818,13 @@ print-skiplangids:
${TATOEBA_MONO}/%.labels: ${TATOEBA_MONO}/%.labels:
mkdir -p $@.d mkdir -p $@.d
wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar # the old URL without versioning:
tar -C $@.d -xf $@.d/mono.tar -wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar
-tar -C $@.d -xf $@.d/mono.tar
rm -f $@.d/mono.tar
# the new URL variables (NOTE: the versioned forms above are commented out, so these currently still resolve to the unversioned container):
-wget -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar
-tar -C $@.d -xf $@.d/mono.tar
rm -f $@.d/mono.tar rm -f $@.d/mono.tar
find $@.d -name '*.id.gz' | xargs ${ZCAT} | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@ find $@.d -name '*.id.gz' | xargs ${ZCAT} | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@
for c in `find $@.d -name '*.id.gz' | sed 's/\.id\.gz//'`; do \ for c in `find $@.d -name '*.id.gz' | sed 's/\.id\.gz//'`; do \
@ -837,8 +853,9 @@ ${TATOEBA_MONO}/%.labels:
## TODO: should we do some filtering like bitext-match, OPUS-filter ... ## TODO: should we do some filtering like bitext-match, OPUS-filter ...
%/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz: %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz:
mkdir -p $@.d mkdir -p $@.d
-wget -q -O $@.d/train.tar ${TATOEBA_DATAURL}/${LANGPAIR}.tar -wget -q -O $@.d/train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar
-tar -C $@.d -xf $@.d/train.tar -tar -C $@.d -xf $@.d/train.tar
rm -f $@.d/train.tar
if [ -e $@.d/data/${LANGPAIR}/test.src ]; then \ if [ -e $@.d/data/${LANGPAIR}/test.src ]; then \
mv $@.d/data/${LANGPAIR}/test.src ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}; \ mv $@.d/data/${LANGPAIR}/test.src ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}; \
mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}; \ mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}; \

View File

@ -257,7 +257,7 @@ endif
%.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}: %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}:
${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k devdata ${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k devdata
${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k testdata ${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k testdata
./large-context.pl -l ${CONTEXT_SIZE} \ ${SCRIPTDIR}/large-context.pl -l ${CONTEXT_SIZE} \
${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE},%.src.spm${SRCBPESIZE:000=}k,$@} \ ${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE},%.src.spm${SRCBPESIZE:000=}k,$@} \
${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE},%.trg.spm${TRGBPESIZE:000=}k,$@} \ ${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE},%.trg.spm${TRGBPESIZE:000=}k,$@} \
| ${GZIP} > $@.tmp.gz | ${GZIP} > $@.tmp.gz

36
models/ar-fi/README.md Normal file
View File

@ -0,0 +1,36 @@
# opus+bt+thl-2020-05-16.zip
* dataset: opus+bt+thl
* model: transformer-align
* source language(s): ar
* target language(s): fi
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+bt+thl-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus+bt+thl-2020-05-16.zip)
* test set translations: [opus+bt+thl-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus+bt+thl-2020-05-16.test.txt)
* test set scores: [opus+bt+thl-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus+bt+thl-2020-05-16.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| infopankki.ar.fi | 49.7 | 0.750 |
# opus-2020-05-20.zip
* dataset: opus
* model: transformer-align
* source language(s): ar
* target language(s): fi
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus-2020-05-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus-2020-05-20.zip)
* test set translations: [opus-2020-05-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus-2020-05-20.test.txt)
* test set scores: [opus-2020-05-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus-2020-05-20.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| infopankki.ar.fi | 56.8 | 0.779 |

View File

@ -0,0 +1,177 @@
# opus-2020-10-04.zip
* dataset: opus
* model: transformer
* source language(s): de en fi fr nl sv
* target language(s): de en fi fr nl sv
* model: transformer
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)
* download: [opus-2020-10-04.zip](https://object.pouta.csc.fi/OPUS-MT-models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/opus-2020-10-04.zip)
* test set translations: [opus-2020-10-04.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/opus-2020-10-04.test.txt)
* test set scores: [opus-2020-10-04.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/opus-2020-10-04.eval.txt)
## Training data: opus
* de-en:
* de-fi:
* de-fr:
* de-nl:
* de-sv:
* en-de:
* en-fi:
* en-fr:
* en-nl:
* en-sv:
* fi-de:
* fi-en:
* fi-fr:
* fi-nl:
* fi-sv:
* fr-de:
* fr-en:
* fr-fi:
* fr-nl:
* fr-sv:
* nl-de:
* nl-en:
* nl-fi:
* nl-fr:
* nl-sv:
* sv-de:
* sv-en:
* sv-fi:
* sv-fr:
* sv-nl:
* unused dev/test data is added to training data
* total size (opus): 501437198
## Validation data
* de-en: OpenSubtitles, 21508596
* de-fi: OpenSubtitles, 12641471
* de-fr: OpenSubtitles, 15031870
* de-nl: OpenSubtitles, 14722152
* de-sv: OpenSubtitles, 8785182
* de-en: OpenSubtitles, 21508596
* en-fi: OpenSubtitles, 26457741
* en-fr: OpenSubtitles, 39527656
* en-nl: OpenSubtitles, 35049286
* en-sv: OpenSubtitles, 16169056
* de-fi: OpenSubtitles, 12641471
* en-fi: OpenSubtitles, 26457741
* fi-fr: OpenSubtitles, 18216582
* fi-nl: OpenSubtitles, 19868494
* fi-sv: OpenSubtitles, 13138133
* de-fr: OpenSubtitles, 15031870
* en-fr: OpenSubtitles, 39527656
* fi-fr: OpenSubtitles, 18216582
* fr-nl: OpenSubtitles, 23406778
* fr-sv: OpenSubtitles, 11422393
* de-nl: OpenSubtitles, 14722152
* en-nl: OpenSubtitles, 35049286
* fi-nl: OpenSubtitles, 19868494
* fr-nl: OpenSubtitles, 23406778
* nl-sv: OpenSubtitles, 12642707
* de-sv: OpenSubtitles, 8785182
* en-sv: OpenSubtitles, 16169056
* fi-sv: OpenSubtitles, 13138133
* fr-sv: OpenSubtitles, 11422393
* nl-sv: OpenSubtitles, 12642707
* total size of shuffled dev data: 501506812
* devset = top 2500 lines of opus-dev.src.shuffled!
* testset = next 2500 lines of opus-dev.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| euelections_dev2019.de-fr-defr.de.fr | 16.5 | 0.447 |
| euelections_dev2019.fr-de-frde.fr.de | 14.7 | 0.439 |
| fiskmo_testset-fisv.fi.sv | 10.5 | 0.394 |
| fiskmo_testset-svfi.sv.fi | 7.4 | 0.402 |
| goethe-institute-test1-defi.de.fi | 9.7 | 0.381 |
| goethe-institute-test2-defi.de.fi | 9.6 | 0.376 |
| newsdev2015-enfi-enfi.en.fi | 11.3 | 0.423 |
| newsdev2015-enfi-fien.fi.en | 17.3 | 0.453 |
| newsdiscussdev2015-enfr-enfr.en.fr | 26.6 | 0.532 |
| newsdiscussdev2015-enfr-fren.fr.en | 24.3 | 0.512 |
| newsdiscusstest2015-enfr-enfr.en.fr | 29.4 | 0.552 |
| newsdiscusstest2015-enfr-fren.fr.en | 26.6 | 0.521 |
| newssyscomb2009-deen.de.en | 18.6 | 0.468 |
| newssyscomb2009-defr.de.fr | 16.6 | 0.449 |
| newssyscomb2009-ende.en.de | 14.7 | 0.448 |
| newssyscomb2009-enfr.en.fr | 21.5 | 0.502 |
| newssyscomb2009-frde.fr.de | 14.3 | 0.438 |
| newssyscomb2009-fren.fr.en | 22.0 | 0.498 |
| news-test2008-deen.de.en | 18.4 | 0.464 |
| news-test2008-defr.de.fr | 16.3 | 0.443 |
| news-test2008-ende.en.de | 15.1 | 0.437 |
| news-test2008-enfr.en.fr | 19.2 | 0.477 |
| news-test2008-frde.fr.de | 14.1 | 0.431 |
| news-test2008-fren.fr.en | 18.9 | 0.474 |
| newstest2009-deen.de.en | 17.6 | 0.454 |
| newstest2009-defr.de.fr | 15.7 | 0.434 |
| newstest2009-ende.en.de | 14.4 | 0.439 |
| newstest2009-enfr.en.fr | 20.2 | 0.487 |
| newstest2009-frde.fr.de | 13.9 | 0.428 |
| newstest2009-fren.fr.en | 20.6 | 0.484 |
| newstest2010-deen.de.en | 20.1 | 0.484 |
| newstest2010-defr.de.fr | 17.3 | 0.457 |
| newstest2010-ende.en.de | 15.7 | 0.449 |
| newstest2010-enfr.en.fr | 22.4 | 0.506 |
| newstest2010-frde.fr.de | 13.9 | 0.432 |
| newstest2010-fren.fr.en | 22.5 | 0.507 |
| newstest2011-deen.de.en | 17.9 | 0.463 |
| newstest2011-defr.de.fr | 16.5 | 0.444 |
| newstest2011-ende.en.de | 14.4 | 0.435 |
| newstest2011-enfr.en.fr | 23.1 | 0.515 |
| newstest2011-frde.fr.de | 13.6 | 0.422 |
| newstest2011-fren.fr.en | 22.9 | 0.510 |
| newstest2012-deen.de.en | 19.2 | 0.469 |
| newstest2012-defr.de.fr | 16.8 | 0.445 |
| newstest2012-ende.en.de | 14.6 | 0.436 |
| newstest2012-enfr.en.fr | 22.2 | 0.505 |
| newstest2012-frde.fr.de | 14.1 | 0.422 |
| newstest2012-fren.fr.en | 23.1 | 0.506 |
| newstest2013-deen.de.en | 21.3 | 0.486 |
| newstest2013-defr.de.fr | 18.7 | 0.453 |
| newstest2013-ende.en.de | 17.8 | 0.464 |
| newstest2013-enfr.en.fr | 23.9 | 0.505 |
| newstest2013-frde.fr.de | 15.7 | 0.438 |
| newstest2013-fren.fr.en | 23.6 | 0.506 |
| newstest2014-deen-deen.de.en | 20.7 | 0.485 |
| newstest2014-fren-fren.fr.en | 25.5 | 0.537 |
| newstest2015-ende-deen.de.en | 23.2 | 0.501 |
| newstest2015-ende-ende.en.de | 21.4 | 0.493 |
| newstest2015-enfi-enfi.en.fi | 13.2 | 0.445 |
| newstest2015-enfi-fien.fi.en | 18.9 | 0.463 |
| newstest2016-ende-deen.de.en | 25.8 | 0.528 |
| newstest2016-ende-ende.en.de | 22.5 | 0.505 |
| newstest2016-enfi-enfi.en.fi | 14.0 | 0.446 |
| newstest2016-enfi-fien.fi.en | 19.4 | 0.478 |
| newstest2017-ende-deen.de.en | 24.4 | 0.511 |
| newstest2017-ende-ende.en.de | 20.0 | 0.486 |
| newstest2017-enfi-enfi.en.fi | 15.6 | 0.466 |
| newstest2017-enfi-fien.fi.en | 21.7 | 0.496 |
| newstest2018-ende-deen.de.en | 28.4 | 0.542 |
| newstest2018-ende-ende.en.de | 26.6 | 0.537 |
| newstest2018-enfi-enfi.en.fi | 11.1 | 0.421 |
| newstest2018-enfi-fien.fi.en | 16.1 | 0.438 |
| newstest2019-deen-deen.de.en | 22.5 | 0.499 |
| newstest2019-defr-defr.de.fr | 17.0 | 0.460 |
| newstest2019-ende-ende.en.de | 25.3 | 0.523 |
| newstest2019-enfi-enfi.en.fi | 14.1 | 0.438 |
| newstest2019-fien-fien.fi.en | 19.7 | 0.470 |
| newstest2019-frde-frde.fr.de | 14.9 | 0.454 |
| newstestB2016-enfi-enfi.en.fi | 11.2 | 0.422 |
| newstestB2016-enfi-fien.fi.en | 15.9 | 0.443 |
| newstestB2017-enfi-enfi.en.fi | 12.7 | 0.437 |
| newstestB2017-enfi-fien.fi.en | 18.6 | 0.467 |
| newstestB2017-fien-fien.fi.en | 18.6 | 0.467 |
| opus-test.multi.multi | 24.1 | 0.439 |
| simplification-enen.en.en | 49.6 | 0.714 |

35
models/en-chr/README.md Normal file
View File

@ -0,0 +1,35 @@
# opus-2020-05-23.zip
* dataset: opus
* model: transformer-align
* source language(s): en
* target language(s): chr
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm1k,spm1k)
* download: [opus-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-chr/opus-2020-05-23.zip)
* test set translations: [opus-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-chr/opus-2020-05-23.test.txt)
* test set scores: [opus-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-chr/opus-2020-05-23.eval.txt)
## Training data: opus+bt
* en-chr: Tatoeba (22) Ubuntu (6) wikimedia (5)
* en-chr: total size = 33
* unused dev/test data is added to training data
* total size (opus+bt): 10938
## Validation data
* chr-en: bible-uedin, 15905
* total size of shuffled dev data: 15905
* devset = top 2500 lines of bible-uedin.src.shuffled!
* testset = next 2500 lines of bible-uedin.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.en.chr | 44.6 | 0.569 |

33
models/en-cjp/README.md Normal file
View File

@ -0,0 +1,33 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): cjp
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm1k,spm1k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-cjp/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cjp/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cjp/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* unused dev/test data is added to training data
* total size (opus+bt): 10750
## Validation data
* cjp-en: bible-uedin, 15749
* total size of shuffled dev data: 15749
* devset = top 2500 lines of bible-uedin.src.shuffled!
* testset = next 2500 lines of bible-uedin.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.en.cjp | 31.8 | 0.535 |

33
models/en-cop/README.md Normal file
View File

@ -0,0 +1,33 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): cop
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm1k,spm1k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-cop/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cop/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cop/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* unused dev/test data is added to training data
* total size (opus+bt): 10896
## Validation data
* cop-en: bible-uedin, 15901
* total size of shuffled dev data: 15901
* devset = top 2500 lines of bible-uedin.src.shuffled!
* testset = next 2500 lines of bible-uedin.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.en.cop | 45.6 | 0.651 |

32
models/en-dop/README.md Normal file
View File

@ -0,0 +1,32 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): dop
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm1k,spm1k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-dop/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-dop/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-dop/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* unused dev/test data is added to training data
* total size (opus+bt): 10738
## Validation data
* dop-en: bible-uedin
* devset = top 2500 lines of bible-uedin.src.shuffled!
* testset = next 2500 lines of bible-uedin.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.en.dop | 26.0 | 0.480 |

32
models/en-quw/README.md Normal file
View File

@ -0,0 +1,32 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): quw
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm1k,spm1k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-quw/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-quw/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-quw/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* unused dev/test data is added to training data
* total size (opus+bt): 10380
## Validation data
* en-quw: bible-uedin
* devset = top 2500 lines of bible-uedin.src.shuffled!
* testset = next 2500 lines of bible-uedin.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.en.quw | 39.5 | 0.618 |

35
models/en-syr/README.md Normal file
View File

@ -0,0 +1,35 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): syr
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm1k,spm1k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-syr/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-syr/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-syr/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* en-syr: Ubuntu (70)
* en-syr: total size = 70
* unused dev/test data is added to training data
* total size (opus+bt): 10970
## Validation data
* en-syr: bible-uedin, 15899
* total size of shuffled dev data: 15899
* devset = top 2500 lines of bible-uedin.src.shuffled!
* testset = next 2500 lines of bible-uedin.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.en.syr | 49.6 | 0.681 |

35
models/en-ve/README.md Normal file
View File

@ -0,0 +1,35 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): ve
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ve/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ve/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ve/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* en-ve: Ubuntu (4) wikimedia (2)
* en-ve: total size = 6
* unused dev/test data is added to training data
* total size (opus+bt): 208084
## Validation data
* en-ve: JW300, 213086
* total size of shuffled dev data: 213086
* devset = top 2500 lines of JW300.src.shuffled!
* testset = next 2500 lines of JW300.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.ve | 40.5 | 0.616 |

32
models/en-wls/README.md Normal file
View File

@ -0,0 +1,32 @@
# opus-2020-05-23.zip
* dataset: opus
* model: transformer-align
* source language(s): en
* target language(s): wls
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-wls/opus-2020-05-23.zip)
* test set translations: [opus-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-wls/opus-2020-05-23.test.txt)
* test set scores: [opus-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-wls/opus-2020-05-23.eval.txt)
## Training data: opus+bt
* unused dev/test data is added to training data
* total size (opus+bt): 156156
## Validation data
* en-wls: JW300
* devset = top 2500 lines of JW300.src.shuffled!
* testset = next 2500 lines of JW300.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.wls | 35.6 | 0.565 |

34
models/en-wo/README.md Normal file
View File

@ -0,0 +1,34 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): wo
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm1k,spm1k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-wo/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-wo/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-wo/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* en-wo: Tatoeba (3) Ubuntu (126)
* en-wo: total size = 129
* unused dev/test data is added to training data
* total size (opus+bt): 10931
## Validation data
* en-wo: bible-uedin
* devset = top 2500 lines of bible-uedin.src.shuffled!
* testset = next 2500 lines of bible-uedin.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| bible-uedin.en.wo | 46.6 | 0.604 |

32
models/en-yap/README.md Normal file
View File

@ -0,0 +1,32 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): yap
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-yap/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-yap/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-yap/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* unused dev/test data is added to training data
* total size (opus+bt): 120218
## Validation data
* en-yap: JW300
* devset = top 2500 lines of JW300.src.shuffled!
* testset = next 2500 lines of JW300.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.yap | 28.5 | 0.471 |

34
models/en-yo/README.md Normal file
View File

@ -0,0 +1,34 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): yo
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-yo/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-yo/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-yo/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* en-yo: GNOME (3928) Tatoeba (31) Ubuntu (76)
* en-yo: total size = 4035
* unused dev/test data is added to training data
* total size (opus+bt): 435278
## Validation data
* en-yo: JW300
* devset = top 2500 lines of JW300.src.shuffled!
* testset = next 2500 lines of JW300.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.yo | 35.2 | 0.515 |

33
models/en-zne/README.md Normal file
View File

@ -0,0 +1,33 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): en
* target language(s): zne
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-zne/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-zne/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-zne/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* unused dev/test data is added to training data
* total size (opus+bt): 184853
## Validation data
* en-zne: JW300, 189924
* total size of shuffled dev data: 189924
* devset = top 2500 lines of JW300.src.shuffled!
* testset = next 2500 lines of JW300.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.zne | 32.0 | 0.556 |

36
models/fa-fi/README.md Normal file
View File

@ -0,0 +1,36 @@
# opus-2020-05-20.zip
* dataset: opus
* model: transformer-align
* source language(s): fa
* target language(s): fi
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus-2020-05-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus-2020-05-20.zip)
* test set translations: [opus-2020-05-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus-2020-05-20.test.txt)
* test set scores: [opus-2020-05-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus-2020-05-20.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.fa.fi | 17.5 | 0.407 |
# opus+bt+thl-2020-05-20.zip
* dataset: opus+bt+thl
* model: transformer
* source language(s): fa
* target language(s): fi
* model: transformer
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+bt+thl-2020-05-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus+bt+thl-2020-05-20.zip)
* test set translations: [opus+bt+thl-2020-05-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus+bt+thl-2020-05-20.test.txt)
* test set scores: [opus+bt+thl-2020-05-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus+bt+thl-2020-05-20.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| infopankki.fa.fi | 57.9 | 0.773 |

47
models/fi+sv-en/README.md Normal file
View File

@ -0,0 +1,47 @@
# opus-2020-10-10.zip
* dataset: opus
* model: transformer
* source language(s): fi sv
* target language(s): en
* model: transformer
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus-2020-10-10.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi+sv-en/opus-2020-10-10.zip)
* test set translations: [opus-2020-10-10.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi+sv-en/opus-2020-10-10.test.txt)
* test set scores: [opus-2020-10-10.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi+sv-en/opus-2020-10-10.eval.txt)
## Training data: opus
* fi-en: Books (3618) DGT (4887825) ECB (139089) ELRA-W0217 (9772) ELRA-W0220 (53703) ELRA-W0305 (15) ELRC_2922 (312) ELRC_2923 (394) ELRC_3382 (3357) ELRC_416 (696) EMEA (875550) EUbookshop (2027241) EUconst (7220) Europarl (1954995) GNOME (59709) JRC-Acquis (15927) JW300 (2001165) KDE4 (90150) OpenSubtitles (26457741) PHP (24293) ParaCrawl (3089564) QED (98509) TildeMODEL (2983582) Ubuntu (7470) bible-uedin (61917) infopankki (84378)
* fi-en: total size = 44938192
* sv-en: Books (3047) DGT (4780207) ELRA-W0130 (2170) ELRA-W0213 (1924) ELRA-W0222 (6560) ELRA-W0239 (8265) ELRA-W0305 (1132) ELRC_2922 (499) ELRC_2923 (492) ELRC_3382 (3738) ELRC_416 (1062) EMEA (840417) EUbookshop (1885825) EUconst (7010) Europarl (1870175) GNOME (126) GlobalVoices (8012) JRC-Acquis (666453) JW300 (1641702) KDE4 (190266) OpenSubtitles (16169056) PHP (18420) ParaCrawl (6000734) QED (161764) RF (174) Tanzil (126202) TildeMODEL (3102585) Ubuntu (5678) WikiSource (32427) bible-uedin (61205) infopankki (51688)
* sv-en: total size = 37649015
* unused dev/test data is added to training data
* total size (opus): 82379895
## Validation data
* en-fi: Tatoeba, 78868
* en-sv: Tatoeba, 24256
* total size of shuffled dev data: 103084
* devset = top 2500 lines of opus-dev.src.shuffled!
* testset = next 2500 lines of opus-dev.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| newsdev2015-enfi-fien.fi.en | 25.5 | 0.536 |
| newstest2015-enfi-fien.fi.en | 26.9 | 0.545 |
| newstest2016-enfi-fien.fi.en | 28.6 | 0.567 |
| newstest2017-enfi-fien.fi.en | 31.9 | 0.589 |
| newstest2018-enfi-fien.fi.en | 23.4 | 0.514 |
| newstest2019-fien-fien.fi.en | 28.5 | 0.560 |
| newstestB2016-enfi-fien.fi.en | 23.8 | 0.523 |
| newstestB2017-enfi-fien.fi.en | 27.2 | 0.554 |
| newstestB2017-fien-fien.fi.en | 27.2 | 0.554 |
| opus-test.multi.en | 58.2 | 0.721 |

18
models/fi-ar/README.md Normal file
View File

@ -0,0 +1,18 @@
# opus-2020-05-16.zip
* dataset: opus
* model: transformer-align
* source language(s): fi
* target language(s): ar
* model: transformer-align
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-ar/opus-2020-05-16.zip)
* test set translations: [opus-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-ar/opus-2020-05-16.test.txt)
* test set scores: [opus-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-ar/opus-2020-05-16.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| infopankki.fi.ar | 42.9 | 0.674 |

25
models/fi-fa/README.md Normal file
View File

@ -0,0 +1,25 @@
# opus+bt-2020-05-20.zip
* dataset: opus+bt
* model: transformer
* source language(s): fi
* target language(s): fa
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus+bt-2020-05-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-fa/opus+bt-2020-05-20.zip)
* test set translations: [opus+bt-2020-05-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-fa/opus+bt-2020-05-20.test.txt)
* test set scores: [opus+bt-2020-05-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-fa/opus+bt-2020-05-20.eval.txt)
## Training data: opus+bt
* fa-fi: GNOME JW300 KDE4 OpenSubtitles QED Ubuntu wikimedia
* fa-fi backtranslations: backtranslate/fa-fi/latest/wiki.aa.fa-fi backtranslate/fa-fi/latest/wikinews.aa.fa-fi backtranslate/fa-fi/latest/wikiquote.aa.fa-fi
* unused dev/test data is added to training data
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| infopankki.fi.fa | 48.9 | 0.673 |

18
models/fi-pl/README.md Normal file
View File

@ -0,0 +1,18 @@
# opus-2020-05-16.zip
* dataset: opus
* model: transformer-align
* source language(s): fi
* target language(s): pl
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-pl/opus-2020-05-16.zip)
* test set translations: [opus-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-pl/opus-2020-05-16.test.txt)
* test set scores: [opus-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-pl/opus-2020-05-16.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.fi.pl | 47.9 | 0.673 |

50
models/nb+nn+no+nb_NO+nn_NO+no_nb-en/README.md Normal file
View File

@ -0,0 +1,50 @@
# opus-2020-05-22.zip
* dataset: opus
* model: transformer-align
* source language(s): nb nn no nb_NO nn_NO no_nb
* target language(s): en
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus-2020-05-22.zip](https://object.pouta.csc.fi/OPUS-MT-models/nb+nn+no+nb_NO+nn_NO+no_nb-en/opus-2020-05-22.zip)
* test set translations: [opus-2020-05-22.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/nb+nn+no+nb_NO+nn_NO+no_nb-en/opus-2020-05-22.test.txt)
* test set scores: [opus-2020-05-22.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/nb+nn+no+nb_NO+nn_NO+no_nb-en/opus-2020-05-22.eval.txt)
## Training data: opus
* nb-en: EUbookshop (27499) GNOME (116) KDE4 (93556) QED (124570) Ubuntu (3294)
* nb-en: total size = 249035
* nn-en: GNOME (367687) KDE4 (76910) QED (3569) Ubuntu (38767) wikimedia (383)
* nn-en: total size = 487316
* no-en: bible-uedin (61093) Books (3412) GNOME (7124) OpenSubtitles (8071047) Tanzil (134647) TildeMODEL (325194) Ubuntu (7925) wikimedia (66)
* no-en: total size = 8610508
* nb_NO-en: GNOME (1)
* nb_NO-en: total size = 1
* nn_NO-en: GNOME (1)
* nn_NO-en: total size = 1
* no_nb-en: GNOME (20)
* no_nb-en: total size = 20
* unused dev/test data is added to training data
* total size (opus): 11179800
## Validation data
* en-nb: Tatoeba, 9282
* en-nn: Tatoeba, 940
* en-no: JW300, 1837668
* en-nb_NO: bible-uedin, 0
* en-nn_NO: bible-uedin, 0
* en-no_nb: bible-uedin, 0
* total size of shuffled dev data: 1847890
* devset = top 5000 lines of opus-dev.src.shuffled!
* testset = next 5000 lines of opus-dev.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| opus-test.nb.en | 44.7 | 0.623 |

18
models/pl-fi/README.md Normal file
View File

@ -0,0 +1,18 @@
# opus-2020-05-16.zip
* dataset: opus
* model: transformer-align
* source language(s): pl
* target language(s): fi
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/pl-fi/opus-2020-05-16.zip)
* test set translations: [opus-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/pl-fi/opus-2020-05-16.test.txt)
* test set scores: [opus-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/pl-fi/opus-2020-05-16.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.pl.fi | 41.0 | 0.640 |

18
models/sq-fi/README.md Normal file
View File

@ -0,0 +1,18 @@
# opus-2020-05-16.zip
* dataset: opus
* model: transformer-align
* source language(s): sq
* target language(s): fi
* pre-processing: normalization + SentencePiece (spm32k,spm32k)
* download: [opus-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/sq-fi/opus-2020-05-16.zip)
* test set translations: [opus-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/sq-fi/opus-2020-05-16.test.txt)
* test set scores: [opus-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/sq-fi/opus-2020-05-16.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.sq.fi | 30.2 | 0.556 |

35
models/zne-en/README.md Normal file
View File

@ -0,0 +1,35 @@
# opus+bt-2020-05-23.zip
* dataset: opus+bt
* model: transformer-align
* source language(s): zne
* target language(s): en
* pre-processing: normalization + SentencePiece (spm4k,spm4k)
* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/zne-en/opus+bt-2020-05-23.zip)
* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/zne-en/opus+bt-2020-05-23.test.txt)
* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/zne-en/opus+bt-2020-05-23.eval.txt)
## Training data: opus+bt
* zne-en:
* zne-en: total size = 0
* unused dev/test data is added to training data
* total size (opus+bt): 184853
## Validation data
* en-zne: JW300, 189924
* total size of shuffled dev data: 189924
* devset = top 2500 lines of JW300.src.shuffled!
* testset = next 2500 lines of JW300.src.shuffled!
* remaining lines are added to traindata
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.zne.en | 35.2 | 0.510 |