From 7bd502edccc77d757a06d716319179301e7b8cd3 Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Thu, 26 Nov 2020 12:57:46 +0200 Subject: [PATCH] updated model list --- lib/config.mk | 4 + lib/data.mk | 4 + lib/generic.mk | 6 +- lib/projects/tatoeba.mk | 33 +++- lib/sentencepiece.mk | 2 +- models/ar-fi/README.md | 36 ++++ .../README.md | 177 ++++++++++++++++++ models/en-chr/README.md | 35 ++++ models/en-cjp/README.md | 33 ++++ models/en-cop/README.md | 33 ++++ models/en-dop/README.md | 32 ++++ models/en-quw/README.md | 32 ++++ models/en-syr/README.md | 35 ++++ models/en-ve/README.md | 35 ++++ models/en-wls/README.md | 32 ++++ models/en-wo/README.md | 34 ++++ models/en-yap/README.md | 32 ++++ models/en-yo/README.md | 34 ++++ models/en-zne/README.md | 33 ++++ models/fa-fi/README.md | 36 ++++ models/fi+sv-en/README.md | 47 +++++ models/fi-ar/README.md | 18 ++ models/fi-fa/README.md | 25 +++ models/fi-pl/README.md | 18 ++ .../nb+nn+no+nb_NO+nn_NO+no_nb-en/README.md | 50 +++++ models/pl-fi/README.md | 18 ++ models/sq-fi/README.md | 18 ++ models/zne-en/README.md | 35 ++++ 28 files changed, 915 insertions(+), 12 deletions(-) create mode 100644 models/ar-fi/README.md create mode 100644 models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/README.md create mode 100644 models/en-chr/README.md create mode 100644 models/en-cjp/README.md create mode 100644 models/en-cop/README.md create mode 100644 models/en-dop/README.md create mode 100644 models/en-quw/README.md create mode 100644 models/en-syr/README.md create mode 100644 models/en-ve/README.md create mode 100644 models/en-wls/README.md create mode 100644 models/en-wo/README.md create mode 100644 models/en-yap/README.md create mode 100644 models/en-yo/README.md create mode 100644 models/en-zne/README.md create mode 100644 models/fa-fi/README.md create mode 100644 models/fi+sv-en/README.md create mode 100644 models/fi-ar/README.md create mode 100644 models/fi-fa/README.md create mode 100644 models/fi-pl/README.md create mode 100644 models/nb+nn+no+nb_NO+nn_NO+no_nb-en/README.md create mode 100644 models/pl-fi/README.md create mode 100644 models/sq-fi/README.md create mode 100644 models/zne-en/README.md diff --git a/lib/config.mk b/lib/config.mk index 0b20487d..20213f74 100644 --- a/lib/config.mk +++ b/lib/config.mk @@ -63,6 +63,10 @@ SKIP_LANGPAIRS ?= "nothing" ## # SHUFFLE_DATA = 1 +## devtest data is shuffled by default +SHUFFLE_DEVDATA = 1 + + ## set FIT_DATA_SIZE to a specific value to fit the training data ## to a certain number of lines for each language pair in the collection ## --> especially useful for multilingual models for balancing the diff --git a/lib/data.mk b/lib/data.mk index 7e265f12..4c45b426 100644 --- a/lib/data.mk +++ b/lib/data.mk @@ -454,7 +454,11 @@ ${DEV_SRC}.shuffled.gz: fi \ done \ done +ifeq (${SHUFFLE_DEVDATA},0) + paste ${DEV_SRC} ${DEV_TRG} | ${GZIP} -c > $@ +else paste ${DEV_SRC} ${DEV_TRG} | ${UNIQ} | ${SHUFFLE} | ${GZIP} -c > $@ +endif echo -n "* total size of shuffled dev data: " >> ${dir ${DEV_SRC}}README.md ${GZIP} -cd < $@ | wc -l >> ${dir ${DEV_SRC}}README.md diff --git a/lib/generic.mk b/lib/generic.mk index 8a1cda77..48350d8c 100644 --- a/lib/generic.mk +++ b/lib/generic.mk @@ -306,11 +306,11 @@ endif ## document level models +## devtest data should not be shuffled %-doc: - ${MAKE} WORKHOME=${shell realpath ${PWD}/work-spm} \ - PRE=norm SUBWORDS=spm \ - PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \ + ${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k.doc${CONTEXT_SIZE} \ PRE_TRG=spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE} \ + SHUFFLE_DEVDATA=0 \ ${@:-doc=} diff --git a/lib/projects/tatoeba.mk b/lib/projects/tatoeba.mk index 2e30dad0..5d3e6ff3 100644 --- a/lib/projects/tatoeba.mk +++ b/lib/projects/tatoeba.mk @@ -88,11 +88,21 @@ ## general parameters for Tatoeba models -TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge -TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master -TATOEBA_WORK ?= ${PWD}/work-tatoeba -TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE} -TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono + +## NEW: release +TATOEBA_VERSION ?= v2020-07-28 + +TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge +# TATOEBA_TEST_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION} +# TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION} +# TATOEBA_MONO_URL := ${TATOEBA_DATAURL}-${TATOEBA_VERSION} +TATOEBA_TEST_URL := ${TATOEBA_DATAURL} +TATOEBA_TRAIN_URL := ${TATOEBA_DATAURL} +TATOEBA_MONO_URL := ${TATOEBA_DATAURL} +TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master +TATOEBA_WORK ?= ${PWD}/work-tatoeba +TATOEBA_DATA ?= ${TATOEBA_WORK}/data/${PRE} +TATOEBA_MONO ?= ${TATOEBA_WORK}/data/mono TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models @@ -109,6 +119,7 @@ TATOEBA_PARAMS := TRAINSET=Tatoeba-train \ DEVMINSIZE=200 \ WORKHOME=${TATOEBA_WORK} \ MODELSHOME=${PWD}/models-tatoeba \ + RELEASEDIR=${PWD}/models-tatoeba \ MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \ MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \ ALT_MODEL_DIR=tatoeba \ @@ -807,8 +818,13 @@ print-skiplangids: ${TATOEBA_MONO}/%.labels: mkdir -p $@.d - wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar - tar -C $@.d -xf $@.d/mono.tar +# the old URL without versioning: + -wget -q -O $@.d/mono.tar ${TATOEBA_DATAURL}/$(patsubst %.labels,%,$(notdir $@)).tar + -tar -C $@.d -xf $@.d/mono.tar + rm -f $@.d/mono.tar +# the new URLs with versioning: + -wget -q -O $@.d/mono.tar ${TATOEBA_MONO_URL}/$(patsubst %.labels,%,$(notdir $@)).tar + -tar -C $@.d -xf $@.d/mono.tar rm -f $@.d/mono.tar find $@.d -name '*.id.gz' | xargs ${ZCAT} | sort -u | tr "\n" ' ' | sed 's/ $$//' > $@ for c in `find $@.d -name '*.id.gz' | sed 's/\.id\.gz//'`; do \ @@ -837,8 +853,9 @@ ${TATOEBA_MONO}/%.labels: ## TODO: should we do some filtering like bitext-match, OPUS-filter ... %/Tatoeba-train.${LANGPAIR}.clean.${SRCEXT}.gz: mkdir -p $@.d - -wget -q -O $@.d/train.tar ${TATOEBA_DATAURL}/${LANGPAIR}.tar + -wget -q -O $@.d/train.tar ${TATOEBA_TRAIN_URL}/${LANGPAIR}.tar -tar -C $@.d -xf $@.d/train.tar + rm -f $@.d/train.tar if [ -e $@.d/data/${LANGPAIR}/test.src ]; then \ mv $@.d/data/${LANGPAIR}/test.src ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${SRCEXT}; \ mv $@.d/data/${LANGPAIR}/test.trg ${dir $@}Tatoeba-test.${LANGPAIR}.clean.${TRGEXT}; \ diff --git a/lib/sentencepiece.mk b/lib/sentencepiece.mk index 14aecb69..bb480952 100644 --- a/lib/sentencepiece.mk +++ b/lib/sentencepiece.mk @@ -257,7 +257,7 @@ endif %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE}: ${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k devdata ${MAKE} PRE_SRC=spm${SRCBPESIZE:000=}k PRE_TRG=spm${TRGBPESIZE:000=}k testdata - ./large-context.pl -l ${CONTEXT_SIZE} \ + ${SCRIPTDIR}/large-context.pl -l ${CONTEXT_SIZE} \ ${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE},%.src.spm${SRCBPESIZE:000=}k,$@} \ ${patsubst %.src.spm${TRGBPESIZE:000=}k.doc${CONTEXT_SIZE},%.trg.spm${TRGBPESIZE:000=}k,$@} \ | ${GZIP} > $@.tmp.gz diff --git a/models/ar-fi/README.md b/models/ar-fi/README.md new file mode 100644 index 00000000..4ba4f584 --- /dev/null +++ b/models/ar-fi/README.md @@ -0,0 +1,36 @@ +# opus+bt+thl-2020-05-16.zip + +* dataset: opus+bt+thl +* model: transformer-align +* source language(s): ar +* target language(s): fi +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus+bt+thl-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus+bt+thl-2020-05-16.zip) +* test set translations: [opus+bt+thl-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus+bt+thl-2020-05-16.test.txt) +* test set scores: [opus+bt+thl-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus+bt+thl-2020-05-16.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| infopankki.ar.fi | 49.7 | 0.750 | + +# opus-2020-05-20.zip + +* dataset: opus +* model: transformer-align +* source language(s): ar +* target language(s): fi +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus-2020-05-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus-2020-05-20.zip) +* test set translations: [opus-2020-05-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus-2020-05-20.test.txt) +* test set scores: [opus-2020-05-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ar-fi/opus-2020-05-20.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| infopankki.ar.fi | 56.8 | 0.779 | + diff --git a/models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/README.md b/models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/README.md new file mode 100644 index 00000000..4c63294f --- /dev/null +++ b/models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/README.md @@ -0,0 +1,177 @@ +# opus-2020-10-04.zip + +* dataset: opus +* model: transformer +* source language(s): de en fi fr nl sv +* target language(s): de en fi fr nl sv +* model: transformer +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID) +* download: [opus-2020-10-04.zip](https://object.pouta.csc.fi/OPUS-MT-models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/opus-2020-10-04.zip) +* test set translations: [opus-2020-10-04.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/opus-2020-10-04.test.txt) +* test set scores: [opus-2020-10-04.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/de+en+fi+fr+nl+sv-de+en+fi+fr+nl+sv/opus-2020-10-04.eval.txt) + +## Training data: opus + +* de-en: +* de-fi: +* de-fr: +* de-nl: +* de-sv: +* en-de: +* en-fi: +* en-fr: +* en-nl: +* en-sv: +* fi-de: +* fi-en: +* fi-fr: +* fi-nl: +* fi-sv: +* fr-de: +* fr-en: +* fr-fi: +* fr-nl: +* fr-sv: +* nl-de: +* nl-en: +* nl-fi: +* nl-fr: +* nl-sv: +* sv-de: +* sv-en: +* sv-fi: +* sv-fr: +* sv-nl: +* unused dev/test data is added to training data +* total size (opus): 501437198 + + +## Validation data + +* de-en: OpenSubtitles, 21508596 +* de-fi: OpenSubtitles, 12641471 +* de-fr: OpenSubtitles, 15031870 +* de-nl: OpenSubtitles, 14722152 +* de-sv: OpenSubtitles, 8785182 +* de-en: OpenSubtitles, 21508596 +* en-fi: OpenSubtitles, 26457741 +* en-fr: OpenSubtitles, 39527656 +* en-nl: OpenSubtitles, 35049286 +* en-sv: OpenSubtitles, 16169056 +* de-fi: OpenSubtitles, 12641471 +* en-fi: OpenSubtitles, 26457741 +* fi-fr: OpenSubtitles, 18216582 +* fi-nl: OpenSubtitles, 19868494 +* fi-sv: OpenSubtitles, 13138133 +* de-fr: OpenSubtitles, 15031870 +* en-fr: OpenSubtitles, 39527656 +* fi-fr: OpenSubtitles, 18216582 +* fr-nl: OpenSubtitles, 23406778 +* fr-sv: OpenSubtitles, 11422393 +* de-nl: OpenSubtitles, 14722152 +* en-nl: OpenSubtitles, 35049286 +* fi-nl: OpenSubtitles, 19868494 +* fr-nl: OpenSubtitles, 23406778 +* nl-sv: OpenSubtitles, 12642707 +* de-sv: OpenSubtitles, 8785182 +* en-sv: OpenSubtitles, 16169056 +* fi-sv: OpenSubtitles, 13138133 +* fr-sv: OpenSubtitles, 11422393 +* nl-sv: OpenSubtitles, 12642707 +* total size of shuffled dev data: 501506812 + +* devset = top 2500 lines of opus-dev.src.shuffled! +* testset = next 2500 lines of opus-dev.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| euelections_dev2019.de-fr-defr.de.fr | 16.5 | 0.447 | +| euelections_dev2019.fr-de-frde.fr.de | 14.7 | 0.439 | +| fiskmo_testset-fisv.fi.sv | 10.5 | 0.394 | +| fiskmo_testset-svfi.sv.fi | 7.4 | 0.402 | +| goethe-institute-test1-defi.de.fi | 9.7 | 0.381 | +| goethe-institute-test2-defi.de.fi | 9.6 | 0.376 | +| newsdev2015-enfi-enfi.en.fi | 11.3 | 0.423 | +| newsdev2015-enfi-fien.fi.en | 17.3 | 0.453 | +| newsdiscussdev2015-enfr-enfr.en.fr | 26.6 | 0.532 | +| newsdiscussdev2015-enfr-fren.fr.en | 24.3 | 0.512 | +| newsdiscusstest2015-enfr-enfr.en.fr | 29.4 | 0.552 | +| newsdiscusstest2015-enfr-fren.fr.en | 26.6 | 0.521 | +| newssyscomb2009-deen.de.en | 18.6 | 0.468 | +| newssyscomb2009-defr.de.fr | 16.6 | 0.449 | +| newssyscomb2009-ende.en.de | 14.7 | 0.448 | +| newssyscomb2009-enfr.en.fr | 21.5 | 0.502 | +| newssyscomb2009-frde.fr.de | 14.3 | 0.438 | +| newssyscomb2009-fren.fr.en | 22.0 | 0.498 | +| news-test2008-deen.de.en | 18.4 | 0.464 | +| news-test2008-defr.de.fr | 16.3 | 0.443 | +| news-test2008-ende.en.de | 15.1 | 0.437 | +| news-test2008-enfr.en.fr | 19.2 | 0.477 | +| news-test2008-frde.fr.de | 14.1 | 0.431 | +| news-test2008-fren.fr.en | 18.9 | 0.474 | +| newstest2009-deen.de.en | 17.6 | 0.454 | +| newstest2009-defr.de.fr | 15.7 | 0.434 | +| newstest2009-ende.en.de | 14.4 | 0.439 | +| newstest2009-enfr.en.fr | 20.2 | 0.487 | +| newstest2009-frde.fr.de | 13.9 | 0.428 | +| newstest2009-fren.fr.en | 20.6 | 0.484 | +| newstest2010-deen.de.en | 20.1 | 0.484 | +| newstest2010-defr.de.fr | 17.3 | 0.457 | +| newstest2010-ende.en.de | 15.7 | 0.449 | +| newstest2010-enfr.en.fr | 22.4 | 0.506 | +| newstest2010-frde.fr.de | 13.9 | 0.432 | +| newstest2010-fren.fr.en | 22.5 | 0.507 | +| newstest2011-deen.de.en | 17.9 | 0.463 | +| newstest2011-defr.de.fr | 16.5 | 0.444 | +| newstest2011-ende.en.de | 14.4 | 0.435 | +| newstest2011-enfr.en.fr | 23.1 | 0.515 | +| newstest2011-frde.fr.de | 13.6 | 0.422 | +| newstest2011-fren.fr.en | 22.9 | 0.510 | +| newstest2012-deen.de.en | 19.2 | 0.469 | +| newstest2012-defr.de.fr | 16.8 | 0.445 | +| newstest2012-ende.en.de | 14.6 | 0.436 | +| newstest2012-enfr.en.fr | 22.2 | 0.505 | +| newstest2012-frde.fr.de | 14.1 | 0.422 | +| newstest2012-fren.fr.en | 23.1 | 0.506 | +| newstest2013-deen.de.en | 21.3 | 0.486 | +| newstest2013-defr.de.fr | 18.7 | 0.453 | +| newstest2013-ende.en.de | 17.8 | 0.464 | +| newstest2013-enfr.en.fr | 23.9 | 0.505 | +| newstest2013-frde.fr.de | 15.7 | 0.438 | +| newstest2013-fren.fr.en | 23.6 | 0.506 | +| newstest2014-deen-deen.de.en | 20.7 | 0.485 | +| newstest2014-fren-fren.fr.en | 25.5 | 0.537 | +| newstest2015-ende-deen.de.en | 23.2 | 0.501 | +| newstest2015-ende-ende.en.de | 21.4 | 0.493 | +| newstest2015-enfi-enfi.en.fi | 13.2 | 0.445 | +| newstest2015-enfi-fien.fi.en | 18.9 | 0.463 | +| newstest2016-ende-deen.de.en | 25.8 | 0.528 | +| newstest2016-ende-ende.en.de | 22.5 | 0.505 | +| newstest2016-enfi-enfi.en.fi | 14.0 | 0.446 | +| newstest2016-enfi-fien.fi.en | 19.4 | 0.478 | +| newstest2017-ende-deen.de.en | 24.4 | 0.511 | +| newstest2017-ende-ende.en.de | 20.0 | 0.486 | +| newstest2017-enfi-enfi.en.fi | 15.6 | 0.466 | +| newstest2017-enfi-fien.fi.en | 21.7 | 0.496 | +| newstest2018-ende-deen.de.en | 28.4 | 0.542 | +| newstest2018-ende-ende.en.de | 26.6 | 0.537 | +| newstest2018-enfi-enfi.en.fi | 11.1 | 0.421 | +| newstest2018-enfi-fien.fi.en | 16.1 | 0.438 | +| newstest2019-deen-deen.de.en | 22.5 | 0.499 | +| newstest2019-defr-defr.de.fr | 17.0 | 0.460 | +| newstest2019-ende-ende.en.de | 25.3 | 0.523 | +| newstest2019-enfi-enfi.en.fi | 14.1 | 0.438 | +| newstest2019-fien-fien.fi.en | 19.7 | 0.470 | +| newstest2019-frde-frde.fr.de | 14.9 | 0.454 | +| newstestB2016-enfi-enfi.en.fi | 11.2 | 0.422 | +| newstestB2016-enfi-fien.fi.en | 15.9 | 0.443 | +| newstestB2017-enfi-enfi.en.fi | 12.7 | 0.437 | +| newstestB2017-enfi-fien.fi.en | 18.6 | 0.467 | +| newstestB2017-fien-fien.fi.en | 18.6 | 0.467 | +| opus-test.multi.multi | 24.1 | 0.439 | +| simplification-enen.en.en | 49.6 | 0.714 | + diff --git a/models/en-chr/README.md b/models/en-chr/README.md new file mode 100644 index 00000000..a7edb468 --- /dev/null +++ b/models/en-chr/README.md @@ -0,0 +1,35 @@ +# opus-2020-05-23.zip + +* dataset: opus +* model: transformer-align +* source language(s): en +* target language(s): chr +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm1k,spm1k) +* download: [opus-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-chr/opus-2020-05-23.zip) +* test set translations: [opus-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-chr/opus-2020-05-23.test.txt) +* test set scores: [opus-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-chr/opus-2020-05-23.eval.txt) + +## Training data: opus+bt + +* en-chr: Tatoeba (22) Ubuntu (6) wikimedia (5) +* en-chr: total size = 33 +* unused dev/test data is added to training data +* total size (opus+bt): 10938 + + +## Validation data + +* chr-en: bible-uedin, 15905 +* total size of shuffled dev data: 15905 + +* devset = top 2500 lines of bible-uedin.src.shuffled! +* testset = next 2500 lines of bible-uedin.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| bible-uedin.en.chr | 44.6 | 0.569 | + diff --git a/models/en-cjp/README.md b/models/en-cjp/README.md new file mode 100644 index 00000000..6585809a --- /dev/null +++ b/models/en-cjp/README.md @@ -0,0 +1,33 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): cjp +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm1k,spm1k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-cjp/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cjp/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cjp/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* unused dev/test data is added to training data +* total size (opus+bt): 10750 + + +## Validation data + +* cjp-en: bible-uedin, 15749 +* total size of shuffled dev data: 15749 + +* devset = top 2500 lines of bible-uedin.src.shuffled! +* testset = next 2500 lines of bible-uedin.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| bible-uedin.en.cjp | 31.8 | 0.535 | + diff --git a/models/en-cop/README.md b/models/en-cop/README.md new file mode 100644 index 00000000..b50868c1 --- /dev/null +++ b/models/en-cop/README.md @@ -0,0 +1,33 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): cop +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm1k,spm1k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-cop/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cop/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-cop/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* unused dev/test data is added to training data +* total size (opus+bt): 10896 + + +## Validation data + +* cop-en: bible-uedin, 15901 +* total size of shuffled dev data: 15901 + +* devset = top 2500 lines of bible-uedin.src.shuffled! +* testset = next 2500 lines of bible-uedin.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| bible-uedin.en.cop | 45.6 | 0.651 | + diff --git a/models/en-dop/README.md b/models/en-dop/README.md new file mode 100644 index 00000000..a716cc11 --- /dev/null +++ b/models/en-dop/README.md @@ -0,0 +1,32 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): dop +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm1k,spm1k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-dop/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-dop/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-dop/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* unused dev/test data is added to training data +* total size (opus+bt): 10738 + + +## Validation data + +* dop-en: bible-uedin + +* devset = top 2500 lines of bible-uedin.src.shuffled! +* testset = next 2500 lines of bible-uedin.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| bible-uedin.en.dop | 26.0 | 0.480 | + diff --git a/models/en-quw/README.md b/models/en-quw/README.md new file mode 100644 index 00000000..feb30873 --- /dev/null +++ b/models/en-quw/README.md @@ -0,0 +1,32 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): quw +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm1k,spm1k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-quw/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-quw/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-quw/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* unused dev/test data is added to training data +* total size (opus+bt): 10380 + + +## Validation data + +* en-quw: bible-uedin + +* devset = top 2500 lines of bible-uedin.src.shuffled! +* testset = next 2500 lines of bible-uedin.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| bible-uedin.en.quw | 39.5 | 0.618 | + diff --git a/models/en-syr/README.md b/models/en-syr/README.md new file mode 100644 index 00000000..abb3451c --- /dev/null +++ b/models/en-syr/README.md @@ -0,0 +1,35 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): syr +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm1k,spm1k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-syr/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-syr/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-syr/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* en-syr: Ubuntu (70) +* en-syr: total size = 70 +* unused dev/test data is added to training data +* total size (opus+bt): 10970 + + +## Validation data + +* en-syr: bible-uedin, 15899 +* total size of shuffled dev data: 15899 + +* devset = top 2500 lines of bible-uedin.src.shuffled! +* testset = next 2500 lines of bible-uedin.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| bible-uedin.en.syr | 49.6 | 0.681 | + diff --git a/models/en-ve/README.md b/models/en-ve/README.md new file mode 100644 index 00000000..03fe889b --- /dev/null +++ b/models/en-ve/README.md @@ -0,0 +1,35 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): ve +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm4k,spm4k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ve/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ve/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ve/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* en-ve: Ubuntu (4) wikimedia (2) +* en-ve: total size = 6 +* unused dev/test data is added to training data +* total size (opus+bt): 208084 + + +## Validation data + +* en-ve: JW300, 213086 +* total size of shuffled dev data: 213086 + +* devset = top 2500 lines of JW300.src.shuffled! +* testset = next 2500 lines of JW300.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| JW300.en.ve | 40.5 | 0.616 | + diff --git a/models/en-wls/README.md b/models/en-wls/README.md new file mode 100644 index 00000000..0927f942 --- /dev/null +++ b/models/en-wls/README.md @@ -0,0 +1,32 @@ +# opus-2020-05-23.zip + +* dataset: opus +* model: transformer-align +* source language(s): en +* target language(s): wls +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm4k,spm4k) +* download: [opus-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-wls/opus-2020-05-23.zip) +* test set translations: [opus-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-wls/opus-2020-05-23.test.txt) +* test set scores: [opus-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-wls/opus-2020-05-23.eval.txt) + +## Training data: opus+bt + +* unused dev/test data is added to training data +* total size (opus+bt): 156156 + + +## Validation data + +* en-wls: JW300 + +* devset = top 2500 lines of JW300.src.shuffled! +* testset = next 2500 lines of JW300.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| JW300.en.wls | 35.6 | 0.565 | + diff --git a/models/en-wo/README.md b/models/en-wo/README.md new file mode 100644 index 00000000..68a66797 --- /dev/null +++ b/models/en-wo/README.md @@ -0,0 +1,34 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): wo +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm1k,spm1k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-wo/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-wo/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-wo/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* en-wo: Tatoeba (3) Ubuntu (126) +* en-wo: total size = 129 +* unused dev/test data is added to training data +* total size (opus+bt): 10931 + + +## Validation data + +* en-wo: bible-uedin + +* devset = top 2500 lines of bible-uedin.src.shuffled! +* testset = next 2500 lines of bible-uedin.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| bible-uedin.en.wo | 46.6 | 0.604 | + diff --git a/models/en-yap/README.md b/models/en-yap/README.md new file mode 100644 index 00000000..70e6597e --- /dev/null +++ b/models/en-yap/README.md @@ -0,0 +1,32 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): yap +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm4k,spm4k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-yap/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-yap/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-yap/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* unused dev/test data is added to training data +* total size (opus+bt): 120218 + + +## Validation data + +* en-yap: JW300 + +* devset = top 2500 lines of JW300.src.shuffled! +* testset = next 2500 lines of JW300.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| JW300.en.yap | 28.5 | 0.471 | + diff --git a/models/en-yo/README.md b/models/en-yo/README.md new file mode 100644 index 00000000..aa977125 --- /dev/null +++ b/models/en-yo/README.md @@ -0,0 +1,34 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): yo +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm4k,spm4k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-yo/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-yo/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-yo/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* en-yo: GNOME (3928) Tatoeba (31) Ubuntu (76) +* en-yo: total size = 4035 +* unused dev/test data is added to training data +* total size (opus+bt): 435278 + + +## Validation data + +* en-yo: JW300 + +* devset = top 2500 lines of JW300.src.shuffled! +* testset = next 2500 lines of JW300.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| JW300.en.yo | 35.2 | 0.515 | + diff --git a/models/en-zne/README.md b/models/en-zne/README.md new file mode 100644 index 00000000..5ba14fdd --- /dev/null +++ b/models/en-zne/README.md @@ -0,0 +1,33 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): en +* target language(s): zne +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm4k,spm4k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-zne/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-zne/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-zne/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* unused dev/test data is added to training data +* total size (opus+bt): 184853 + + +## Validation data + +* en-zne: JW300, 189924 +* total size of shuffled dev data: 189924 + +* devset = top 2500 lines of JW300.src.shuffled! +* testset = next 2500 lines of JW300.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| JW300.en.zne | 32.0 | 0.556 | + diff --git a/models/fa-fi/README.md b/models/fa-fi/README.md new file mode 100644 index 00000000..e5f16f4d --- /dev/null +++ b/models/fa-fi/README.md @@ -0,0 +1,36 @@ +# opus-2020-05-20.zip + +* dataset: opus +* model: transformer-align +* source language(s): fa +* target language(s): fi +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus-2020-05-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus-2020-05-20.zip) +* test set translations: [opus-2020-05-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus-2020-05-20.test.txt) +* test set scores: [opus-2020-05-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus-2020-05-20.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| JW300.fa.fi | 17.5 | 0.407 | + +# opus+bt+thl-2020-05-20.zip + +* dataset: opus+bt+thl +* model: transformer +* source language(s): fa +* target language(s): fi +* model: transformer +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus+bt+thl-2020-05-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus+bt+thl-2020-05-20.zip) +* test set translations: [opus+bt+thl-2020-05-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus+bt+thl-2020-05-20.test.txt) +* test set scores: [opus+bt+thl-2020-05-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fa-fi/opus+bt+thl-2020-05-20.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| infopankki.fa.fi | 57.9 | 0.773 | + diff --git a/models/fi+sv-en/README.md b/models/fi+sv-en/README.md new file mode 100644 index 00000000..e759916d --- /dev/null +++ b/models/fi+sv-en/README.md @@ -0,0 +1,47 @@ +# opus-2020-10-10.zip + +* dataset: opus +* model: transformer +* source language(s): fi sv +* target language(s): en +* model: transformer +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus-2020-10-10.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi+sv-en/opus-2020-10-10.zip) +* test set translations: [opus-2020-10-10.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi+sv-en/opus-2020-10-10.test.txt) +* test set scores: [opus-2020-10-10.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi+sv-en/opus-2020-10-10.eval.txt) + +## Training data: opus + +* fi-en: Books (3618) DGT (4887825) ECB (139089) ELRA-W0217 (9772) ELRA-W0220 (53703) ELRA-W0305 (15) ELRC_2922 (312) ELRC_2923 (394) ELRC_3382 (3357) ELRC_416 (696) EMEA (875550) EUbookshop (2027241) EUconst (7220) Europarl (1954995) GNOME (59709) JRC-Acquis (15927) JW300 (2001165) KDE4 (90150) OpenSubtitles (26457741) PHP (24293) ParaCrawl (3089564) QED (98509) TildeMODEL (2983582) Ubuntu (7470) bible-uedin (61917) infopankki (84378) +* fi-en: total size = 44938192 +* sv-en: Books (3047) DGT (4780207) ELRA-W0130 (2170) ELRA-W0213 (1924) ELRA-W0222 (6560) ELRA-W0239 (8265) ELRA-W0305 (1132) ELRC_2922 (499) ELRC_2923 (492) ELRC_3382 (3738) ELRC_416 (1062) EMEA (840417) EUbookshop (1885825) EUconst (7010) Europarl (1870175) GNOME (126) GlobalVoices (8012) JRC-Acquis (666453) JW300 (1641702) KDE4 (190266) OpenSubtitles (16169056) PHP (18420) ParaCrawl (6000734) QED (161764) RF (174) Tanzil (126202) TildeMODEL (3102585) Ubuntu (5678) WikiSource (32427) bible-uedin (61205) infopankki (51688) +* sv-en: total size = 37649015 +* unused dev/test data is added to training data +* total size (opus): 82379895 + + +## Validation data + +* en-fi: Tatoeba, 78868 +* en-sv: Tatoeba, 24256 +* total size of shuffled dev data: 103084 + +* devset = top 2500 lines of opus-dev.src.shuffled! +* testset = next 2500 lines of opus-dev.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| newsdev2015-enfi-fien.fi.en | 25.5 | 0.536 | +| newstest2015-enfi-fien.fi.en | 26.9 | 0.545 | +| newstest2016-enfi-fien.fi.en | 28.6 | 0.567 | +| newstest2017-enfi-fien.fi.en | 31.9 | 0.589 | +| newstest2018-enfi-fien.fi.en | 23.4 | 0.514 | +| newstest2019-fien-fien.fi.en | 28.5 | 0.560 | +| newstestB2016-enfi-fien.fi.en | 23.8 | 0.523 | +| newstestB2017-enfi-fien.fi.en | 27.2 | 0.554 | +| newstestB2017-fien-fien.fi.en | 27.2 | 0.554 | +| opus-test.multi.en | 58.2 | 0.721 | + diff --git a/models/fi-ar/README.md b/models/fi-ar/README.md new file mode 100644 index 00000000..7a8691c2 --- /dev/null +++ b/models/fi-ar/README.md @@ -0,0 +1,18 @@ +# opus-2020-05-16.zip + +* dataset: opus +* model: transformer-align +* source language(s): fi +* target language(s): ar +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-ar/opus-2020-05-16.zip) +* test set translations: [opus-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-ar/opus-2020-05-16.test.txt) +* test set scores: [opus-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-ar/opus-2020-05-16.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| infopankki.fi.ar | 42.9 | 0.674 | + diff --git a/models/fi-fa/README.md b/models/fi-fa/README.md new file mode 100644 index 00000000..7280dde0 --- /dev/null +++ b/models/fi-fa/README.md @@ -0,0 +1,25 @@ +# opus+bt-2020-05-20.zip + +* dataset: opus+bt +* model: transformer +* source language(s): fi +* target language(s): fa +* model: transformer +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus+bt-2020-05-20.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-fa/opus+bt-2020-05-20.zip) +* test set translations: [opus+bt-2020-05-20.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-fa/opus+bt-2020-05-20.test.txt) +* test set scores: [opus+bt-2020-05-20.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-fa/opus+bt-2020-05-20.eval.txt) + +## Training data: opus+bt + +* fa-fi: GNOME JW300 KDE4 OpenSubtitles QED Ubuntu wikimedia +* fa-fi backtranslations: backtranslate/fa-fi/latest/wiki.aa.fa-fi backtranslate/fa-fi/latest/wikinews.aa.fa-fi backtranslate/fa-fi/latest/wikiquote.aa.fa-fi +* unused dev/test data is added to training data + + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| infopankki.fi.fa | 48.9 | 0.673 | + diff --git a/models/fi-pl/README.md b/models/fi-pl/README.md new file mode 100644 index 00000000..f360b170 --- /dev/null +++ b/models/fi-pl/README.md @@ -0,0 +1,18 @@ +# opus-2020-05-16.zip + +* dataset: opus +* model: transformer-align +* source language(s): fi +* target language(s): pl +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-pl/opus-2020-05-16.zip) +* test set translations: [opus-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-pl/opus-2020-05-16.test.txt) +* test set scores: [opus-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-pl/opus-2020-05-16.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| Tatoeba.fi.pl | 47.9 | 0.673 | + diff --git a/models/nb+nn+no+nb_NO+nn_NO+no_nb-en/README.md b/models/nb+nn+no+nb_NO+nn_NO+no_nb-en/README.md new file mode 100644 index 00000000..d9d21c95 --- /dev/null +++ b/models/nb+nn+no+nb_NO+nn_NO+no_nb-en/README.md @@ -0,0 +1,50 @@ +# opus-2020-05-22.zip + +* dataset: opus +* model: transformer-align +* source language(s): nb nn no nb_NO nn_NO no_nb +* target language(s): en +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus-2020-05-22.zip](https://object.pouta.csc.fi/OPUS-MT-models/nb+nn+no+nb_NO+nn_NO+no_nb-en/opus-2020-05-22.zip) +* test set translations: [opus-2020-05-22.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/nb+nn+no+nb_NO+nn_NO+no_nb-en/opus-2020-05-22.test.txt) +* test set scores: [opus-2020-05-22.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/nb+nn+no+nb_NO+nn_NO+no_nb-en/opus-2020-05-22.eval.txt) + +## Training data: opus + +* nb-en: EUbookshop (27499) GNOME (116) KDE4 (93556) QED (124570) Ubuntu (3294) +* nb-en: total size = 249035 +* nn-en: GNOME (367687) KDE4 (76910) QED (3569) Ubuntu (38767) wikimedia (383) +* nn-en: total size = 487316 +* no-en: bible-uedin (61093) Books (3412) GNOME (7124) OpenSubtitles (8071047) Tanzil (134647) TildeMODEL (325194) Ubuntu (7925) wikimedia (66) +* no-en: total size = 8610508 +* nb_NO-en: GNOME (1) +* nb_NO-en: total size = 1 +* nn_NO-en: GNOME (1) +* nn_NO-en: total size = 1 +* no_nb-en: GNOME (20) +* no_nb-en: total size = 20 +* unused dev/test data is added to training data +* total size (opus): 11179800 + + +## Validation data + +* en-nb: Tatoeba, 9282 +* en-nn: Tatoeba, 940 +* en-no: JW300, 1837668 +* en-nb_NO: bible-uedin, 0 +* en-nn_NO: bible-uedin, 0 +* en-no_nb: bible-uedin, 0 +* total size of shuffled dev data: 1847890 + +* devset = top 5000 lines of opus-dev.src.shuffled! +* testset = next 5000 lines of opus-dev.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| opus-test.nb.en | 44.7 | 0.623 | + diff --git a/models/pl-fi/README.md b/models/pl-fi/README.md new file mode 100644 index 00000000..0900fbe8 --- /dev/null +++ b/models/pl-fi/README.md @@ -0,0 +1,18 @@ +# opus-2020-05-16.zip + +* dataset: opus +* model: transformer-align +* source language(s): pl +* target language(s): fi +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/pl-fi/opus-2020-05-16.zip) +* test set translations: [opus-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/pl-fi/opus-2020-05-16.test.txt) +* test set scores: [opus-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/pl-fi/opus-2020-05-16.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| Tatoeba.pl.fi | 41.0 | 0.640 | + diff --git a/models/sq-fi/README.md b/models/sq-fi/README.md new file mode 100644 index 00000000..52af0c31 --- /dev/null +++ b/models/sq-fi/README.md @@ -0,0 +1,18 @@ +# opus-2020-05-16.zip + +* dataset: opus +* model: transformer-align +* source language(s): sq +* target language(s): fi +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm32k,spm32k) +* download: [opus-2020-05-16.zip](https://object.pouta.csc.fi/OPUS-MT-models/sq-fi/opus-2020-05-16.zip) +* test set translations: [opus-2020-05-16.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/sq-fi/opus-2020-05-16.test.txt) +* test set scores: [opus-2020-05-16.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/sq-fi/opus-2020-05-16.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| JW300.sq.fi | 30.2 | 0.556 | + diff --git a/models/zne-en/README.md b/models/zne-en/README.md new file mode 100644 index 00000000..01353b25 --- /dev/null +++ b/models/zne-en/README.md @@ -0,0 +1,35 @@ +# opus+bt-2020-05-23.zip + +* dataset: opus+bt +* model: transformer-align +* source language(s): zne +* target language(s): en +* model: transformer-align +* pre-processing: normalization + SentencePiece (spm4k,spm4k) +* download: [opus+bt-2020-05-23.zip](https://object.pouta.csc.fi/OPUS-MT-models/zne-en/opus+bt-2020-05-23.zip) +* test set translations: [opus+bt-2020-05-23.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/zne-en/opus+bt-2020-05-23.test.txt) +* test set scores: [opus+bt-2020-05-23.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/zne-en/opus+bt-2020-05-23.eval.txt) + +## Training data: opus+bt + +* zne-en: +* zne-en: total size = 0 +* unused dev/test data is added to training data +* total size (opus+bt): 184853 + + +## Validation data + +* en-zne: JW300, 189924 +* total size of shuffled dev data: 189924 + +* devset = top 2500 lines of JW300.src.shuffled! +* testset = next 2500 lines of JW300.src.shuffled! +* remaining lines are added to traindata + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| JW300.zne.en | 35.2 | 0.510 | +