finetuning for fi-en

This commit is contained in:
Joerg Tiedemann 2020-02-14 00:12:55 +02:00
parent 870804f4ee
commit 0e893a06e0
35 changed files with 32059 additions and 27 deletions

View File

@@ -687,12 +687,14 @@ add-to-local-mono-data:
$(LOAD_MOSES) cat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
+$(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
%.simple.gz: %.gz
$(LOAD_MOSES) zcat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
+$(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
@@ -842,7 +844,7 @@ SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
-${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.charfreq
+${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
ifeq ($(wildcard ${SPMSRCMODEL}),)
mkdir -p ${dir $@}
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
@@ -850,7 +852,8 @@ ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
else
cut -f2- -d ' ' $< | grep . | shuf > $<.text
endif
-if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
+if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
${SPM_HOME}/spm_train \
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
--character_coverage=0.9995 --hard_vocab_limit=false; \
@@ -871,11 +874,12 @@ endif
## no labels on the target language side
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
-${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_TRG}.charfreq
+${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
ifeq ($(wildcard ${SPMTRGMODEL}),)
mkdir -p ${dir $@}
grep . $< | shuf > $<.text
-if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+${MAKE} ${LOCAL_TRAIN_TRG}.charfreq
+if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \
${SPM_HOME}/spm_train \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
--character_coverage=0.9995 --hard_vocab_limit=false; \
@@ -952,11 +956,12 @@ ifneq (${SPMMODEL},${SPMTRGMONO})
endif
-${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
+${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
ifeq ($(wildcard ${SPMMODEL}),)
mkdir -p ${dir $@}
grep . $< | shuf > $<.text
-if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+${MAKE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
+if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
${SPM_HOME}/spm_train \
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
--character_coverage=0.9995 --hard_vocab_limit=false; \
@@ -992,11 +997,15 @@ endif
## python-based char-counter (seems to be the fastest version)
%.charfreq: %
-python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<', 'r').read())))" > $@
+head -10000000 $< > $<.10m
+python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
+rm -f $<.10m
## slow version
%.charfreq2: %
-sed 's/./& /g' < $< | tr ' ' "\n" | grep . |\
+head -10000000 $< |\
+sed 's/./& /g' | \
+tr ' ' "\n" | grep . |\
sort | uniq -c > $@
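The changes in this file cap the character-frequency count at the first 10 million lines and build the `.charfreq` file on demand through a recursive `${MAKE}` call instead of listing it as a prerequisite; the number of distinct characters then decides how much character coverage SentencePiece is allowed to give up. A minimal shell sketch of that logic, where the file names, vocabulary size and the full-coverage fallback (1.0 for small alphabets) are assumptions rather than the Makefile's literal settings:

```bash
#!/bin/bash
# Sketch: choose SentencePiece character coverage from the corpus alphabet size.
# CORPUS, MODEL_PREFIX and VOCAB_SIZE are placeholders; the 1.0 fallback is an
# assumption (the diff only shows the 0.9995 branch for large alphabets).
CORPUS=train.src
MODEL_PREFIX=opus.src.spm32k-model
VOCAB_SIZE=32000

# count distinct characters in (at most) the first 10M lines, as in %.charfreq
NCHARS=$(head -10000000 "$CORPUS" | \
         python -c "import sys, collections; print(len(collections.Counter(sys.stdin.read())))")

if [ "$NCHARS" -gt 1000 ]; then
    COVERAGE=0.9995     # large alphabet: let SentencePiece drop very rare characters
else
    COVERAGE=1.0        # assumed fallback: keep full coverage for small alphabets
fi

spm_train --model_prefix="$MODEL_PREFIX" --vocab_size="$VOCAB_SIZE" \
          --input="$CORPUS" --character_coverage="$COVERAGE" --hard_vocab_limit=false
```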

View File

@@ -84,6 +84,7 @@ ifeq (${wildcard ${BT_MODEL_START}},)
cp ${MODEL_VOCAB} ${BT_MODEL_VOCAB}
endif
endif
+rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=opus+bt \
CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
@@ -94,6 +95,7 @@ endif
## run a multigpu job (2 or 4 GPUs)
%-multigpu %-0123:
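The first hunk above deletes the `train.submit` marker before relaunching training on the combined OPUS + back-translation data; presumably the marker records that a cluster job has already been submitted, so removing it lets the recursive `${MAKE}` queue a new one. A small sketch of that marker-file pattern (the work directory layout and the `sbatch` call are illustrative assumptions, not taken from the Makefile):

```bash
#!/bin/bash
# Sketch of a submit-marker guard: only submit a new training job if no marker exists.
WORKDIR=work/fi-en              # assumed ${WORKHOME}/${LANGPAIRSTR} layout
MARKER=$WORKDIR/train.submit

if [ ! -e "$MARKER" ]; then
    sbatch train-job.sh         # illustrative submission command
    touch "$MARKER"             # remember that a job is already queued
else
    echo "training job already submitted; remove $MARKER to resubmit"
fi
```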

View File

@@ -275,13 +275,16 @@ icelandic:
${MAKE} SRCLANGS=is TRGLANGS="da no nn nb sv" bilingual
${MAKE} SRCLANGS=is TRGLANGS=fi bilingual
+## include yandex data in training
enru-yandex:
-${MAKE} DATASET=opus+yandex SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex data
-${MAKE} DATASET=opus+yandex SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex reverse-data
-${MAKE} DATASET=opus+yandex SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
-WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 train.submit-multigpu
-${MAKE} DATASET=opus+yandex SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex \
-WALLTIME=72 HPC_CORES=1 HPC_MEM=4g train.submit-multigpu
+${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex data
+${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex reverse-data
+${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
+WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train.submit-multigpu
+${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex \
+WALLTIME=72 HPC_CORES=1 HPC_MEM=4g MARIAN_EARLY_STOPPING=15 train.submit-multigpu
enit:

View File

@@ -29,7 +29,7 @@ MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
ifeq (${MODELNAME},)
-MODELHOME = ../work-filter/models/${LANGPAIR}
+MODELHOME = ../work-langid/models/${LANGPAIR}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
@@ -128,6 +128,7 @@ print-names:
echo ${LANGNAME}
echo ${UDPIPE_MODEL}
echo ${WIKI_JSON}
+echo ${MODELNAME}
## fetch the latest model
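The `MODELZIP`/`MODELNAME` definitions pick the most recent dated release by wildcard-expanding the model directory, sorting the matches and taking the last one. A rough shell equivalent of that Make idiom (the directory comes from the hunk above; the concrete language pair is an assumption):

```bash
#!/bin/bash
# Pick the newest "opus-YYYY-MM-DD.zip" release in the model directory,
# mirroring ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}.
MODELHOME=../work-langid/models/fi-en        # ${LANGPAIR} assumed to be fi-en here

MODELZIP=$(ls "$MODELHOME"/opus-20*.zip 2>/dev/null | sort | tail -n 1)
MODELNAME=$(basename "$MODELZIP" .zip)       # ${patsubst %.zip,%,${notdir ...}} equivalent

echo "$MODELNAME"
```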

View File

@@ -66,7 +66,7 @@ MARIAN_EARLY_STOPPING = 5
.PHONY: all
-all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
+all: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
@@ -250,11 +250,13 @@ ${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
cp ${BASEMODELZIP} ${dir $@}
cd ${dir $@} && unzip -u *.zip
+.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
zcat $< |\
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
gzip -c > $@
+.INTERMEDIATE: ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
zcat $< |\
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
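Marking the `.pre.gz` files as `.INTERMEDIATE` lets GNU make delete them automatically once the fine-tuned model has been built from them. The preprocessing itself reuses the released base model's own `preprocess.sh` and SentencePiece models, so the fine-tuning data is segmented exactly like the data the base model was trained on. A minimal sketch of that step outside make, where the `fi-en`/`opus-2020-02-13` paths and file names are assumptions for illustration:

```bash
#!/bin/bash
# Preprocess raw fine-tuning data with the base model's own pipeline, as the
# %.pre.gz rules do. Directory and file names below are illustrative.
MODELDIR=fi-en/opus-2020-02-13

# source side through source.spm; target side through target.spm
# (the rules pass ${SRC} as the language argument for both sides)
zcat train.fi.gz | $MODELDIR/preprocess.sh fi $MODELDIR/source.spm | gzip -c > train.fi.pre.gz
zcat train.en.gz | $MODELDIR/preprocess.sh fi $MODELDIR/target.spm | gzip -c > train.en.pre.gz
```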
@@ -314,10 +316,10 @@ ${TUNED_MODEL}.done: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz $
.PHONY: translate
-translate: ${TEST_SRC}.${TRG}.gz
+translate: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.gz
## translate test set
-${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
+${TEST_SRC}.${BASEMODELNAME}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
mkdir -p ${dir $@}
${LOADMODS} && ${MARIAN}/marian-decoder \
-i $< \
@@ -329,10 +331,10 @@ ${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
.PHONY: translate-baseline
-translate-baseline: ${TEST_SRC}.baseline.${TRG}.gz
+translate-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.gz
## translate test set
-${TEST_SRC}.baseline.${TRG}.gz: ${TEST_SRC}.pre.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
+${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.gz: ${TEST_SRC}.pre.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
mkdir -p ${dir $@}
cd ${LANGPAIR}/${BASEMODELNAME}; \
${LOADMODS} && ${MARIAN}/marian-decoder \
@@ -346,18 +348,18 @@ ${TEST_SRC}.baseline.${TRG}.gz: ${TEST_SRC}.pre.gz ${LANGPAIR}/${BASEMODELNAME}/
.PHONY: eval eval-baseline
-eval: ${TEST_SRC}.${TRG}.eval
-eval-baseline: ${TEST_SRC}.baseline.${TRG}.eval
+eval: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval
+eval-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval
## without reference normalisation
-${TEST_SRC}.${TRG}.eval ${TEST_SRC}.baseline.${TRG}.eval: %.eval: %.gz ${TEST_TRG}.gz
+${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval: %.eval: %.gz ${TEST_TRG}.gz
zcat ${TEST_TRG}.gz > $@.ref
zcat $< | sacrebleu $@.ref > $@
zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
rm -f $@.ref
## with reference normalisation (should not do this)
-${TEST_SRC}.${TRG}.eval-norm ${TEST_SRC}.baseline.${TRG}.eval-norm: %.eval-norm: %.gz ${TEST_TRG}.gz
+${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval-norm ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval-norm: %.eval-norm: %.gz ${TEST_TRG}.gz
zcat ${TEST_TRG}.gz |\
${TOKENIZER}/replace-unicode-punctuation.perl |\
${TOKENIZER}/remove-non-printing-char.perl |\
@@ -371,10 +373,10 @@ ${TEST_SRC}.${TRG}.eval-norm ${TEST_SRC}.baseline.${TRG}.eval-norm: %.eval-norm:
.PHONY: compare compare-baseline
-compare: ${TEST_SRC}.${TRG}.compare
-compare-baseline: ${TEST_SRC}.baseline.${TRG}.compare
+compare: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
+compare-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
-${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare: %.compare: %.eval
+${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare: %.compare: %.eval
zcat ${TEST_SRC}.gz > $@.1
zcat ${TEST_TRG}.gz > $@.2
zcat ${<:.eval=.gz} > $@.3
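All fine-tuned and baseline outputs now carry `${BASEMODELNAME}` in their file names, so runs against different base models can live in the same directory without overwriting each other's translations and scores. The evaluation itself is plain sacrebleu, scoring the system output against the raw reference with BLEU and chrF, which is what produces the `.eval.txt` score lines added further down in this commit. A standalone sketch of that scoring step (file names are illustrative):

```bash
#!/bin/bash
# Score a gzipped system output against a gzipped reference, as the %.eval rule does.
# newstest2019.fi-en.en.gz (hypotheses) and newstest2019.ref.en.gz are illustrative names.
zcat newstest2019.ref.en.gz > ref.txt
zcat newstest2019.fi-en.en.gz | sacrebleu ref.txt > scores.txt                             # BLEU
zcat newstest2019.fi-en.en.gz | sacrebleu --metrics=chrf --width=3 ref.txt >> scores.txt   # chrF2
cat scores.txt
rm -f ref.txt
```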

File diff suppressed because it is too large

View File

@@ -0,0 +1,2 @@
+BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 22.9 54.2/29.2/17.8/11.3 (BP = 0.962 ratio = 0.963 hyp_len = 36946 ref_len = 38369)
+chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.548

File diff suppressed because it is too large

View File

@@ -0,0 +1,2 @@
+BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 25.1 56.3/31.7/19.9/13.1 (BP = 0.963 ratio = 0.964 hyp_len = 36977 ref_len = 38369)
+chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.568

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large

View File

@@ -0,0 +1,2 @@
+BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 28.5 62.4/35.5/21.9/13.8 (BP = 0.998 ratio = 0.998 hyp_len = 36157 ref_len = 36215)
+chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.564

File diff suppressed because it is too large

View File

@@ -0,0 +1,2 @@
+BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 31.4 64.7/38.5/24.7/16.2 (BP = 0.992 ratio = 0.992 hyp_len = 35930 ref_len = 36215)
+chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.583

View File

@@ -44,3 +44,26 @@
| newstestB2017-enfi.en.fi | 22.4 | 0.559 |
| Tatoeba.en.fi | 39.4 | 0.627 |
+
+# opus-2020-02-13.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-13.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.zip)
+* test set translations: [opus-2020-02-13.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.test.txt)
+* test set scores: [opus-2020-02-13.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| newsdev2015-enfi.en.fi | 21.1 | 0.552 |
+| newstest2015-enfi.en.fi | 22.7 | 0.563 |
+| newstest2016-enfi.en.fi | 24.3 | 0.572 |
+| newstest2017-enfi.en.fi | 27.1 | 0.601 |
+| newstest2018-enfi.en.fi | 18.0 | 0.529 |
+| newstest2019-enfi.en.fi | 22.9 | 0.548 |
+| newstestB2016-enfi.en.fi | 19.2 | 0.537 |
+| newstestB2017-enfi.en.fi | 22.3 | 0.560 |
+| Tatoeba.en.fi | 41.1 | 0.645 |

View File

@@ -70,3 +70,27 @@
| newstestB2017-fien.fi.en | 27.7 | 0.559 |
| Tatoeba.fi.en | 57.2 | 0.717 |
+
+# opus-2020-02-13.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-13.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.zip)
+* test set translations: [opus-2020-02-13.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.test.txt)
+* test set scores: [opus-2020-02-13.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| newsdev2015-enfi.fi.en | 25.4 | 0.538 |
+| newstest2015-enfi.fi.en | 27.1 | 0.549 |
+| newstest2016-enfi.fi.en | 29.5 | 0.572 |
+| newstest2017-enfi.fi.en | 33.1 | 0.598 |
+| newstest2018-enfi.fi.en | 24.0 | 0.519 |
+| newstest2019-fien.fi.en | 28.9 | 0.566 |
+| newstestB2016-enfi.fi.en | 24.5 | 0.527 |
+| newstestB2017-enfi.fi.en | 27.9 | 0.560 |
+| newstestB2017-fien.fi.en | 27.9 | 0.560 |
+| Tatoeba.fi.en | 57.4 | 0.718 |
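The new model-card section gives the download location of the fine-tuned fi-en release. A quick way to fetch and unpack it for local decoding (the local directory name is just a suggestion; `wget` and `unzip` usage is standard, not part of this repository's tooling):

```bash
#!/bin/bash
# Download and unpack the fi-en release added in this commit.
mkdir -p fi-en/opus-2020-02-13
cd fi-en/opus-2020-02-13
wget https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.zip
unzip opus-2020-02-13.zip   # contains the Marian model, SentencePiece models and preprocess scripts
```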

View File

@@ -14,3 +14,19 @@
| fiskmo_testset.sv.fi | 26.4 | 0.616 |
| Tatoeba.sv.fi | 44.7 | 0.667 |
+
+# opus-2020-02-13.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-13.zip](https://object.pouta.csc.fi/OPUS-MT-models/sv-fi/opus-2020-02-13.zip)
+* test set translations: [opus-2020-02-13.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/sv-fi/opus-2020-02-13.test.txt)
+* test set scores: [opus-2020-02-13.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/sv-fi/opus-2020-02-13.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| fiskmo_testset.sv.fi | 26.1 | 0.613 |
+| Tatoeba.sv.fi | 44.8 | 0.673 |