mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
finetuning for fi-en
This commit is contained in:
parent
870804f4ee
commit
0e893a06e0
@ -687,12 +687,14 @@ add-to-local-mono-data:
|
|||||||
$(LOAD_MOSES) cat $< |\
|
$(LOAD_MOSES) cat $< |\
|
||||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||||
|
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@
|
||||||
|
|
||||||
%.simple.gz: %.gz
|
%.simple.gz: %.gz
|
||||||
$(LOAD_MOSES) zcat $< |\
|
$(LOAD_MOSES) zcat $< |\
|
||||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||||
|
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
||||||
|
|
||||||
|
|
||||||
@ -842,7 +844,7 @@ SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
|
|||||||
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
|
||||||
|
|
||||||
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
# ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
||||||
${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.charfreq
|
${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
|
||||||
ifeq ($(wildcard ${SPMSRCMODEL}),)
|
ifeq ($(wildcard ${SPMSRCMODEL}),)
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
|
ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
|
||||||
@ -850,7 +852,8 @@ ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
|
|||||||
else
|
else
|
||||||
cut -f2- -d ' ' $< | grep . | shuf > $<.text
|
cut -f2- -d ' ' $< | grep . | shuf > $<.text
|
||||||
endif
|
endif
|
||||||
if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
|
${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
|
||||||
|
if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
|
||||||
${SPM_HOME}/spm_train \
|
${SPM_HOME}/spm_train \
|
||||||
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
|
--model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
|
||||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||||
@ -871,11 +874,12 @@ endif
|
|||||||
|
|
||||||
## no labels on the target language side
|
## no labels on the target language side
|
||||||
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
# ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
|
||||||
${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_TRG}.charfreq
|
${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
|
||||||
ifeq ($(wildcard ${SPMTRGMODEL}),)
|
ifeq ($(wildcard ${SPMTRGMODEL}),)
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
grep . $< | shuf > $<.text
|
grep . $< | shuf > $<.text
|
||||||
if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
|
${MAKE} ${LOCAL_TRAIN_TRG}.charfreq
|
||||||
|
if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \
|
||||||
${SPM_HOME}/spm_train \
|
${SPM_HOME}/spm_train \
|
||||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||||
@ -952,11 +956,12 @@ ifneq (${SPMMODEL},${SPMTRGMONO})
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
|
${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
|
||||||
ifeq ($(wildcard ${SPMMODEL}),)
|
ifeq ($(wildcard ${SPMMODEL}),)
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
grep . $< | shuf > $<.text
|
grep . $< | shuf > $<.text
|
||||||
if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
|
${MAKE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
|
||||||
|
if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
|
||||||
${SPM_HOME}/spm_train \
|
${SPM_HOME}/spm_train \
|
||||||
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
--model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
|
||||||
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
--character_coverage=0.9995 --hard_vocab_limit=false; \
|
||||||
@ -992,11 +997,15 @@ endif
|
|||||||
|
|
||||||
## python-based char-counter (seems to be the fastest version)
|
## python-based char-counter (seems to be the fastest version)
|
||||||
%.charfreq: %
|
%.charfreq: %
|
||||||
python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<', 'r').read())))" > $@
|
head -10000000 $< > $<.10m
|
||||||
|
python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
|
||||||
|
rm -f $<.10m
|
||||||
|
|
||||||
## slow version
|
## slow version
|
||||||
%.charfreq2: %
|
%.charfreq2: %
|
||||||
sed 's/./& /g' < $< | tr ' ' "\n" | grep . |\
|
head -10000000 $< |\
|
||||||
|
sed 's/./& /g' | \
|
||||||
|
tr ' ' "\n" | grep . |\
|
||||||
sort | uniq -c > $@
|
sort | uniq -c > $@
|
||||||
|
|
||||||
|
|
||||||
|
@ -84,6 +84,7 @@ ifeq (${wildcard ${BT_MODEL_START}},)
|
|||||||
cp ${MODEL_VOCAB} ${BT_MODEL_VOCAB}
|
cp ${MODEL_VOCAB} ${BT_MODEL_VOCAB}
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
|
||||||
${MAKE} DATASET=opus+bt \
|
${MAKE} DATASET=opus+bt \
|
||||||
CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
|
CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
|
||||||
CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
|
CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
|
||||||
@ -94,6 +95,7 @@ endif
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## run a multigpu job (2 or 4 GPUs)
|
## run a multigpu job (2 or 4 GPUs)
|
||||||
|
|
||||||
%-multigpu %-0123:
|
%-multigpu %-0123:
|
||||||
|
@ -275,13 +275,16 @@ icelandic:
|
|||||||
${MAKE} SRCLANGS=is TRGLANGS="da no nn nb sv" bilingual
|
${MAKE} SRCLANGS=is TRGLANGS="da no nn nb sv" bilingual
|
||||||
${MAKE} SRCLANGS=is TRGLANGS=fi bilingual
|
${MAKE} SRCLANGS=is TRGLANGS=fi bilingual
|
||||||
|
|
||||||
|
|
||||||
|
## include yandex data in training
|
||||||
|
|
||||||
enru-yandex:
|
enru-yandex:
|
||||||
${MAKE} DATASET=opus+yandex SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex data
|
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex data
|
||||||
${MAKE} DATASET=opus+yandex SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex reverse-data
|
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex reverse-data
|
||||||
${MAKE} DATASET=opus+yandex SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
|
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
|
||||||
WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 train.submit-multigpu
|
WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train.submit-multigpu
|
||||||
${MAKE} DATASET=opus+yandex SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex \
|
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex \
|
||||||
WALLTIME=72 HPC_CORES=1 HPC_MEM=4g train.submit-multigpu
|
WALLTIME=72 HPC_CORES=1 HPC_MEM=4g MARIAN_EARLY_STOPPING=15 train.submit-multigpu
|
||||||
|
|
||||||
|
|
||||||
enit:
|
enit:
|
||||||
|
@ -29,7 +29,7 @@ MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
|||||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||||
|
|
||||||
ifeq (${MODELNAME},)
|
ifeq (${MODELNAME},)
|
||||||
MODELHOME = ../work-filter/models/${LANGPAIR}
|
MODELHOME = ../work-langid/models/${LANGPAIR}
|
||||||
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
||||||
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
|
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
|
||||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||||
@ -128,6 +128,7 @@ print-names:
|
|||||||
echo ${LANGNAME}
|
echo ${LANGNAME}
|
||||||
echo ${UDPIPE_MODEL}
|
echo ${UDPIPE_MODEL}
|
||||||
echo ${WIKI_JSON}
|
echo ${WIKI_JSON}
|
||||||
|
echo ${MODELNAME}
|
||||||
|
|
||||||
|
|
||||||
## fetch the latest model
|
## fetch the latest model
|
||||||
|
@ -66,7 +66,7 @@ MARIAN_EARLY_STOPPING = 5
|
|||||||
|
|
||||||
|
|
||||||
.PHONY: all
|
.PHONY: all
|
||||||
all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
|
all: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -250,11 +250,13 @@ ${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
|
|||||||
cp ${BASEMODELZIP} ${dir $@}
|
cp ${BASEMODELZIP} ${dir $@}
|
||||||
cd ${dir $@} && unzip -u *.zip
|
cd ${dir $@} && unzip -u *.zip
|
||||||
|
|
||||||
|
.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
|
||||||
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||||
zcat $< |\
|
zcat $< |\
|
||||||
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
|
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
|
||||||
gzip -c > $@
|
gzip -c > $@
|
||||||
|
|
||||||
|
.INTERMEDIATE: ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz
|
||||||
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||||
zcat $< |\
|
zcat $< |\
|
||||||
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
|
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
|
||||||
@ -314,10 +316,10 @@ ${TUNED_MODEL}.done: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz $
|
|||||||
|
|
||||||
|
|
||||||
.PHONY: translate
|
.PHONY: translate
|
||||||
translate: ${TEST_SRC}.${TRG}.gz
|
translate: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.gz
|
||||||
|
|
||||||
## translate test set
|
## translate test set
|
||||||
${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
|
${TEST_SRC}.${BASEMODELNAME}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
${LOADMODS} && ${MARIAN}/marian-decoder \
|
${LOADMODS} && ${MARIAN}/marian-decoder \
|
||||||
-i $< \
|
-i $< \
|
||||||
@ -329,10 +331,10 @@ ${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
|
|||||||
|
|
||||||
|
|
||||||
.PHONY: translate-baseline
|
.PHONY: translate-baseline
|
||||||
translate-baseline: ${TEST_SRC}.baseline.${TRG}.gz
|
translate-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.gz
|
||||||
|
|
||||||
## translate test set
|
## translate test set
|
||||||
${TEST_SRC}.baseline.${TRG}.gz: ${TEST_SRC}.pre.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.gz: ${TEST_SRC}.pre.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
cd ${LANGPAIR}/${BASEMODELNAME}; \
|
cd ${LANGPAIR}/${BASEMODELNAME}; \
|
||||||
${LOADMODS} && ${MARIAN}/marian-decoder \
|
${LOADMODS} && ${MARIAN}/marian-decoder \
|
||||||
@ -346,18 +348,18 @@ ${TEST_SRC}.baseline.${TRG}.gz: ${TEST_SRC}.pre.gz ${LANGPAIR}/${BASEMODELNAME}/
|
|||||||
|
|
||||||
|
|
||||||
.PHONY: eval eval-baseline
|
.PHONY: eval eval-baseline
|
||||||
eval: ${TEST_SRC}.${TRG}.eval
|
eval: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval
|
||||||
eval-baseline: ${TEST_SRC}.baseline.${TRG}.eval
|
eval-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval
|
||||||
|
|
||||||
## without reference normalisation
|
## without reference normalisation
|
||||||
${TEST_SRC}.${TRG}.eval ${TEST_SRC}.baseline.${TRG}.eval: %.eval: %.gz ${TEST_TRG}.gz
|
${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval: %.eval: %.gz ${TEST_TRG}.gz
|
||||||
zcat ${TEST_TRG}.gz > $@.ref
|
zcat ${TEST_TRG}.gz > $@.ref
|
||||||
zcat $< | sacrebleu $@.ref > $@
|
zcat $< | sacrebleu $@.ref > $@
|
||||||
zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
|
zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
|
||||||
rm -f $@.ref
|
rm -f $@.ref
|
||||||
|
|
||||||
## with reference normalisation (should not do this)
|
## with reference normalisation (should not do this)
|
||||||
${TEST_SRC}.${TRG}.eval-norm ${TEST_SRC}.baseline.${TRG}.eval-norm: %.eval-norm: %.gz ${TEST_TRG}.gz
|
${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval-norm ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval-norm: %.eval-norm: %.gz ${TEST_TRG}.gz
|
||||||
zcat ${TEST_TRG}.gz |\
|
zcat ${TEST_TRG}.gz |\
|
||||||
${TOKENIZER}/replace-unicode-punctuation.perl |\
|
${TOKENIZER}/replace-unicode-punctuation.perl |\
|
||||||
${TOKENIZER}/remove-non-printing-char.perl |\
|
${TOKENIZER}/remove-non-printing-char.perl |\
|
||||||
@ -371,10 +373,10 @@ ${TEST_SRC}.${TRG}.eval-norm ${TEST_SRC}.baseline.${TRG}.eval-norm: %.eval-norm:
|
|||||||
|
|
||||||
|
|
||||||
.PHONY: compare compare-baseline
|
.PHONY: compare compare-baseline
|
||||||
compare: ${TEST_SRC}.${TRG}.compare
|
compare: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
|
||||||
compare-baseline: ${TEST_SRC}.baseline.${TRG}.compare
|
compare-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
|
||||||
|
|
||||||
${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare: %.compare: %.eval
|
${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare: %.compare: %.eval
|
||||||
zcat ${TEST_SRC}.gz > $@.1
|
zcat ${TEST_SRC}.gz > $@.1
|
||||||
zcat ${TEST_TRG}.gz > $@.2
|
zcat ${TEST_TRG}.gz > $@.2
|
||||||
zcat ${<:.eval=.gz} > $@.3
|
zcat ${<:.eval=.gz} > $@.3
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,2 @@
|
|||||||
|
BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 22.9 54.2/29.2/17.8/11.3 (BP = 0.962 ratio = 0.963 hyp_len = 36946 ref_len = 38369)
|
||||||
|
chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.548
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,2 @@
|
|||||||
|
BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 25.1 56.3/31.7/19.9/13.1 (BP = 0.963 ratio = 0.964 hyp_len = 36977 ref_len = 38369)
|
||||||
|
chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.568
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
finetune/fi-en/news/test/newstest2019-fien.en.gz
Normal file
BIN
finetune/fi-en/news/test/newstest2019-fien.en.gz
Normal file
Binary file not shown.
BIN
finetune/fi-en/news/test/newstest2019-fien.fi.gz
Normal file
BIN
finetune/fi-en/news/test/newstest2019-fien.fi.gz
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,2 @@
|
|||||||
|
BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 28.5 62.4/35.5/21.9/13.8 (BP = 0.998 ratio = 0.998 hyp_len = 36157 ref_len = 36215)
|
||||||
|
chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.564
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,2 @@
|
|||||||
|
BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 31.4 64.7/38.5/24.7/16.2 (BP = 0.992 ratio = 0.992 hyp_len = 35930 ref_len = 36215)
|
||||||
|
chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.583
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -44,3 +44,26 @@
|
|||||||
| newstestB2017-enfi.en.fi | 22.4 | 0.559 |
|
| newstestB2017-enfi.en.fi | 22.4 | 0.559 |
|
||||||
| Tatoeba.en.fi | 39.4 | 0.627 |
|
| Tatoeba.en.fi | 39.4 | 0.627 |
|
||||||
|
|
||||||
|
# opus-2020-02-13.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-13.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.zip)
|
||||||
|
* test set translations: [opus-2020-02-13.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-13.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| newsdev2015-enfi.en.fi | 21.1 | 0.552 |
|
||||||
|
| newstest2015-enfi.en.fi | 22.7 | 0.563 |
|
||||||
|
| newstest2016-enfi.en.fi | 24.3 | 0.572 |
|
||||||
|
| newstest2017-enfi.en.fi | 27.1 | 0.601 |
|
||||||
|
| newstest2018-enfi.en.fi | 18.0 | 0.529 |
|
||||||
|
| newstest2019-enfi.en.fi | 22.9 | 0.548 |
|
||||||
|
| newstestB2016-enfi.en.fi | 19.2 | 0.537 |
|
||||||
|
| newstestB2017-enfi.en.fi | 22.3 | 0.560 |
|
||||||
|
| Tatoeba.en.fi | 41.1 | 0.645 |
|
||||||
|
|
||||||
|
@ -70,3 +70,27 @@
|
|||||||
| newstestB2017-fien.fi.en | 27.7 | 0.559 |
|
| newstestB2017-fien.fi.en | 27.7 | 0.559 |
|
||||||
| Tatoeba.fi.en | 57.2 | 0.717 |
|
| Tatoeba.fi.en | 57.2 | 0.717 |
|
||||||
|
|
||||||
|
# opus-2020-02-13.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-13.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.zip)
|
||||||
|
* test set translations: [opus-2020-02-13.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-13.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| newsdev2015-enfi.fi.en | 25.4 | 0.538 |
|
||||||
|
| newstest2015-enfi.fi.en | 27.1 | 0.549 |
|
||||||
|
| newstest2016-enfi.fi.en | 29.5 | 0.572 |
|
||||||
|
| newstest2017-enfi.fi.en | 33.1 | 0.598 |
|
||||||
|
| newstest2018-enfi.fi.en | 24.0 | 0.519 |
|
||||||
|
| newstest2019-fien.fi.en | 28.9 | 0.566 |
|
||||||
|
| newstestB2016-enfi.fi.en | 24.5 | 0.527 |
|
||||||
|
| newstestB2017-enfi.fi.en | 27.9 | 0.560 |
|
||||||
|
| newstestB2017-fien.fi.en | 27.9 | 0.560 |
|
||||||
|
| Tatoeba.fi.en | 57.4 | 0.718 |
|
||||||
|
|
||||||
|
@ -14,3 +14,19 @@
|
|||||||
| fiskmo_testset.sv.fi | 26.4 | 0.616 |
|
| fiskmo_testset.sv.fi | 26.4 | 0.616 |
|
||||||
| Tatoeba.sv.fi | 44.7 | 0.667 |
|
| Tatoeba.sv.fi | 44.7 | 0.667 |
|
||||||
|
|
||||||
|
# opus-2020-02-13.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-13.zip](https://object.pouta.csc.fi/OPUS-MT-models/sv-fi/opus-2020-02-13.zip)
|
||||||
|
* test set translations: [opus-2020-02-13.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/sv-fi/opus-2020-02-13.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-13.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/sv-fi/opus-2020-02-13.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| fiskmo_testset.sv.fi | 26.1 | 0.613 |
|
||||||
|
| Tatoeba.sv.fi | 44.8 | 0.673 |
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user