finetuning for fi-en

Joerg Tiedemann 2020-02-14 00:12:55 +02:00
parent 870804f4ee
commit 0e893a06e0
35 changed files with 32059 additions and 27 deletions

View File

@@ -687,12 +687,14 @@ add-to-local-mono-data:
 	$(LOAD_MOSES) cat $< |\
 	$(TOKENIZER)/replace-unicode-punctuation.perl |\
 	$(TOKENIZER)/remove-non-printing-char.perl |\
+	$(TOKENIZER)/deescape-special-chars.perl |\
 	sed 's/  */ /g;s/^ *//g;s/ *$$//g' > $@
 %.simple.gz: %.gz
 	$(LOAD_MOSES) zcat $< |\
 	$(TOKENIZER)/replace-unicode-punctuation.perl |\
 	$(TOKENIZER)/remove-non-printing-char.perl |\
+	$(TOKENIZER)/deescape-special-chars.perl |\
 	sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
@@ -842,7 +844,7 @@ SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
 .PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
 # ${SPMSRCMODEL}: ${WORKDIR}/%.spm${SRCBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
-${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.charfreq
+${SPMSRCMODEL}: ${LOCAL_TRAIN_SRC}
 ifeq ($(wildcard ${SPMSRCMODEL}),)
 	mkdir -p ${dir $@}
 ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
@@ -850,7 +852,8 @@ ifeq ($(TRGLANGS),${firstword ${TRGLANGS}})
 else
 	cut -f2- -d ' ' $< | grep . | shuf > $<.text
 endif
-	if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+	${MAKE} ${LOCAL_TRAIN_SRC}.charfreq
+	if [ `cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l` -gt 1000 ]; then \
 	  ${SPM_HOME}/spm_train \
 	  --model_prefix=$@ --vocab_size=$(SRCBPESIZE) --input=$<.text \
 	  --character_coverage=0.9995 --hard_vocab_limit=false; \
@@ -871,11 +874,12 @@ endif
 ## no labels on the target language side
 # ${SPMTRGMODEL}: ${WORKDIR}/%.spm${TRGBPESIZE:000=}k-model: ${TMPDIR}/${LANGPAIRSTR}/%
-${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG} ${LOCAL_TRAIN_TRG}.charfreq
+${SPMTRGMODEL}: ${LOCAL_TRAIN_TRG}
 ifeq ($(wildcard ${SPMTRGMODEL}),)
 	mkdir -p ${dir $@}
 	grep . $< | shuf > $<.text
-	if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+	${MAKE} ${LOCAL_TRAIN_TRG}.charfreq
+	if [ `cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l` -gt 1000 ]; then \
 	  ${SPM_HOME}/spm_train \
 	  --model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
 	  --character_coverage=0.9995 --hard_vocab_limit=false; \
@@ -952,11 +956,12 @@ ifneq (${SPMMODEL},${SPMTRGMONO})
 endif
-${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
+${SPMMODEL}: ${LOCAL_MONO_DATA}.${PRE}
 ifeq ($(wildcard ${SPMMODEL}),)
 	mkdir -p ${dir $@}
 	grep . $< | shuf > $<.text
-	if [ `cat ${word 2,$^} | wc -l` -gt 1000 ]; then \
+	${MAKE} ${LOCAL_MONO_DATA}.${PRE}.charfreq
+	if [ `cat ${LOCAL_MONO_DATA}.${PRE}.charfreq | wc -l` -gt 1000 ]; then \
 	  ${SPM_HOME}/spm_train \
 	  --model_prefix=$@ --vocab_size=$(TRGBPESIZE) --input=$<.text \
 	  --character_coverage=0.9995 --hard_vocab_limit=false; \
@@ -992,11 +997,15 @@ endif
 ## python-based char-counter (seems to be the fastest version)
 %.charfreq: %
-	python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<', 'r').read())))" > $@
+	head -10000000 $< > $<.10m
+	python -c "import collections, pprint; pprint.pprint(dict(collections.Counter(open('$<.10m', 'r').read())))" > $@
+	rm -f $<.10m
 ## slow version
 %.charfreq2: %
-	sed 's/./& /g' < $< | tr ' ' "\n" | grep . |\
+	head -10000000 $< |\
+	sed 's/./& /g' | \
+	tr ' ' "\n" | grep . |\
 	sort | uniq -c > $@
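
A note on the mechanism above: the `.charfreq` file ends up with roughly one distinct character per line, so its line count approximates the size of the character inventory. More than 1000 distinct characters suggests a large script (CJK and similar), which is when SentencePiece is trained with the relaxed `--character_coverage=0.9995`; the new `head -10000000` cap simply keeps the counting affordable on very large corpora. A minimal stand-alone sketch of the same logic, with placeholder file names, `spm_train` assumed to be on the PATH, and an assumed coverage of 1.0 for the small-inventory branch (the diff only shows the >1000 case):

    # count distinct characters over at most the first 10M lines (placeholder names)
    head -10000000 corpus.txt > corpus.10m
    python -c "import collections, pprint; \
      pprint.pprint(dict(collections.Counter(open('corpus.10m', 'r').read())))" > corpus.charfreq
    rm -f corpus.10m

    # >1000 distinct characters: relax the coverage; the 1.0 fallback is an assumption
    if [ `cat corpus.charfreq | wc -l` -gt 1000 ]; then coverage=0.9995; else coverage=1.0; fi
    spm_train --model_prefix=corpus.spm --vocab_size=32000 \
      --input=corpus.txt --character_coverage=$coverage --hard_vocab_limit=false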

View File

@@ -84,6 +84,7 @@ ifeq (${wildcard ${BT_MODEL_START}},)
 	cp ${MODEL_VOCAB} ${BT_MODEL_VOCAB}
 endif
 endif
+	rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
 	${MAKE} DATASET=opus+bt \
 		CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
 		CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
@@ -94,6 +95,7 @@ endif
 ## run a multigpu job (2 or 4 GPUs)
 %-multigpu %-0123:
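
The added `rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit` clears the job-submission marker before the recursive make call: the `train.submit` target evidently leaves a marker file behind so the same job is not queued twice, and without deleting it the switch to DATASET=opus+bt would silently skip resubmission. A hedged sketch of that marker pattern (the rule and scheduler command here are illustrative, not the repository's exact code):

    # illustrative submit-marker rule; sbatch stands in for whatever scheduler is used
    %.submit:
    	sbatch job-$*.sh    # queue the job
    	touch $@            # marker file: make treats the target as up to date next time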

View File

@@ -275,13 +275,16 @@ icelandic:
 	${MAKE} SRCLANGS=is TRGLANGS="da no nn nb sv" bilingual
 	${MAKE} SRCLANGS=is TRGLANGS=fi bilingual
 ## include yandex data in training
 enru-yandex:
-	${MAKE} DATASET=opus+yandex SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex data
-	${MAKE} DATASET=opus+yandex SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex reverse-data
-	${MAKE} DATASET=opus+yandex SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
-		WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 train.submit-multigpu
-	${MAKE} DATASET=opus+yandex SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex \
-		WALLTIME=72 HPC_CORES=1 HPC_MEM=4g train.submit-multigpu
+	${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex data
+	${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex reverse-data
+	${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
+		WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train.submit-multigpu
+	${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex \
+		WALLTIME=72 HPC_CORES=1 HPC_MEM=4g MARIAN_EARLY_STOPPING=15 train.submit-multigpu
 enit:
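
The rewritten target pins MODELTYPE=transformer and MARIAN_EARLY_STOPPING=15 on both directions: it prepares the combined opus+yandex data for en-ru, mirrors it for ru-en, and queues the two multi-GPU training jobs. The same thing by hand for a single direction would look roughly like this (settings copied from the recipe above):

    make DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru \
         EXTRA_TRAINSET=yandex data
    make DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru \
         EXTRA_TRAINSET=yandex WALLTIME=72 HPC_CORES=1 HPC_MEM=8g \
         MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train.submit-multigpu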

View File

@@ -29,7 +29,7 @@ MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
 MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
 ifeq (${MODELNAME},)
-  MODELHOME = ../work-filter/models/${LANGPAIR}
+  MODELHOME = ../work-langid/models/${LANGPAIR}
 # MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
   MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
   MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
@@ -128,6 +128,7 @@ print-names:
 	echo ${LANGNAME}
 	echo ${UDPIPE_MODEL}
 	echo ${WIKI_JSON}
+	echo ${MODELNAME}
 ## fetch the latest model

View File

@@ -66,7 +66,7 @@ MARIAN_EARLY_STOPPING = 5
 .PHONY: all
-all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
+all: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
@@ -250,11 +250,13 @@ ${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
 	cp ${BASEMODELZIP} ${dir $@}
 	cd ${dir $@} && unzip -u *.zip
+.INTERMEDIATE: ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz
 ${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
 	zcat $< |\
 	${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
 	gzip -c > $@
+.INTERMEDIATE: ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz
 ${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
 	zcat $< |\
 	${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
@@ -314,10 +316,10 @@ ${TUNED_MODEL}.done: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz $
 .PHONY: translate
-translate: ${TEST_SRC}.${TRG}.gz
+translate: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.gz
 ## translate test set
-${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
+${TEST_SRC}.${BASEMODELNAME}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
 	mkdir -p ${dir $@}
 	${LOADMODS} && ${MARIAN}/marian-decoder \
 	-i $< \
@@ -329,10 +331,10 @@ ${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
 .PHONY: translate-baseline
-translate-baseline: ${TEST_SRC}.baseline.${TRG}.gz
+translate-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.gz
 ## translate test set
-${TEST_SRC}.baseline.${TRG}.gz: ${TEST_SRC}.pre.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
+${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.gz: ${TEST_SRC}.pre.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
 	mkdir -p ${dir $@}
 	cd ${LANGPAIR}/${BASEMODELNAME}; \
 	${LOADMODS} && ${MARIAN}/marian-decoder \
@@ -346,18 +348,18 @@ ${TEST_SRC}.baseline.${TRG}.gz: ${TEST_SRC}.pre.gz ${LANGPAIR}/${BASEMODELNAME}/
 .PHONY: eval eval-baseline
-eval: ${TEST_SRC}.${TRG}.eval
-eval-baseline: ${TEST_SRC}.baseline.${TRG}.eval
+eval: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval
+eval-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval
 ## without reference normalisation
-${TEST_SRC}.${TRG}.eval ${TEST_SRC}.baseline.${TRG}.eval: %.eval: %.gz ${TEST_TRG}.gz
+${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval: %.eval: %.gz ${TEST_TRG}.gz
 	zcat ${TEST_TRG}.gz > $@.ref
 	zcat $< | sacrebleu $@.ref > $@
 	zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
 	rm -f $@.ref
 ## with reference normalisation (should not do this)
-${TEST_SRC}.${TRG}.eval-norm ${TEST_SRC}.baseline.${TRG}.eval-norm: %.eval-norm: %.gz ${TEST_TRG}.gz
+${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval-norm ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.eval-norm: %.eval-norm: %.gz ${TEST_TRG}.gz
 	zcat ${TEST_TRG}.gz |\
 	${TOKENIZER}/replace-unicode-punctuation.perl |\
 	${TOKENIZER}/remove-non-printing-char.perl |\
@@ -371,10 +373,10 @@ ${TEST_SRC}.${TRG}.eval-norm ${TEST_SRC}.baseline.${TRG}.eval-norm: %.eval-norm:
 .PHONY: compare compare-baseline
-compare: ${TEST_SRC}.${TRG}.compare
-compare-baseline: ${TEST_SRC}.baseline.${TRG}.compare
+compare: ${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare
+compare-baseline: ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare
-${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare: %.compare: %.eval
+${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare ${TEST_SRC}.${BASEMODELNAME}.baseline.${TRG}.compare: %.compare: %.eval
 	zcat ${TEST_SRC}.gz > $@.1
 	zcat ${TEST_TRG}.gz > $@.2
 	zcat ${<:.eval=.gz} > $@.3
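
The evaluation recipe is plain sacrebleu with the hypothesis on stdin and the unpacked reference as an argument, run once for BLEU and once for chrF. Stand-alone, with placeholder file names, the same scores can be reproduced like this:

    # hyp.en.gz = system output, ref.en.gz = reference translations (placeholder names)
    zcat ref.en.gz > ref.en
    zcat hyp.en.gz | sacrebleu ref.en > hyp.eval
    zcat hyp.en.gz | sacrebleu --metrics=chrf --width=3 ref.en >> hyp.eval
    rm -f ref.en

The two signature lines this produces are exactly what the new *.eval.txt files below contain.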

File diff suppressed because it is too large.

View File

@@ -0,0 +1,2 @@
+BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 22.9 54.2/29.2/17.8/11.3 (BP = 0.962 ratio = 0.963 hyp_len = 36946 ref_len = 38369)
+chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.548

File diff suppressed because it is too large.

View File

@@ -0,0 +1,2 @@
+BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 25.1 56.3/31.7/19.9/13.1 (BP = 0.963 ratio = 0.964 hyp_len = 36977 ref_len = 38369)
+chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.568

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large.

View File

@@ -0,0 +1,2 @@
+BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 28.5 62.4/35.5/21.9/13.8 (BP = 0.998 ratio = 0.998 hyp_len = 36157 ref_len = 36215)
+chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.564

File diff suppressed because it is too large.

View File

@@ -0,0 +1,2 @@
+BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.4.2 = 31.4 64.7/38.5/24.7/16.2 (BP = 0.992 ratio = 0.992 hyp_len = 35930 ref_len = 36215)
+chrF2+case.mixed+numchars.6+numrefs.1+space.False+version.1.4.2 = 0.583

View File

@@ -44,3 +44,26 @@
 | newstestB2017-enfi.en.fi | 22.4 | 0.559 |
 | Tatoeba.en.fi | 39.4 | 0.627 |
+
+# opus-2020-02-13.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-13.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.zip)
+* test set translations: [opus-2020-02-13.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.test.txt)
+* test set scores: [opus-2020-02-13.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| newsdev2015-enfi.en.fi | 21.1 | 0.552 |
+| newstest2015-enfi.en.fi | 22.7 | 0.563 |
+| newstest2016-enfi.en.fi | 24.3 | 0.572 |
+| newstest2017-enfi.en.fi | 27.1 | 0.601 |
+| newstest2018-enfi.en.fi | 18.0 | 0.529 |
+| newstest2019-enfi.en.fi | 22.9 | 0.548 |
+| newstestB2016-enfi.en.fi | 19.2 | 0.537 |
+| newstestB2017-enfi.en.fi | 22.3 | 0.560 |
+| Tatoeba.en.fi | 41.1 | 0.645 |
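
The released zip bundles what the fine-tuning makefile above unpacks and calls: decoder.yml, preprocess.sh and the SentencePiece models. A hedged sketch of using it directly, assuming marian-decoder is on the path and the archive layout matches what those recipes expect:

    wget https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus-2020-02-13.zip
    unzip -u opus-2020-02-13.zip -d en-fi
    cd en-fi
    # preprocess.sh takes the source language and the SentencePiece model (see the recipes above)
    ./preprocess.sh en source.spm < input.en > input.en.pre
    marian-decoder -c decoder.yml -i input.en.pre -o output.fi.pre
    # output is still SentencePiece-encoded; the packages also ship a postprocess.sh for detokenization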

View File

@@ -70,3 +70,27 @@
 | newstestB2017-fien.fi.en | 27.7 | 0.559 |
 | Tatoeba.fi.en | 57.2 | 0.717 |
+
+# opus-2020-02-13.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-13.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.zip)
+* test set translations: [opus-2020-02-13.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.test.txt)
+* test set scores: [opus-2020-02-13.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-13.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| newsdev2015-enfi.fi.en | 25.4 | 0.538 |
+| newstest2015-enfi.fi.en | 27.1 | 0.549 |
+| newstest2016-enfi.fi.en | 29.5 | 0.572 |
+| newstest2017-enfi.fi.en | 33.1 | 0.598 |
+| newstest2018-enfi.fi.en | 24.0 | 0.519 |
+| newstest2019-fien.fi.en | 28.9 | 0.566 |
+| newstestB2016-enfi.fi.en | 24.5 | 0.527 |
+| newstestB2017-enfi.fi.en | 27.9 | 0.560 |
+| newstestB2017-fien.fi.en | 27.9 | 0.560 |
+| Tatoeba.fi.en | 57.4 | 0.718 |

View File

@@ -14,3 +14,19 @@
 | fiskmo_testset.sv.fi | 26.4 | 0.616 |
 | Tatoeba.sv.fi | 44.7 | 0.667 |
+
+# opus-2020-02-13.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-13.zip](https://object.pouta.csc.fi/OPUS-MT-models/sv-fi/opus-2020-02-13.zip)
+* test set translations: [opus-2020-02-13.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/sv-fi/opus-2020-02-13.test.txt)
+* test set scores: [opus-2020-02-13.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/sv-fi/opus-2020-02-13.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| fiskmo_testset.sv.fi | 26.1 | 0.613 |
+| Tatoeba.sv.fi | 44.8 | 0.673 |