diff --git a/Makefile.dist b/Makefile.dist index e311b3b9..865bece2 100644 --- a/Makefile.dist +++ b/Makefile.dist @@ -207,14 +207,14 @@ endif grep chrF *.${DATASET}.${PRE_SRC}-${PRE_TRG}${NR}.${MODELTYPE}.*.eval | cut -f3 -d ' ' > $@.3; \ echo '| testset | BLEU | chr-F |' >> README.md; \ echo '|-----------------------|-------|-------|' >> README.md; \ - paste $@.1 $@.2 $@.3 | sed "s/\t/ | /g;s/^/| /;s/$$/ |/" >> README.md; \ + paste $@.1 $@.2 $@.3 | sed "s/\t/ | /g;s/^/| /;s/$$/ |/" | sort | uniq >> README.md; \ rm -f $@.1 $@.2 $@.3; \ fi @cat ${WORKDIR}/README.md >> ${dir $@}README.md @echo '' >> ${dir $@}README.md @cp models/LICENSE ${WORKDIR}/ @chmod +x ${WORKDIR}/preprocess.sh - @sed -e 's# - /.*/\([^/]*\)$$# - \1#' \ + @sed -e 's# - .*/\([^/]*\)$$# - \1#' \ -e 's/beam-size: [0-9]*$$/beam-size: 6/' \ -e 's/mini-batch: [0-9]*$$/mini-batch: 1/' \ -e 's/maxi-batch: [0-9]*$$/maxi-batch: 1/' \ diff --git a/finetune/Makefile b/finetune/Makefile index 21131187..23c27002 100644 --- a/finetune/Makefile +++ b/finetune/Makefile @@ -1,15 +1,28 @@ # # fine-tune an existing model # set SRC and TRG to source and target language IDs -# defaults: SRC=en TRG=de # # make SRC=xx TRG=yy news-tune-data ...... create tuning data from newstest sets # make SRC=xx TRG=yy all ................. tune and eval # -# other targets: +# +# create a package from the fine-tuned model +# NOTE: set SRCLANGS and TRGLANGS to make the top-level makefile happy +# +# make SRCLANGS=xx TRGLANGS=yy dist +# +# +# other targets for special cases # # make news-enfi ......... make tuned model for en-fi News # make goethe-defi ....... make model for Goethe Institute data +# make waen .............. fine-tune fr-en model for Walloon-English +# make enwa .............. same as waen but for English-Walloon +# make waen-dist ......... make a package for wa-en +# +# +# other targets for sub-tasks +# # make data .............. pre-process train/dev data # make tune .............. 
fine-tune model # make translate ......... translate test set with fine-tuned model @@ -32,9 +45,12 @@ include ../Makefile.env include ../Makefile.config include ../Makefile.slurm - -SRC = en -TRG = de +ifndef SRC + SRC = en +endif +ifndef TRG + TRG = de +endif LANGPAIR = ${SRC}-${TRG} MODEL = news @@ -240,6 +256,103 @@ goethe-test: + +## make dist: +## +## make a package of the fine-tuned model +## and copy it into the models directory (../models) + +## test data evaluation to generate info about test scores in the README.md +OLD_TEST_EVAL = ${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval +NEW_TEST_DATA = ${patsubst %.${SRC},%,${notdir ${TEST_SRC}}} +NEW_TEST_EVAL = ${LANGPAIR}/${MODEL}/model/${NEW_TEST_DATA}.${DATASET}-${MODEL}.${PRE_SRC}-${PRE_TRG}${NR}.transformer.${SRC}.${TRG}.eval + +dist: + cp ${OLD_TEST_EVAL} ${NEW_TEST_EVAL} + ${MAKE} -C .. \ + MODELSHOME=${PWD}/../models \ + MODELS_URL=https://object.pouta.csc.fi/OPUS-MT-models \ + DATASET=${DATASET}-${MODEL} \ + SRCLANGS=${SRCLANGS} TRGLANGS=${TRGLANGS} \ + PREPROCESS_TYPE=spm \ + MODELTYPE=transformer \ + PREPROCESS_SRCMODEL=${PWD}/${LANGPAIR}/${BASEMODELNAME}/source.spm \ + PREPROCESS_TRGMODEL=${PWD}/${LANGPAIR}/${BASEMODELNAME}/target.spm \ + PREPROCESS_DESCRIPTION="normalization + SentencePiece" \ + MODEL_FINAL=${PWD}/${TUNED_MODEL}.npz.best-perplexity.npz \ + MODEL_DECODER=${PWD}/${TUNED_MODEL}.npz.best-perplexity.npz.decoder.yml \ + MODEL_VOCAB=${PWD}/${TUNED_MODEL_VOCAB} \ + MODEL_VALIDLOG=${patsubst %.model,%.valid.log,${PWD}/${TUNED_MODEL}} \ + MODEL_TRAINLOG=${patsubst %.model,%.train.log,${PWD}/${TUNED_MODEL}} \ + TEST_EVALUATION=${PWD}/${TEST_SRC}.${BASEMODELNAME}.${TRG}.eval \ + TEST_COMPARISON=${PWD}/${TEST_SRC}.${BASEMODELNAME}.${TRG}.compare \ + WORKDIR=${PWD}/${LANGPAIR}/${MODEL}/model \ + dist + + + +## fine-tune en-fr model for walloon +## --> can we do that? 
+ +ENWA = ${wildcard ${OPUSHOME}/*/latest/moses/en-wa.*} +enwa-data: en-fr/enwa/train/enwa.fr.gz +waen-data: fr-en/enwa/train/enwa.fr.gz +waen-dist: + ${MAKE} SRCLANGS=wa TRGLANGS=en SRC=fr TRG=en MODEL=enwa dist + +en-fr/enwa/train/enwa.fr.gz: ${ENWA} + mkdir -p en-fr/tmp + cd en-fr/tmp; \ + for c in ${ENWA}; do \ + unzip -n $$c; \ + done + cat en-fr/tmp/*.wa > en-fr/all.wa + cat en-fr/tmp/*.en > en-fr/all.en + paste en-fr/all.en en-fr/all.wa |\ + $(TOKENIZER)/replace-unicode-punctuation.perl |\ + perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\ + sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\ + sort | uniq | shuf > en-fr/all.en-wa + mkdir -p en-fr/enwa/dev + mkdir -p en-fr/enwa/test + mkdir -p en-fr/enwa/train + head -1000 en-fr/all.en-wa | cut -f1 | gzip -c \ + > en-fr/enwa/test/enwa.en.gz + head -1000 en-fr/all.en-wa | cut -f2 | gzip -c \ + > en-fr/enwa/test/enwa.fr.gz + head -2001 en-fr/all.en-wa | tail -1000 | cut -f1 | gzip -c \ + > en-fr/enwa/dev/enwa.en.gz + head -2001 en-fr/all.en-wa | tail -1000 | cut -f2 | gzip -c \ + > en-fr/enwa/dev/enwa.fr.gz + tail -n +2002 en-fr/all.en-wa | cut -f1 | gzip -c \ + > en-fr/enwa/train/enwa.en.gz + tail -n +2002 en-fr/all.en-wa | cut -f2 | gzip -c \ + > en-fr/enwa/train/enwa.fr.gz + rm -f en-fr/all.* + rm -fr en-fr/tmp + +fr-en/enwa/train/enwa.fr.gz: en-fr/enwa/train/enwa.fr.gz + mkdir -p fr-en/enwa/dev + mkdir -p fr-en/enwa/test + mkdir -p fr-en/enwa/train + cp en-fr/enwa/test/enwa.en.gz fr-en/enwa/test/ + cp en-fr/enwa/test/enwa.fr.gz fr-en/enwa/test/ + cp en-fr/enwa/dev/enwa.en.gz fr-en/enwa/dev/ + cp en-fr/enwa/dev/enwa.fr.gz fr-en/enwa/dev/ + cp en-fr/enwa/train/enwa.en.gz fr-en/enwa/train/ + cp en-fr/enwa/train/enwa.fr.gz fr-en/enwa/train/ + + +enwa: en-fr/enwa/train/enwa.fr.gz + ${MAKE} SRC=en TRG=fr MODEL=enwa all + +waen: fr-en/enwa/train/enwa.fr.gz + ${MAKE} SRC=fr TRG=en MODEL=enwa all + + + + + ## make news tuning data from testsets TESTSETS_HOME = ../testsets/${LANGPAIR} 
diff --git a/models/en-fi/README.md b/models/en-fi/README.md index d1b1debe..8520632a 100644 --- a/models/en-fi/README.md +++ b/models/en-fi/README.md @@ -90,3 +90,19 @@ | newstestB2017-enfi.en.fi | 24.9 | 0.584 | | Tatoeba.en.fi | 41.4 | 0.650 | + +# opus+bt-news-2020-03-21.zip + +* dataset: opus+bt-news +* model: transformer +* pre-processing: normalization + SentencePiece +* download: [opus+bt-news-2020-03-21.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus+bt-news-2020-03-21.zip) +* test set translations: [opus+bt-news-2020-03-21.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus+bt-news-2020-03-21.test.txt) +* test set scores: [opus+bt-news-2020-03-21.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-fi/opus+bt-news-2020-03-21.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| newstest2019-enfi.en.fi | 25.7 | 0.578 | + diff --git a/models/fi-en/README.md b/models/fi-en/README.md index 3a5f831e..ea9094a6 100644 --- a/models/fi-en/README.md +++ b/models/fi-en/README.md @@ -94,3 +94,19 @@ | newstestB2017-fien.fi.en | 27.9 | 0.560 | | Tatoeba.fi.en | 57.4 | 0.718 | + +# opus-news-2020-03-21.zip + +* dataset: opus-news +* model: transformer +* pre-processing: normalization + SentencePiece +* download: [opus-news-2020-03-21.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-news-2020-03-21.zip) +* test set translations: [opus-news-2020-03-21.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-news-2020-03-21.test.txt) +* test set scores: [opus-news-2020-03-21.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-news-2020-03-21.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| newstest2019-fien.fi.en | 31.4 | 0.583 | + diff --git a/models/sv-fi/README.md b/models/sv-fi/README.md index ae60eefb..b49a879f 100644 --- a/models/sv-fi/README.md +++ b/models/sv-fi/README.md @@ -29,4 +29,3 @@ 
|-----------------------|-------|-------| | fiskmo_testset.sv.fi | 26.1 | 0.613 | | Tatoeba.sv.fi | 44.8 | 0.673 | - diff --git a/models/wa-en/README.md b/models/wa-en/README.md index 4cd242ee..5e223897 100644 --- a/models/wa-en/README.md +++ b/models/wa-en/README.md @@ -13,3 +13,19 @@ |-----------------------|-------|-------| | Tatoeba.wa.en | 25.0 | 0.449 | + +# opus-enwa-2020-03-21.zip + +* dataset: opus-enwa +* model: transformer +* pre-processing: normalization + SentencePiece +* download: [opus-enwa-2020-03-21.zip](https://object.pouta.csc.fi/OPUS-MT-models/wa-en/opus-enwa-2020-03-21.zip) +* test set translations: [opus-enwa-2020-03-21.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/wa-en/opus-enwa-2020-03-21.test.txt) +* test set scores: [opus-enwa-2020-03-21.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/wa-en/opus-enwa-2020-03-21.eval.txt) + +## Benchmarks + +| testset | BLEU | chr-F | +|-----------------------|-------|-------| +| enwa.fr.en | 42.6 | 0.564 | +