From 870804f4ee15b3054dcf419f56753e45d6a72855 Mon Sep 17 00:00:00 2001
From: Joerg Tiedemann
Date: Tue, 11 Feb 2020 23:20:11 +0200
Subject: [PATCH] finetuning and backtranslations

---
 Makefile.dist           |  7 +++++--
 backtranslate/Makefile  | 14 ++++++++------
 finetune/Makefile       | 25 ++++++++++++++++++++++---
 models/bcl-en/README.md | 15 +++++++++++++++
 models/bn-en/README.md  | 15 +++++++++++++++
 models/en-bcl/README.md | 15 +++++++++++++++
 models/en-ru/README.md  | 22 ++++++++++++++++++++++
 models/fi-en/README.md  | 24 ++++++++++++++++++++++++
 models/ml-en/README.md  | 15 +++++++++++++++
 models/ru-en/README.md  | 23 +++++++++++++++++++++++
 10 files changed, 164 insertions(+), 11 deletions(-)

diff --git a/Makefile.dist b/Makefile.dist
index 7ae1b3ef..6697d99b 100644
--- a/Makefile.dist
+++ b/Makefile.dist
@@ -23,11 +23,14 @@ scores:
 
 ## get the best model from all kind of alternative setups
 ## in the following sub directories (add prefix work-)
+ALT_MODEL_BASE = work-
 # ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
-ALT_MODEL_DIR = spm langid
+# ALT_MODEL_DIR = spm langid
+ALT_MODEL_DIR = langid
+
 best_dist_all:
-	for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
+	for l in $(sort ${shell ls ${ALT_MODEL_BASE}* | grep -- '-' | grep -v old | grep -v work}); do \
 	  if [ `find work*/$$l -name '*.npz' | wc -l` -gt 0 ]; then \
 	    d=`find work-spm/$$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.`; \
 	    ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
diff --git a/backtranslate/Makefile b/backtranslate/Makefile
index d3b8672f..1310a5ad 100644
--- a/backtranslate/Makefile
+++ b/backtranslate/Makefile
@@ -30,7 +30,8 @@ MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
 
 ifeq (${MODELNAME},)
   MODELHOME = ../work-filter/models/${LANGPAIR}
-  MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
+  # MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
+  MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
   MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
 endif
@@ -77,9 +78,10 @@ all: index.html
 	${MAKE} ${WIKI_SRC} ${WIKI_TRG}
 
-WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
+# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
+WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
 
-all-wikis:
+all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
 	for w in ${WIKISOURCES}; do \
 	  ${MAKE} WIKISOURCE=$$w extract-text; \
 	  echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
@@ -104,18 +106,18 @@ focus-wikis:
 
 get-data: ${WIKI_JSON}
 extract-text: ${WIKI_TXT}
-prepare-model: ${LANGPAIR}/decoder.yml
+prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
 prepare-data: ${WIKI_PRE}
 translate: ${WIKI_SRC} ${WIKI_TRG}
 
 ## translate all parts
-translate-all-parts:
+translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
 	for p in ${PARTS}; do \
 	  ${MAKE} PART=$$p translate; \
 	done
 
 ## create jobs for translating all parts
-submit-translate-all-parts:
+submit-translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
 	for p in ${PARTS}; do \
 	  ${MAKE} PART=$$p translate.submit; \
 	done
diff --git a/finetune/Makefile b/finetune/Makefile
index afb6190c..fcac91b8 100644
--- a/finetune/Makefile
+++ b/finetune/Makefile
@@ -71,17 +71,36 @@ all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
 
 ## convert a TMX file to create dev-test-train data
-## and start fine-tuning
+## and start fine-tuning in the direction of sorted lang-IDs
+## set REVERSE = 1 to run in the opposite direction
+##
+## - this also does some filtering of the TMX
+##   based on language identification and simple scripts and regexes
+## - it assumes that ${TMX} points to a valid TMX file
+## - it assumes that there are only 2 languages in the TMX (it will only use 2)
 
 TMX = vero-20200123.tmx.gz
+REVERSE = 0
+
 tmx-tune:
 	zcat ${TMX} |\
 	tmx2moses -r -o ${TMX:.tmx.gz=}
-	s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
-	t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
+	if [ ${REVERSE} -gt 0 ]; then \
+	  t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
+	  s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
+	else \
+	  s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
+	  t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
+	fi; \
 	echo $$s; echo $$t; \
 	mkdir -p $$s-$$t; \
 	paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
+	sort | uniq | \
+	python3 ../bitext-match-lang.py -s $$s -t $$t | \
+	grep -v '[<>{}]' |\
+	$(TOKENIZER)/replace-unicode-punctuation.perl |\
+	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
+	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
 	shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
 	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
 	mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
diff --git a/models/bcl-en/README.md b/models/bcl-en/README.md
index 380df394..60da87cb 100644
--- a/models/bcl-en/README.md
+++ b/models/bcl-en/README.md
@@ -28,3 +28,18 @@
 |-----------------------|-------|-------|
 | JW300.bcl.en | 56.8 | 0.705 |
 
+# opus-2020-02-11.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.zip)
+* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.test.txt)
+* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| JW300.bcl.en | 56.1 | 0.697 |
+
diff --git a/models/bn-en/README.md b/models/bn-en/README.md
index ce5fa6a6..c67a6d3d 100644
--- a/models/bn-en/README.md
+++ b/models/bn-en/README.md
@@ -28,3 +28,18 @@
 |-----------------------|-------|-------|
 | Tatoeba.bn.en | 49.8 | 0.644 |
 
+# opus-2020-02-11.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.zip)
+* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.test.txt)
+* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| Tatoeba.bn.en | 49.2 | 0.638 |
+
diff --git a/models/en-bcl/README.md b/models/en-bcl/README.md
index 04876e9f..1066fc4f 100644
--- a/models/en-bcl/README.md
+++ b/models/en-bcl/README.md
@@ -28,3 +28,18 @@
 |-----------------------|-------|-------|
 | JW300.en.bcl | 55.3 | 0.729 |
 
+# opus-2020-02-11.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.zip)
+* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.test.txt)
+* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| JW300.en.bcl | 53.8 | 0.719 |
+
diff --git a/models/en-ru/README.md b/models/en-ru/README.md
index f3754000..18b11df0 100644
--- a/models/en-ru/README.md
+++ b/models/en-ru/README.md
@@ -20,3 +20,25 @@
 | newstest2019-enru.en.ru | 22.3 | 0.412 |
 | Tatoeba.en.ru | 46.9 | 0.656 |
 
+# opus-2020-02-11.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.zip)
+* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.test.txt)
+* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| newstest2012.en.ru | 31.1 | 0.581 |
+| newstest2013.en.ru | 23.5 | 0.513 |
+| newstest2015-enru.en.ru | 27.5 | 0.564 |
+| newstest2016-enru.en.ru | 26.4 | 0.548 |
+| newstest2017-enru.en.ru | 29.1 | 0.572 |
+| newstest2018-enru.en.ru | 25.4 | 0.554 |
+| newstest2019-enru.en.ru | 27.1 | 0.533 |
+| Tatoeba.en.ru | 48.4 | 0.669 |
+
diff --git a/models/fi-en/README.md b/models/fi-en/README.md
index 900b92a8..39552b2a 100644
--- a/models/fi-en/README.md
+++ b/models/fi-en/README.md
@@ -46,3 +46,27 @@
 | newstestB2017-fien.fi.en | 27.3 | 0.556 |
 | Tatoeba.fi.en | 55.3 | 0.705 |
 
+# opus-2020-02-11.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.zip)
+* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.test.txt)
+* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| newsdev2015-enfi.fi.en | 25.1 | 0.535 |
+| newstest2015-enfi.fi.en | 26.8 | 0.548 |
+| newstest2016-enfi.fi.en | 29.1 | 0.569 |
+| newstest2017-enfi.fi.en | 32.7 | 0.596 |
+| newstest2018-enfi.fi.en | 23.9 | 0.518 |
+| newstest2019-fien.fi.en | 28.7 | 0.564 |
+| newstestB2016-enfi.fi.en | 24.2 | 0.525 |
+| newstestB2017-enfi.fi.en | 27.7 | 0.559 |
+| newstestB2017-fien.fi.en | 27.7 | 0.559 |
+| Tatoeba.fi.en | 57.2 | 0.717 |
+
diff --git a/models/ml-en/README.md b/models/ml-en/README.md
index 1b618391..7612909a 100644
--- a/models/ml-en/README.md
+++ b/models/ml-en/README.md
@@ -28,3 +28,18 @@
 |-----------------------|-------|-------|
 | Tatoeba.ml.en | 43.0 | 0.601 |
 
+# opus-2020-02-11.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.zip)
+* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.test.txt)
+* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| Tatoeba.ml.en | 42.6 | 0.591 |
+
diff --git a/models/ru-en/README.md b/models/ru-en/README.md
index cd68c8fc..cca2385c 100644
--- a/models/ru-en/README.md
+++ b/models/ru-en/README.md
@@ -67,3 +67,26 @@
 | newstest2019-ruen.ru.en | 32.0 | 0.581 |
 | Tatoeba.ru.en | 59.8 | 0.726 |
 
+# opus-2020-02-11.zip
+
+* dataset: opus
+* model: transformer-align
+* pre-processing: normalization + SentencePiece
+* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.zip)
+* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.test.txt)
+* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.eval.txt)
+
+## Benchmarks
+
+| testset | BLEU | chr-F |
+|-----------------------|-------|-------|
+| newstest2012.ru.en | 34.8 | 0.603 |
+| newstest2013.ru.en | 28.1 | 0.546 |
+| newstest2014-ruen.ru.en | 32.1 | 0.593 |
+| newstest2015-enru.ru.en | 30.3 | 0.567 |
+| newstest2016-enru.ru.en | 30.1 | 0.566 |
+| newstest2017-enru.ru.en | 33.4 | 0.593 |
+| newstest2018-enru.ru.en | 29.6 | 0.566 |
+| newstest2019-ruen.ru.en | 31.5 | 0.577 |
+| Tatoeba.ru.en | 60.8 | 0.734 |
+
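
A minimal usage sketch of the new REVERSE option in finetune/Makefile, assuming the variable defaults shown in the hunk above; the TMX file name below is a placeholder and `make -C finetune` assumes the repository root as working directory:

    # convert the TMX and fine-tune in the direction of the sorted language IDs
    make -C finetune TMX=customer-20200123.tmx.gz tmx-tune

    # same TMX, but fine-tune in the opposite direction
    make -C finetune TMX=customer-20200123.tmx.gz REVERSE=1 tmx-tune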