finetuning and backtranslations

Joerg Tiedemann 2020-02-11 23:20:11 +02:00
parent 4b7ae1a39b
commit 870804f4ee
10 changed files with 164 additions and 11 deletions

View File

@@ -23,11 +23,14 @@ scores:
## get the best model from all kind of alternative setups
## in the following sub directories (add prefix work-)
ALT_MODEL_BASE = work-
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
ALT_MODEL_DIR = spm langid
# ALT_MODEL_DIR = spm langid
ALT_MODEL_DIR = langid
best_dist_all:
for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
for l in $(sort ${shell ls ${ALT_MODEL_BASE}* | grep -- '-' | grep -v old | grep -v work}); do \
if [ `find work*/$$l -name '*.npz' | wc -l` -gt 0 ]; then \
d=`find work-spm/$$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.`; \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
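For orientation, the `best_dist_all` loop above scans the language-pair subdirectories of every alternative `work-*` setup and keeps only pairs that already have trained checkpoints. A minimal shell sketch of that selection, assuming the directory layout implied by the diff (with `work-spm` holding the `*.best-perplexity.npz` files):

```sh
# List language pairs under all alternative work-* setups, as the
# Makefile's ${shell ls ${ALT_MODEL_BASE}*} does (directory headers
# like "work-spm:" are dropped by the grep -v work filter).
for l in $(ls work-* | grep -- '-' | grep -v old | grep -v work | sort); do
  # only keep pairs that have at least one trained checkpoint
  if [ "$(find work-*/$l -name '*.npz' | wc -l)" -gt 0 ]; then
    # pick the checkpoint with the best validation perplexity
    d=$(find work-spm/$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.)
    echo "best model for $l: $d"
  fi
done
```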

View File

@@ -30,7 +30,8 @@ MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
ifeq (${MODELNAME},)
MODELHOME = ../work-filter/models/${LANGPAIR}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
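The rewritten `MODELZIP` line restricts the wildcard to dated `opus-20*.zip` releases; because the names embed an ISO date, lexicographic `${sort}` orders them chronologically and `${lastword}` picks the newest. A standalone sketch of the idiom (the `show-model` target and the `fi-en` default are illustrative, not from the repo):

```make
LANGPAIR ?= fi-en                      # hypothetical pair, for illustration
MODELHOME = ../work-filter/models/${LANGPAIR}
# wildcard expands to every dated release; the ISO date in the name
# makes a lexicographic sort chronological, and lastword takes the newest
MODELZIP  = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}

show-model:
	@echo "latest release: ${MODELNAME} (from ${MODELZIP})"
```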
@@ -77,9 +78,10 @@ all: index.html
${MAKE} ${WIKI_SRC} ${WIKI_TRG}
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
all-wikis:
all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
@@ -104,18 +106,18 @@ focus-wikis:
get-data: ${WIKI_JSON}
extract-text: ${WIKI_TXT}
prepare-model: ${LANGPAIR}/decoder.yml
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
prepare-data: ${WIKI_PRE}
translate: ${WIKI_SRC} ${WIKI_TRG}
## translate all parts
translate-all-parts:
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate; \
done
## create jobs for translating all parts
submit-translate-all-parts:
submit-translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate.submit; \
done
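Several targets above gained `${LANGPAIR}/${MODELNAME}/decoder.yml` as a prerequisite, so the released model is fetched and unpacked before any translation runs. A hedged sketch of what such a rule could look like (the recipe body is a plausible reconstruction, not the repo's actual download step):

```make
# Fetch and unpack the release if the model is not present yet; the
# decoder.yml inside the zip serves as the "model is ready" marker.
# (Assumed recipe: the real rule may download from object storage.)
${LANGPAIR}/${MODELNAME}/decoder.yml:
	mkdir -p ${dir $@}
	cd ${dir $@} && unzip -o ${abspath ${MODELZIP}}
```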

View File

@@ -71,17 +71,36 @@ all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
## convert a TMX file to create dev-test-train data
## and start fine-tuning
## and start fine-tuning in the direction of sorted lang-IDs
## set REVERSE = 1 to run in the opposite direction
##
## - this also does some filtering of the TMX
## based on language identification and simple scripts and regexes
## - it assumes that ${TMX} points to a valid TMX file
## - it assumes that there are only 2 languages in the TMX (it will only use 2)
TMX = vero-20200123.tmx.gz
REVERSE = 0
tmx-tune:
zcat ${TMX} |\
tmx2moses -r -o ${TMX:.tmx.gz=}
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
if [ ${REVERSE} -gt 0 ]; then \
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
else \
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
fi; \
echo $$s; echo $$t; \
mkdir -p $$s-$$t; \
paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
sort | uniq | \
python3 ../bitext-match-lang.py -s $$s -t $$t | \
grep -v '[<>{}]' |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
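The new head/tail logic derives the translation direction from the file suffixes that `tmx2moses` produces, and `REVERSE=1` swaps them. A plain-shell sketch of the same idea (the file names are illustrative):

```sh
# tmx2moses writes one file per language, e.g. vero-20200123.fi-en.fi
# and vero-20200123.fi-en.en (names here are assumptions); the sorted
# suffixes fix the default direction, and REVERSE=1 swaps it.
base=vero-20200123
s=$(ls ${base}.*-* | sort | sed 's/^.*\.\([a-z]*\)$/\1/' | head -1)
t=$(ls ${base}.*-* | sort | sed 's/^.*\.\([a-z]*\)$/\1/' | tail -1)
if [ "${REVERSE:-0}" -gt 0 ]; then
  tmp=$s; s=$t; t=$tmp    # fine-tune in the opposite direction
fi
echo "fine-tuning direction: $s -> $t"
```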

View File

@@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| JW300.bcl.en | 56.8 | 0.705 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.bcl.en | 56.1 | 0.697 |

View File

@@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| Tatoeba.bn.en | 49.8 | 0.644 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.bn.en | 49.2 | 0.638 |

View File

@@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| JW300.en.bcl | 55.3 | 0.729 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 53.8 | 0.719 |

View File

@@ -20,3 +20,25 @@
| newstest2019-enru.en.ru | 22.3 | 0.412 |
| Tatoeba.en.ru | 46.9 | 0.656 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| newstest2012.en.ru | 31.1 | 0.581 |
| newstest2013.en.ru | 23.5 | 0.513 |
| newstest2015-enru.en.ru | 27.5 | 0.564 |
| newstest2016-enru.en.ru | 26.4 | 0.548 |
| newstest2017-enru.en.ru | 29.1 | 0.572 |
| newstest2018-enru.en.ru | 25.4 | 0.554 |
| newstest2019-enru.en.ru | 27.1 | 0.533 |
| Tatoeba.en.ru | 48.4 | 0.669 |

View File

@@ -46,3 +46,27 @@
| newstestB2017-fien.fi.en | 27.3 | 0.556 |
| Tatoeba.fi.en | 55.3 | 0.705 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| newsdev2015-enfi.fi.en | 25.1 | 0.535 |
| newstest2015-enfi.fi.en | 26.8 | 0.548 |
| newstest2016-enfi.fi.en | 29.1 | 0.569 |
| newstest2017-enfi.fi.en | 32.7 | 0.596 |
| newstest2018-enfi.fi.en | 23.9 | 0.518 |
| newstest2019-fien.fi.en | 28.7 | 0.564 |
| newstestB2016-enfi.fi.en | 24.2 | 0.525 |
| newstestB2017-enfi.fi.en | 27.7 | 0.559 |
| newstestB2017-fien.fi.en | 27.7 | 0.559 |
| Tatoeba.fi.en | 57.2 | 0.717 |

View File

@@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| Tatoeba.ml.en | 43.0 | 0.601 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.ml.en | 42.6 | 0.591 |

View File

@@ -67,3 +67,26 @@
| newstest2019-ruen.ru.en | 32.0 | 0.581 |
| Tatoeba.ru.en | 59.8 | 0.726 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| newstest2012.ru.en | 34.8 | 0.603 |
| newstest2013.ru.en | 28.1 | 0.546 |
| newstest2014-ruen.ru.en | 32.1 | 0.593 |
| newstest2015-enru.ru.en | 30.3 | 0.567 |
| newstest2016-enru.ru.en | 30.1 | 0.566 |
| newstest2017-enru.ru.en | 33.4 | 0.593 |
| newstest2018-enru.ru.en | 29.6 | 0.566 |
| newstest2019-ruen.ru.en | 31.5 | 0.577 |
| Tatoeba.ru.en | 60.8 | 0.734 |