mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
finetuning and backtranslations
This commit is contained in:
parent
4b7ae1a39b
commit
870804f4ee
@ -23,11 +23,14 @@ scores:
|
|||||||
## get the best model from all kinds of alternative setups
|
## get the best model from all kinds of alternative setups
|
||||||
## in the following sub directories (add prefix work-)
|
## in the following sub directories (add prefix work-)
|
||||||
|
|
||||||
|
ALT_MODEL_BASE = work-
|
||||||
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
|
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
|
||||||
ALT_MODEL_DIR = spm langid
|
# ALT_MODEL_DIR = spm langid
|
||||||
|
ALT_MODEL_DIR = langid
|
||||||
|
|
||||||
|
|
||||||
best_dist_all:
|
best_dist_all:
|
||||||
for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
|
for l in $(sort ${shell ls ${ALT_MODEL_BASE}* | grep -- '-' | grep -v old | grep -v work}); do \
|
||||||
if [ `find work*/$$l -name '*.npz' | wc -l` -gt 0 ]; then \
|
if [ `find work*/$$l -name '*.npz' | wc -l` -gt 0 ]; then \
|
||||||
d=`find work-spm/$$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.`; \
|
d=`find work-spm/$$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.`; \
|
||||||
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
|
||||||
|
@ -30,7 +30,8 @@ MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
|||||||
|
|
||||||
ifeq (${MODELNAME},)
|
ifeq (${MODELNAME},)
|
||||||
MODELHOME = ../work-filter/models/${LANGPAIR}
|
MODELHOME = ../work-filter/models/${LANGPAIR}
|
||||||
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
||||||
|
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
|
||||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||||
endif
|
endif
|
||||||
|
|
||||||
@ -77,9 +78,10 @@ all: index.html
|
|||||||
${MAKE} ${WIKI_SRC} ${WIKI_TRG}
|
${MAKE} ${WIKI_SRC} ${WIKI_TRG}
|
||||||
|
|
||||||
|
|
||||||
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
|
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
|
||||||
|
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
|
||||||
|
|
||||||
all-wikis:
|
all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||||
for w in ${WIKISOURCES}; do \
|
for w in ${WIKISOURCES}; do \
|
||||||
${MAKE} WIKISOURCE=$$w extract-text; \
|
${MAKE} WIKISOURCE=$$w extract-text; \
|
||||||
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
|
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
|
||||||
@ -104,18 +106,18 @@ focus-wikis:
|
|||||||
|
|
||||||
get-data: ${WIKI_JSON}
|
get-data: ${WIKI_JSON}
|
||||||
extract-text: ${WIKI_TXT}
|
extract-text: ${WIKI_TXT}
|
||||||
prepare-model: ${LANGPAIR}/decoder.yml
|
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||||
prepare-data: ${WIKI_PRE}
|
prepare-data: ${WIKI_PRE}
|
||||||
translate: ${WIKI_SRC} ${WIKI_TRG}
|
translate: ${WIKI_SRC} ${WIKI_TRG}
|
||||||
|
|
||||||
## translate all parts
|
## translate all parts
|
||||||
translate-all-parts:
|
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||||
for p in ${PARTS}; do \
|
for p in ${PARTS}; do \
|
||||||
${MAKE} PART=$$p translate; \
|
${MAKE} PART=$$p translate; \
|
||||||
done
|
done
|
||||||
|
|
||||||
## create jobs for translating all parts
|
## create jobs for translating all parts
|
||||||
submit-translate-all-parts:
|
submit-translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||||
for p in ${PARTS}; do \
|
for p in ${PARTS}; do \
|
||||||
${MAKE} PART=$$p translate.submit; \
|
${MAKE} PART=$$p translate.submit; \
|
||||||
done
|
done
|
||||||
|
@ -71,17 +71,36 @@ all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
|
|||||||
|
|
||||||
|
|
||||||
## convert a TMX file to create dev-test-train data
|
## convert a TMX file to create dev-test-train data
|
||||||
## and start fine-tuning
|
## and start fine-tuning in the direction of sorted lang-IDs
|
||||||
|
## set REVERSE = 1 to run in the opposite direction
|
||||||
|
##
|
||||||
|
## - this also does some filtering of the TMX
|
||||||
|
## based on language identification and simple scripts and regexes
|
||||||
|
## - it assumes that ${TMX} points to a valid TMX file
|
||||||
|
## - it assumes that there are only 2 languages in the TMX (it will only use 2)
|
||||||
|
|
||||||
TMX = vero-20200123.tmx.gz
|
TMX = vero-20200123.tmx.gz
|
||||||
|
REVERSE = 0
|
||||||
|
|
||||||
tmx-tune:
|
tmx-tune:
|
||||||
zcat ${TMX} |\
|
zcat ${TMX} |\
|
||||||
tmx2moses -r -o ${TMX:.tmx.gz=}
|
tmx2moses -r -o ${TMX:.tmx.gz=}
|
||||||
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
if [ ${REVERSE} -gt 0 ]; then \
|
||||||
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
||||||
|
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
||||||
|
else \
|
||||||
|
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
|
||||||
|
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
|
||||||
|
fi; \
|
||||||
echo $$s; echo $$t; \
|
echo $$s; echo $$t; \
|
||||||
mkdir -p $$s-$$t; \
|
mkdir -p $$s-$$t; \
|
||||||
paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
|
paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
|
||||||
|
sort | uniq | \
|
||||||
|
python3 ../bitext-match-lang.py -s $$s -t $$t | \
|
||||||
|
grep -v '[<>{}]' |\
|
||||||
|
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||||
|
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
|
||||||
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
||||||
shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
|
shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
|
||||||
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
|
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
|
||||||
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
|
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
|
||||||
|
@ -28,3 +28,18 @@
|
|||||||
|-----------------------|-------|-------|
|
|-----------------------|-------|-------|
|
||||||
| JW300.bcl.en | 56.8 | 0.705 |
|
| JW300.bcl.en | 56.8 | 0.705 |
|
||||||
|
|
||||||
|
# opus-2020-02-11.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.zip)
|
||||||
|
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| JW300.bcl.en | 56.1 | 0.697 |
|
||||||
|
|
||||||
|
@ -28,3 +28,18 @@
|
|||||||
|-----------------------|-------|-------|
|
|-----------------------|-------|-------|
|
||||||
| Tatoeba.bn.en | 49.8 | 0.644 |
|
| Tatoeba.bn.en | 49.8 | 0.644 |
|
||||||
|
|
||||||
|
# opus-2020-02-11.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.zip)
|
||||||
|
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| Tatoeba.bn.en | 49.2 | 0.638 |
|
||||||
|
|
||||||
|
@ -28,3 +28,18 @@
|
|||||||
|-----------------------|-------|-------|
|
|-----------------------|-------|-------|
|
||||||
| JW300.en.bcl | 55.3 | 0.729 |
|
| JW300.en.bcl | 55.3 | 0.729 |
|
||||||
|
|
||||||
|
# opus-2020-02-11.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.zip)
|
||||||
|
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| JW300.en.bcl | 53.8 | 0.719 |
|
||||||
|
|
||||||
|
@ -20,3 +20,25 @@
|
|||||||
| newstest2019-enru.en.ru | 22.3 | 0.412 |
|
| newstest2019-enru.en.ru | 22.3 | 0.412 |
|
||||||
| Tatoeba.en.ru | 46.9 | 0.656 |
|
| Tatoeba.en.ru | 46.9 | 0.656 |
|
||||||
|
|
||||||
|
# opus-2020-02-11.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.zip)
|
||||||
|
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| newstest2012.en.ru | 31.1 | 0.581 |
|
||||||
|
| newstest2013.en.ru | 23.5 | 0.513 |
|
||||||
|
| newstest2015-enru.en.ru | 27.5 | 0.564 |
|
||||||
|
| newstest2016-enru.en.ru | 26.4 | 0.548 |
|
||||||
|
| newstest2017-enru.en.ru | 29.1 | 0.572 |
|
||||||
|
| newstest2018-enru.en.ru | 25.4 | 0.554 |
|
||||||
|
| newstest2019-enru.en.ru | 27.1 | 0.533 |
|
||||||
|
| Tatoeba.en.ru | 48.4 | 0.669 |
|
||||||
|
|
||||||
|
@ -46,3 +46,27 @@
|
|||||||
| newstestB2017-fien.fi.en | 27.3 | 0.556 |
|
| newstestB2017-fien.fi.en | 27.3 | 0.556 |
|
||||||
| Tatoeba.fi.en | 55.3 | 0.705 |
|
| Tatoeba.fi.en | 55.3 | 0.705 |
|
||||||
|
|
||||||
|
# opus-2020-02-11.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.zip)
|
||||||
|
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| newsdev2015-enfi.fi.en | 25.1 | 0.535 |
|
||||||
|
| newstest2015-enfi.fi.en | 26.8 | 0.548 |
|
||||||
|
| newstest2016-enfi.fi.en | 29.1 | 0.569 |
|
||||||
|
| newstest2017-enfi.fi.en | 32.7 | 0.596 |
|
||||||
|
| newstest2018-enfi.fi.en | 23.9 | 0.518 |
|
||||||
|
| newstest2019-fien.fi.en | 28.7 | 0.564 |
|
||||||
|
| newstestB2016-enfi.fi.en | 24.2 | 0.525 |
|
||||||
|
| newstestB2017-enfi.fi.en | 27.7 | 0.559 |
|
||||||
|
| newstestB2017-fien.fi.en | 27.7 | 0.559 |
|
||||||
|
| Tatoeba.fi.en | 57.2 | 0.717 |
|
||||||
|
|
||||||
|
@ -28,3 +28,18 @@
|
|||||||
|-----------------------|-------|-------|
|
|-----------------------|-------|-------|
|
||||||
| Tatoeba.ml.en | 43.0 | 0.601 |
|
| Tatoeba.ml.en | 43.0 | 0.601 |
|
||||||
|
|
||||||
|
# opus-2020-02-11.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.zip)
|
||||||
|
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| Tatoeba.ml.en | 42.6 | 0.591 |
|
||||||
|
|
||||||
|
@ -67,3 +67,26 @@
|
|||||||
| newstest2019-ruen.ru.en | 32.0 | 0.581 |
|
| newstest2019-ruen.ru.en | 32.0 | 0.581 |
|
||||||
| Tatoeba.ru.en | 59.8 | 0.726 |
|
| Tatoeba.ru.en | 59.8 | 0.726 |
|
||||||
|
|
||||||
|
# opus-2020-02-11.zip
|
||||||
|
|
||||||
|
* dataset: opus
|
||||||
|
* model: transformer-align
|
||||||
|
* pre-processing: normalization + SentencePiece
|
||||||
|
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.zip)
|
||||||
|
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.test.txt)
|
||||||
|
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.eval.txt)
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| testset | BLEU | chr-F |
|
||||||
|
|-----------------------|-------|-------|
|
||||||
|
| newstest2012.ru.en | 34.8 | 0.603 |
|
||||||
|
| newstest2013.ru.en | 28.1 | 0.546 |
|
||||||
|
| newstest2014-ruen.ru.en | 32.1 | 0.593 |
|
||||||
|
| newstest2015-enru.ru.en | 30.3 | 0.567 |
|
||||||
|
| newstest2016-enru.ru.en | 30.1 | 0.566 |
|
||||||
|
| newstest2017-enru.ru.en | 33.4 | 0.593 |
|
||||||
|
| newstest2018-enru.ru.en | 29.6 | 0.566 |
|
||||||
|
| newstest2019-ruen.ru.en | 31.5 | 0.577 |
|
||||||
|
| Tatoeba.ru.en | 60.8 | 0.734 |
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user