finetuning and backtranslations

Joerg Tiedemann 2020-02-11 23:20:11 +02:00
parent 4b7ae1a39b
commit 870804f4ee
10 changed files with 164 additions and 11 deletions

View File

@@ -23,11 +23,14 @@ scores:
## get the best model from all kind of alternative setups
## in the following sub directories (add prefix work-)
ALT_MODEL_BASE = work-
# ALT_MODEL_DIR = bpe-old bpe-memad bpe spm-noalign bpe-align spm
ALT_MODEL_DIR = spm langid
# ALT_MODEL_DIR = spm langid
ALT_MODEL_DIR = langid
best_dist_all:
for l in $(sort ${shell ls work* | grep -- '-' | grep -v old | grep -v work}); do \
for l in $(sort ${shell ls ${ALT_MODEL_BASE}* | grep -- '-' | grep -v old | grep -v work}); do \
if [ `find work*/$$l -name '*.npz' | wc -l` -gt 0 ]; then \
d=`find work-spm/$$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.`; \
${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
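For orientation, the `best_dist_all` loop above scans the language-pair subdirectories of every alternative `work-*` setup and keeps only pairs that already have trained checkpoints. A minimal shell sketch of that selection, assuming the directory layout implied by the diff (with `work-spm` holding the `*.best-perplexity.npz` files):

```sh
# List language pairs under all alternative work-* setups, as the
# Makefile's ${shell ls ${ALT_MODEL_BASE}*} does (directory headers
# like "work-spm:" are dropped by the grep -v work filter).
for l in $(ls work-* | grep -- '-' | grep -v old | grep -v work | sort); do
  # only keep pairs that have at least one trained checkpoint
  if [ "$(find work-*/$l -name '*.npz' | wc -l)" -gt 0 ]; then
    # pick the checkpoint with the best validation perplexity
    d=$(find work-spm/$l -name '*.best-perplexity.npz' -exec basename {} \; | cut -f1 -d.)
    echo "best model for $l: $d"
  fi
done
```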

View File

@@ -30,7 +30,8 @@ MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
ifeq (${MODELNAME},)
MODELHOME = ../work-filter/models/${LANGPAIR}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
# MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
endif
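The rewritten `MODELZIP` line restricts the wildcard to dated `opus-20*.zip` releases; because the names embed an ISO date, lexicographic `${sort}` orders them chronologically and `${lastword}` picks the newest. A standalone sketch of the idiom (the `show-model` target and the `fi-en` default are illustrative, not from the repo):

```make
LANGPAIR ?= fi-en                      # hypothetical pair, for illustration
MODELHOME = ../work-filter/models/${LANGPAIR}
# wildcard expands to every dated release; the ISO date in the name
# makes a lexicographic sort chronological, and lastword takes the newest
MODELZIP  = ${lastword ${sort ${wildcard ${MODELHOME}/opus-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}

show-model:
	@echo "latest release: ${MODELNAME} (from ${MODELZIP})"
```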
@@ -77,9 +78,10 @@ all: index.html
${MAKE} ${WIKI_SRC} ${WIKI_TRG}
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
# WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource wiktionary
WIKISOURCES = wiki wikibooks wikinews wikiquote wikisource
all-wikis:
all-wikis: ${LANGPAIR}/${MODELNAME}/decoder.yml
for w in ${WIKISOURCES}; do \
${MAKE} WIKISOURCE=$$w extract-text; \
echo "find ${WIKI_DIR} -name '$$w.${LANGID}.${PART}.gz'"; \
@@ -104,18 +106,18 @@ focus-wikis:
get-data: ${WIKI_JSON}
extract-text: ${WIKI_TXT}
prepare-model: ${LANGPAIR}/decoder.yml
prepare-model: ${LANGPAIR}/${MODELNAME}/decoder.yml
prepare-data: ${WIKI_PRE}
translate: ${WIKI_SRC} ${WIKI_TRG}
## translate all parts
translate-all-parts:
translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate; \
done
## create jobs for translating all parts
submit-translate-all-parts:
submit-translate-all-parts: ${LANGPAIR}/${MODELNAME}/decoder.yml
for p in ${PARTS}; do \
${MAKE} PART=$$p translate.submit; \
done
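Several targets above gained `${LANGPAIR}/${MODELNAME}/decoder.yml` as a prerequisite, so the released model is fetched and unpacked before any translation runs. A hedged sketch of what such a rule could look like (the recipe body is a plausible reconstruction, not the repo's actual download step):

```make
# Fetch and unpack the release if the model is not present yet; the
# decoder.yml inside the zip serves as the "model is ready" marker.
# (Assumed recipe: the real rule may download from object storage.)
${LANGPAIR}/${MODELNAME}/decoder.yml:
	mkdir -p ${dir $@}
	cd ${dir $@} && unzip -o ${abspath ${MODELZIP}}
```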

View File

@@ -71,17 +71,36 @@ all: ${TEST_SRC}.${TRG}.compare ${TEST_SRC}.baseline.${TRG}.compare
## convert a TMX file to create dev-test-train data
## and start fine-tuning
## and start fine-tuning in the direction of sorted lang-IDs
## set REVERSE = 1 to run in the opposite direction
##
## - this also does some filtering of the TMX
## based on language identification and simple scripts and regexes
## - it assumes that ${TMX} points to a valid TMX file
## - it assumes that there are only 2 languages in the TMX (it will only use 2)
TMX = vero-20200123.tmx.gz
REVERSE = 0
tmx-tune:
zcat ${TMX} |\
tmx2moses -r -o ${TMX:.tmx.gz=}
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
if [ ${REVERSE} -gt 0 ]; then \
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
else \
s=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | head -1`; \
t=`ls ${TMX:.tmx.gz=}.*-* | sort | sed 's/^.*\.\([a-z]*\)$$/\1/' | tail -1`; \
fi; \
echo $$s; echo $$t; \
mkdir -p $$s-$$t; \
paste ${TMX:.tmx.gz=}.*-*.$$s ${TMX:.tmx.gz=}.*-*.$$t | \
sort | uniq | \
python3 ../bitext-match-lang.py -s $$s -t $$t | \
grep -v '[<>{}]' |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
shuf > ${TMX:.tmx.gz=}.$$s-$$t.shuffled; \
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/dev; \
mkdir -p $$s-$$t/${TMX:.tmx.gz=}/test; \
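The new head/tail logic derives the translation direction from the file suffixes that `tmx2moses` produces, and `REVERSE=1` swaps them. A plain-shell sketch of the same idea (the file names are illustrative):

```sh
# tmx2moses writes one file per language, e.g. vero-20200123.fi-en.fi
# and vero-20200123.fi-en.en (names here are assumptions); the sorted
# suffixes fix the default direction, and REVERSE=1 swaps it.
base=vero-20200123
s=$(ls ${base}.*-* | sort | sed 's/^.*\.\([a-z]*\)$/\1/' | head -1)
t=$(ls ${base}.*-* | sort | sed 's/^.*\.\([a-z]*\)$/\1/' | tail -1)
if [ "${REVERSE:-0}" -gt 0 ]; then
  tmp=$s; s=$t; t=$tmp    # fine-tune in the opposite direction
fi
echo "fine-tuning direction: $s -> $t"
```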

View File

@@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| JW300.bcl.en | 56.8 | 0.705 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bcl-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.bcl.en | 56.1 | 0.697 |

View File

@@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| Tatoeba.bn.en | 49.8 | 0.644 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/bn-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.bn.en | 49.2 | 0.638 |

View File

@@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| JW300.en.bcl | 55.3 | 0.729 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-bcl/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| JW300.en.bcl | 53.8 | 0.719 |

View File

@@ -20,3 +20,25 @@
| newstest2019-enru.en.ru | 22.3 | 0.412 |
| Tatoeba.en.ru | 46.9 | 0.656 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/en-ru/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| newstest2012.en.ru | 31.1 | 0.581 |
| newstest2013.en.ru | 23.5 | 0.513 |
| newstest2015-enru.en.ru | 27.5 | 0.564 |
| newstest2016-enru.en.ru | 26.4 | 0.548 |
| newstest2017-enru.en.ru | 29.1 | 0.572 |
| newstest2018-enru.en.ru | 25.4 | 0.554 |
| newstest2019-enru.en.ru | 27.1 | 0.533 |
| Tatoeba.en.ru | 48.4 | 0.669 |

View File

@@ -46,3 +46,27 @@
| newstestB2017-fien.fi.en | 27.3 | 0.556 |
| Tatoeba.fi.en | 55.3 | 0.705 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/fi-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| newsdev2015-enfi.fi.en | 25.1 | 0.535 |
| newstest2015-enfi.fi.en | 26.8 | 0.548 |
| newstest2016-enfi.fi.en | 29.1 | 0.569 |
| newstest2017-enfi.fi.en | 32.7 | 0.596 |
| newstest2018-enfi.fi.en | 23.9 | 0.518 |
| newstest2019-fien.fi.en | 28.7 | 0.564 |
| newstestB2016-enfi.fi.en | 24.2 | 0.525 |
| newstestB2017-enfi.fi.en | 27.7 | 0.559 |
| newstestB2017-fien.fi.en | 27.7 | 0.559 |
| Tatoeba.fi.en | 57.2 | 0.717 |

View File

@@ -28,3 +28,18 @@
|-----------------------|-------|-------|
| Tatoeba.ml.en | 43.0 | 0.601 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ml-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| Tatoeba.ml.en | 42.6 | 0.591 |

View File

@@ -67,3 +67,26 @@
| newstest2019-ruen.ru.en | 32.0 | 0.581 |
| Tatoeba.ru.en | 59.8 | 0.726 |
# opus-2020-02-11.zip
* dataset: opus
* model: transformer-align
* pre-processing: normalization + SentencePiece
* download: [opus-2020-02-11.zip](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.zip)
* test set translations: [opus-2020-02-11.test.txt](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.test.txt)
* test set scores: [opus-2020-02-11.eval.txt](https://object.pouta.csc.fi/OPUS-MT-models/ru-en/opus-2020-02-11.eval.txt)
## Benchmarks
| testset | BLEU | chr-F |
|-----------------------|-------|-------|
| newstest2012.ru.en | 34.8 | 0.603 |
| newstest2013.ru.en | 28.1 | 0.546 |
| newstest2014-ruen.ru.en | 32.1 | 0.593 |
| newstest2015-enru.ru.en | 30.3 | 0.567 |
| newstest2016-enru.ru.en | 30.1 | 0.566 |
| newstest2017-enru.ru.en | 33.4 | 0.593 |
| newstest2018-enru.ru.en | 29.6 | 0.566 |
| newstest2019-ruen.ru.en | 31.5 | 0.577 |
| Tatoeba.ru.en | 60.8 | 0.734 |