From e428895a00f0c3bf27fef1666723759bd4de21a6 Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Fri, 18 Mar 2022 15:16:29 +0200 Subject: [PATCH 1/2] elg --- lib/data.mk | 4 ++-- lib/projects/elg.mk | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/data.mk b/lib/data.mk index 1bcb650b..50eb5b51 100644 --- a/lib/data.mk +++ b/lib/data.mk @@ -502,8 +502,8 @@ ifeq (${CLEAN_CORPUS_TRAINING_DATA},1) ${LOCAL_TRAIN_SRC} $(SRCEXT) $(TRGEXT) \ ${LOCAL_TRAIN_SRC}.clean \ ${MIN_NR_TOKENS} ${MAX_NR_TOKENS} - @mv -f ${LOCAL_TRAIN_SRC}.clean,${SRCEXT} ${LOCAL_TRAIN_SRC} - @mv -f ${LOCAL_TRAIN_SRC}.clean,${TRGEXT} ${LOCAL_TRAIN_TRG} + @mv -f ${LOCAL_TRAIN_SRC}.clean.${SRCEXT} ${LOCAL_TRAIN_SRC} + @mv -f ${LOCAL_TRAIN_SRC}.clean.${TRGEXT} ${LOCAL_TRAIN_TRG} @rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.${SRCEXT} @rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.${TRGEXT} endif diff --git a/lib/projects/elg.mk b/lib/projects/elg.mk index 4a51b13d..803342f6 100644 --- a/lib/projects/elg.mk +++ b/lib/projects/elg.mk @@ -115,7 +115,7 @@ elg-ukr2fin-tiny11: elg-gmq2ukr-tiny11: - ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS="dan isl nno nob nor swe" TRGLANGS=ukr LANGPAIRSTR="gmq-ukr" train-tiny11-student + ${MAKE} MARIAN_EXTRA=--no-restore-corpus CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS="dan isl nno nob nor swe" TRGLANGS=ukr LANGPAIRSTR="gmq-ukr" train-tiny11-student @@ -156,10 +156,10 @@ elg-ukr2lit-tiny11: elg-deu2ukr-tiny11: - ${MAKE} MARIAN_EXTRA=--no-restore-corpus STUDENT_DATA=pft-pbt-bt SRCLANGS=deu TRGLANGS=ukr train-tiny11-student + ${MAKE} MARIAN_EXTRA=--no-restore-corpus CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS=deu TRGLANGS=ukr train-tiny11-student elg-ukr2deu-tiny11: - ${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=deu train-tiny11-student + ${MAKE} MARIAN_EXTRA=--no-restore-corpus STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 TRGLANGS=deu train-tiny11-student From 22eb187496cc0d85a095b256052ff0cfe3f5e8bc Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Fri, 18 Mar 2022 17:58:27 +0200 Subject: [PATCH 2/2] fix with selecting best model for forward translation --- lib/data.mk | 3 +-- tatoeba/forward-translate/Makefile | 41 ++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/lib/data.mk b/lib/data.mk index 50eb5b51..9d5fc265 100644 --- a/lib/data.mk +++ b/lib/data.mk @@ -504,8 +504,7 @@ ifeq (${CLEAN_CORPUS_TRAINING_DATA},1) ${MIN_NR_TOKENS} ${MAX_NR_TOKENS} @mv -f ${LOCAL_TRAIN_SRC}.clean.${SRCEXT} ${LOCAL_TRAIN_SRC} @mv -f ${LOCAL_TRAIN_SRC}.clean.${TRGEXT} ${LOCAL_TRAIN_TRG} - @rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.${SRCEXT} - @rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.${TRGEXT} + @rm -f ${LOCAL_TRAIN_SRC}.${SRCEXT} ${LOCAL_TRAIN_SRC}.${TRGEXT} endif ifeq (${SHUFFLE_TRAINING_DATA},1) @echo ".... shuffle complete training data" diff --git a/tatoeba/forward-translate/Makefile b/tatoeba/forward-translate/Makefile index 44fc4f34..5510909f 100644 --- a/tatoeba/forward-translate/Makefile +++ b/tatoeba/forward-translate/Makefile @@ -68,11 +68,32 @@ SORTED_LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}} PWD := $(shell pwd) +## new style of finding the best model for a given language pair: +## use the leaderboard in this repository! +## - use all benchmarks +## - take the one that is mentioned the most +## - count results from Tatoeba only once +## Problem: only the first tatoeba benchmark will be used +## (but this will be the oldest one) + +LEADERBOARD_HOME = ../../scores + +best-opusmt-model = ${shell grep -H . ${LEADERBOARD_HOME}/${1}/*/${2}.txt | \ + sed 's/txt:[0-9\.]*//' | sed -r 's/tatoeba-test-v[0-9]{4}-[0-9]{2}-[0-9]{2}/tatoeba-test/' | \ + rev | uniq -f1 | rev | cut -f2 | uniq -c | sort -nr | head -1 | sed 's/^.*http/http/'} +MODELZIP := ${call best-opusmt-model,${LANGPAIR},bleu-scores} +MODELINFO := ${MODELZIP:.zip=.yml} +MODELNAME := ${patsubst %.zip,%,${notdir ${MODELZIP}}} + + +## old style: query the list of released models +## problems with that ; the first one on the list might not be the best one + +# MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4} +# MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}} +# MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} + -# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar -MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4} -MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}} -MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l} ifneq (${MULTI_TARGET_MODEL},0) @@ -153,10 +174,14 @@ print-modelname: ## --> for scoring translations ##------------------------------------------- -REV_LANGPAIR = ${TRG}-${SRC} -REV_MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4} -REV_MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${REV_MODELZIP}} -REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}} +REV_LANGPAIR = ${TRG}-${SRC} +REV_MODELZIP := ${call best-opusmt-model,${REV_LANGPAIR},bleu-scores} +REV_MODELINFO := ${REV_MODELZIP:.zip=.yml} +REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}} + +# REV_MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4} +# REV_MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${REV_MODELZIP}} +# REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}} REV_MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${REV_MODELINFO} | grep 'use-target-labels' | wc -l} ifeq (${REV_MULTI_TARGET_MODEL},1)