Merge remote-tracking branch 'origin/elg-mahti' into puhti

Joerg Tiedemann 2022-03-18 18:01:26 +02:00
commit 1d802e13f9
3 changed files with 39 additions and 15 deletions

View File

@@ -502,10 +502,9 @@ ifeq (${CLEAN_CORPUS_TRAINING_DATA},1)
${LOCAL_TRAIN_SRC} $(SRCEXT) $(TRGEXT) \
${LOCAL_TRAIN_SRC}.clean \
${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
-	@mv -f ${LOCAL_TRAIN_SRC}.clean,${SRCEXT} ${LOCAL_TRAIN_SRC}
-	@mv -f ${LOCAL_TRAIN_SRC}.clean,${TRGEXT} ${LOCAL_TRAIN_TRG}
-	@rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.${SRCEXT}
-	@rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_SRC}.${TRGEXT}
+	@mv -f ${LOCAL_TRAIN_SRC}.clean.${SRCEXT} ${LOCAL_TRAIN_SRC}
+	@mv -f ${LOCAL_TRAIN_SRC}.clean.${TRGEXT} ${LOCAL_TRAIN_TRG}
+	@rm -f ${LOCAL_TRAIN_SRC}.${SRCEXT} ${LOCAL_TRAIN_SRC}.${TRGEXT}
endif
ifeq (${SHUFFLE_TRAINING_DATA},1)
@echo ".... shuffle complete training data"

View File

@@ -125,7 +125,7 @@ elg-ukr2fin-tiny11:
elg-gmq2ukr-tiny11:
-	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS="dan isl nno nob nor swe" TRGLANGS=ukr LANGPAIRSTR="gmq-ukr" train-tiny11-student
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS="dan isl nno nob nor swe" TRGLANGS=ukr LANGPAIRSTR="gmq-ukr" train-tiny11-student
@@ -166,10 +166,10 @@ elg-ukr2lit-tiny11:
elg-deu2ukr-tiny11:
-	${MAKE} MARIAN_EXTRA=--no-restore-corpus STUDENT_DATA=pft-pbt-bt SRCLANGS=deu TRGLANGS=ukr train-tiny11-student
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt SRCLANGS=deu TRGLANGS=ukr train-tiny11-student
elg-ukr2deu-tiny11:
-	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=deu train-tiny11-student
+	${MAKE} MARIAN_EXTRA=--no-restore-corpus STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 TRGLANGS=deu train-tiny11-student
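
As a usage note (a sketch, not part of the commit): the updated targets only pass two extra make variables through to the generic student-training goal, so each one is equivalent to a manual call such as:

# hypothetical manual invocation matching the updated elg-deu2ukr-tiny11 target;
# the two =1 flags switch on the train-data size check and the corpus-cleaning
# step patched in the first file of this commit
make MARIAN_EXTRA=--no-restore-corpus CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 \
     STUDENT_DATA=pft-pbt-bt SRCLANGS=deu TRGLANGS=ukr train-tiny11-student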

View File

@@ -68,11 +68,32 @@ SORTED_LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
PWD := $(shell pwd)
+## new style of finding the best model for a given language pair:
+## use the leaderboard in this repository!
+## - use all benchmarks
+## - take the one that is mentioned the most
+## - count results from Tatoeba only once
+## Problem: only the first tatoeba benchmark will be used
+## (but this will be the oldest one)
+LEADERBOARD_HOME = ../../scores
+best-opusmt-model = ${shell grep -H . ${LEADERBOARD_HOME}/${1}/*/${2}.txt | \
+	sed 's/txt:[0-9\.]*//' | sed -r 's/tatoeba-test-v[0-9]{4}-[0-9]{2}-[0-9]{2}/tatoeba-test/' | \
+	rev | uniq -f1 | rev | cut -f2 | uniq -c | sort -nr | head -1 | sed 's/^.*http/http/'}
+MODELZIP := ${call best-opusmt-model,${LANGPAIR},bleu-scores}
+MODELINFO := ${MODELZIP:.zip=.yml}
+MODELNAME := ${patsubst %.zip,%,${notdir ${MODELZIP}}}
+## old style: query the list of released models
+## problems with that; the first one on the list might not be the best one
+# MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4}
+# MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
+# MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
+# MODELZIP = https://object.pouta.csc.fi/Tatoeba-Challenge/ang.tar
-MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${LANGPAIR}' | head -1 | cut -f4}
-MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${MODELZIP}}
-MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
ifneq (${MULTI_TARGET_MODEL},0)
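
The best-opusmt-model macro added above implements the comments it follows: roughly, it scans every benchmark under ${LEADERBOARD_HOME}/<langpair>/, keeps only the first entry for each benchmark path (with all tatoeba-test-v* names normalized to one, so Tatoeba is counted only once), and returns the model URL mentioned by the most benchmarks. A self-contained sketch with a hypothetical toy leaderboard (made-up paths, scores and URLs, not from the repository) shows the pipeline end to end:

# build a toy leaderboard: one "score<TAB>model-url" line per benchmark
LEADERBOARD_HOME=$(mktemp -d); LANGPAIR=eng-ukr
for b in flores101-devtest newstest2019 tatoeba-test-v2021-08-07; do
  mkdir -p $LEADERBOARD_HOME/$LANGPAIR/$b
done
printf '41.2\thttps://example.org/opus+bt-2021-12-08.zip\n' > $LEADERBOARD_HOME/$LANGPAIR/flores101-devtest/bleu-scores.txt
printf '38.9\thttps://example.org/opus+bt-2021-12-08.zip\n' > $LEADERBOARD_HOME/$LANGPAIR/newstest2019/bleu-scores.txt
printf '45.0\thttps://example.org/opus-2020-06-17.zip\n'    > $LEADERBOARD_HOME/$LANGPAIR/tatoeba-test-v2021-08-07/bleu-scores.txt

# the pipeline from best-opusmt-model, expanded for ${1}=eng-ukr, ${2}=bleu-scores
grep -H . $LEADERBOARD_HOME/$LANGPAIR/*/bleu-scores.txt | \
  sed 's/txt:[0-9\.]*//' | \
  sed -r 's/tatoeba-test-v[0-9]{4}-[0-9]{2}-[0-9]{2}/tatoeba-test/' | \
  rev | uniq -f1 | rev | cut -f2 | uniq -c | sort -nr | head -1 | \
  sed 's/^.*http/http/'
# prints https://example.org/opus+bt-2021-12-08.zip (mentioned by 2 benchmarks vs. 1)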
@@ -153,10 +174,14 @@ print-modelname:
## --> for scoring translations
##-------------------------------------------
-REV_LANGPAIR = ${TRG}-${SRC}
-REV_MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4}
-REV_MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${REV_MODELZIP}}
-REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}}
+REV_LANGPAIR = ${TRG}-${SRC}
+REV_MODELZIP := ${call best-opusmt-model,${REV_LANGPAIR},bleu-scores}
+REV_MODELINFO := ${REV_MODELZIP:.zip=.yml}
+REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}}
+# REV_MODELZIP := ${shell ${WGET} -qq -O - ${TATOEBA_RELEASED} | grep '^${REV_LANGPAIR}' | head -1 | cut -f4}
+# REV_MODELINFO = ${patsubst ${TATOEBA_MODEL_STORAGE}/%.zip,${TATOEBA_GITRAW}/models/%.yml,${REV_MODELZIP}}
+# REV_MODELNAME = ${patsubst %.zip,%,${notdir ${REV_MODELZIP}}}
REV_MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${REV_MODELINFO} | grep 'use-target-labels' | wc -l}
ifeq (${REV_MULTI_TARGET_MODEL},1)
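
For completeness, a small sketch (hypothetical model-info URL and file names, not part of the Makefile) of what the use-target-labels test above is for: released multilingual OPUS-MT models whose yml file lists use-target-labels expect every source sentence to carry a target-language token such as >>ukr<<, so the calling recipes have to prefix the input before translating or scoring:

# MODELINFO / REV_MODELINFO point at the released model's yml; the URL and
# the input/output file names below are hypothetical placeholders
MODELINFO=https://example.org/models/ukr-gmq/opus-2022-03-14.yml
if [ "$(wget -qq -O - $MODELINFO | grep 'use-target-labels' | wc -l)" -gt 0 ]; then
  # multi-target model: prepend the target-language label to every input line
  sed 's/^/>>ukr<< /' < input.src > input.labelled.src
else
  cp input.src input.labelled.src
fi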