elg models

This commit is contained in:
Joerg Tiedemann 2022-02-15 20:34:24 +02:00
parent a7edf81254
commit d6461550be
6 changed files with 246 additions and 65 deletions

View File

@ -285,7 +285,7 @@ OPUSLANGS := ${call get-opus-langs}
## - DEVSET is the first of the potential devset that exists with sufficient size
## TODO: what do we do if there is no devset?
POTENTIAL_DEVSETS = Tatoeba GlobalVoices infopankki JW300 bible-uedin
POTENTIAL_DEVSETS = Tatoeba GlobalVoices infopankki wikimedia JW300 bible-uedin
BIGGER_BITEXTS := ${call get-bigger-bitexts,${SRC},${TRG},${DEVSMALLSIZE}}
SMALLER_BITEXTS := ${call get-bigger-bitexts,${SRC},${TRG},${DEVMINSIZE}}
DEVSET ?= ${firstword ${filter ${POTENTIAL_DEVSETS},${BIGGER_BITEXTS}} \

3
lib/env/puhti.mk vendored
View File

@ -8,7 +8,8 @@ DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=5
DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=4 CPUJOB_HPC_JOBS=2 CPUJOB_HPC_MEM=64g CPUJOB_HPC_DISK=500
CSCPROJECT = project_2002688
# CSCPROJECT = project_2002688
CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}
GPU = v100
HPC_QUEUE = small

View File

@ -29,17 +29,76 @@
ELG_EU_LANGIDS = eng deu swe fin nld dan spa ces fra pol por lav ron est bul ell slk ita mlt slv hrv lit gle hun
ELG_EU_SELECTED = gmq nld spa fra pol por lav ron est bul ell ita mlt slv hbs lit cel hun glg eus zle zls zlw tur ara heb sqi deu fin
ELG_EU_SELECTED_MULTILANG = "ces slk" "cat oci" "fry ltz nds afr"
ELG_EU_SELECTED = gmq nld pol por lav ron est bul ell ita mlt slv hbs lit cel hun glg eus zle zls zlw tur ara heb sqi fin
ELG_EU_SELECTED_MULTILANG = "ces slk" "cat oci spa" "por glg"
ELG_EU_SELECTED_BIG = spa fra deu
# "fry ltz nds afr"
# "cat oci"
elg-eng2all:
for l in ${ELG_EU_SELECTED_MULTILANG}; do \
${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="$$l" tatoeba-job-bt; \
${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="$$l" \
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
tatoeba-job-bt; \
done
for l in ${ELG_EU_SELECTED}; do \
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-trainjob-bt; \
done
for l in ${ELG_EU_SELECTED_BIG}; do \
${MAKE} MODELTYPE=transformer-big \
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
tatoeba-eng2$${l}-trainjob-bt; \
done
## X -> English direction, transformer-big, in three passes (each pass is a sub-make per item):
##   1. every quoted language group in ELG_EU_SELECTED_MULTILANG is used as SRCLANGS
##      for the tatoeba-job-bt goal (with a 1000 disk setting for data preparation)
##   2. every entry of ELG_EU_SELECTED except hun and mlt gets its
##      tatoeba-<lang>2eng-trainjob-bt goal
##   3. every entry of ELG_EU_SELECTED_BIG gets the same goal, again with the
##      enlarged DATA_PREPARE_HPCPARAMS disk setting
elg-all2eng:
	for srcgroup in $(ELG_EU_SELECTED_MULTILANG); do \
	  $(MAKE) SRCLANGS="$$srcgroup" TRGLANGS=eng MODELTYPE=transformer-big \
	    DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
	    tatoeba-job-bt; \
	done
	for src in $(filter-out hun mlt,$(ELG_EU_SELECTED)); do \
	  $(MAKE) MODELTYPE=transformer-big tatoeba-$${src}2eng-trainjob-bt; \
	done
	for src in $(ELG_EU_SELECTED_BIG); do \
	  $(MAKE) MODELTYPE=transformer-big \
	    DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
	    tatoeba-$${src}2eng-trainjob-bt; \
	done
## Run the tatoeba-job-bt goal (transformer-big) for the source-language group
## "cat oci spa" with eng as target, using the enlarged data-preparation disk setting.
## NOTE(review): the target is named "all2spa" but the recipe passes TRGLANGS=eng
## and lists spa among the SRCLANGS -- confirm that this direction is intended.
elg-all2spa:
	$(MAKE) SRCLANGS="cat oci spa" TRGLANGS=eng MODELTYPE=transformer-big \
	  DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
	  tatoeba-job-bt
## For every quoted language group in ELG_EU_SELECTED_MULTILANG (used as TRGLANGS,
## with eng as source) request three .submit goals with WALLTIME=1:
## the sublanguage evaluation, the plain evaluation, and the test-set evaluation
## (the last one additionally sets GPUJOB_HPC_MEM=20g).
elg-eng2all-eval1:
	for trggroup in $(ELG_EU_SELECTED_MULTILANG); do \
	  $(MAKE) SRCLANGS=eng TRGLANGS="$$trggroup" MODELTYPE=transformer-big WALLTIME=1 \
	    tatoeba-sublang-eval-bt.submit; \
	  $(MAKE) SRCLANGS=eng TRGLANGS="$$trggroup" MODELTYPE=transformer-big WALLTIME=1 \
	    tatoeba-eval-bt.submit; \
	  $(MAKE) SRCLANGS=eng TRGLANGS="$$trggroup" MODELTYPE=transformer-big WALLTIME=1 \
	    GPUJOB_HPC_MEM=20g \
	    tatoeba-eval-testsets-bt.submit; \
	done
## For every language in ELG_EU_SELECTED, request the
## tatoeba-eng2<lang>-evalall-bt.submit goal, but only if at least one
## *.npz file exists in work/eng-<lang>/.
##
## The existence check has to be done by the shell: the loop variable $$l only
## exists at recipe run time, so a make-level ${wildcard ...} call cannot see it
## (it would glob the literal string "work/eng-$l/*.npz", return nothing, and
## leave "[ -e ]", which is always true).
elg-eng2all-eval2:
	for l in ${ELG_EU_SELECTED}; do \
	  if ls work/eng-$$l/*.npz >/dev/null 2>&1; then \
	    ${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-evalall-bt.submit; \
	  fi \
	done
## Request three individual evaluation .submit goals (transformer-big, WALLTIME=2):
##   - sublanguage evaluation for eng -> "fry ltz nds afr"
##   - tatoeba-eng2cel-multieval-bt
##   - tatoeba-eng2lit-eval-testsets-bt, with GPUJOB_HPC_MEM raised to 32g
elg-eng2xxx-eval:
	$(MAKE) SRCLANGS=eng TRGLANGS="fry ltz nds afr" MODELTYPE=transformer-big WALLTIME=2 \
	  tatoeba-sublang-eval-bt.submit
	$(MAKE) MODELTYPE=transformer-big WALLTIME=2 \
	  tatoeba-eng2cel-multieval-bt.submit
	$(MAKE) MODELTYPE=transformer-big WALLTIME=2 GPUJOB_HPC_MEM=32g \
	  tatoeba-eng2lit-eval-testsets-bt.submit
elg-eng2missing:
for l in est lav ron hbs sqi spa fra ita por zlw ara heb deu fin; do \
@ -51,7 +110,33 @@ elg-eng2slv:
elg-missing:
${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="ces slk" tatoeba-job-bt
${MAKE} MODELTYPE=transformer-big SRCLANGS=zlw TRGLANGS=gmq tatoeba-job-bt
${MAKE} MODELTYPE=transformer-big SRCLANGS=zlw TRGLANGS=gmq tatoeba-zlw2gmq-trainjob-bt-pivotlang
${MAKE} MODELTYPE=transformer-big SRCLANGS=zlw TRGLANGS=gmq \
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
tatoeba-zlw2gmq-trainjob-bt-pivotlang
## Two sub-makes with MODELTYPE=transformer-big:
##   - tatoeba-job-bt for eng -> "ces slk"
##   - tatoeba-zlw2gmq-trainjob-bt for zlw -> gmq, with the enlarged
##     data-preparation disk setting
elg-missing2:
	$(MAKE) SRCLANGS=eng TRGLANGS="ces slk" MODELTYPE=transformer-big tatoeba-job-bt
	$(MAKE) SRCLANGS=zlw TRGLANGS=gmq MODELTYPE=transformer-big \
	  DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
	  tatoeba-zlw2gmq-trainjob-bt
## Run tatoeba-job-bt for eng -> fra (transformer-big), passing
## --no-restore-corpus through MARIAN_EXTRA and using the enlarged
## data-preparation disk setting.
elg-eng2fra:
	$(MAKE) SRCLANGS=eng TRGLANGS=fra MODELTYPE=transformer-big \
	  DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
	  MARIAN_EXTRA=--no-restore-corpus \
	  tatoeba-job-bt
## Run tatoeba-job-bt for eng -> zls (transformer-big), passing
## --no-restore-corpus through MARIAN_EXTRA.
elg-eng2zls:
	$(MAKE) SRCLANGS=eng TRGLANGS=zls MODELTYPE=transformer-big \
	  MARIAN_EXTRA=--no-restore-corpus tatoeba-job-bt
## Run tatoeba-job-bt for eng -> heb (transformer-big), passing
## --no-restore-corpus through MARIAN_EXTRA.
elg-eng2heb:
	$(MAKE) SRCLANGS=eng TRGLANGS=heb MODELTYPE=transformer-big \
	  MARIAN_EXTRA=--no-restore-corpus \
	  tatoeba-job-bt
## Run tatoeba-job-bt for eng -> spa (transformer-big), passing
## --no-restore-corpus through MARIAN_EXTRA and using the enlarged
## data-preparation disk setting.
elg-eng2spa:
	$(MAKE) SRCLANGS=eng TRGLANGS=spa MODELTYPE=transformer-big \
	  DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
	  MARIAN_EXTRA=--no-restore-corpus \
	  tatoeba-job-bt

View File

@ -38,39 +38,40 @@ SLURM_JOBNAME ?= $(subst -,,${LANGPAIRSTR})
%.submit:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submit=}"' >>$@
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submit=}.err.%j' >> $@
mkdir -p ${dir ${TMPWORKDIR}/$@}
echo '#!/bin/bash -l' > ${TMPWORKDIR}/$@
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submit=}"' >>${TMPWORKDIR}/$@
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submit=}.out.%j' >> ${TMPWORKDIR}/$@
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submit=}.err.%j' >> ${TMPWORKDIR}/$@
ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@
echo '#SBATCH --mail-user=${EMAIL}' >> $@
echo '#SBATCH --mail-type=END' >> ${TMPWORKDIR}/$@
echo '#SBATCH --mail-user=${EMAIL}' >> ${TMPWORKDIR}/$@
endif
echo '#SBATCH --mem=${GPUJOB_HPC_MEM}' >> $@
echo '#SBATCH -n ${GPUJOB_HPC_CORES}' >> $@
echo '#SBATCH -N ${GPUJOB_HPC_NODES}' >> $@
echo '#SBATCH -t ${GPUJOB_HPC_TIME}:00' >> $@
echo '#SBATCH -p ${GPUJOB_HPC_QUEUE}' >> $@
echo '#SBATCH ${HPC_GPU_ALLOCATION}' >> $@
echo '#SBATCH --mem=${GPUJOB_HPC_MEM}' >> ${TMPWORKDIR}/$@
echo '#SBATCH -n ${GPUJOB_HPC_CORES}' >> ${TMPWORKDIR}/$@
echo '#SBATCH -N ${GPUJOB_HPC_NODES}' >> ${TMPWORKDIR}/$@
echo '#SBATCH -t ${GPUJOB_HPC_TIME}:00' >> ${TMPWORKDIR}/$@
echo '#SBATCH -p ${GPUJOB_HPC_QUEUE}' >> ${TMPWORKDIR}/$@
echo '#SBATCH ${HPC_GPU_ALLOCATION}' >> ${TMPWORKDIR}/$@
ifdef BROKEN_NODES
echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
echo '#SBATCH --exclude=${BROKEN_NODES}' >> ${TMPWORKDIR}/$@
endif
echo '${HPC_EXTRA}' >> $@
echo '${HPC_EXTRA1}' >> $@
echo '${HPC_EXTRA2}' >> $@
echo '${HPC_EXTRA3}' >> $@
echo '${HPC_GPU_EXTRA1}' >> $@
echo '${HPC_GPU_EXTRA2}' >> $@
echo '${HPC_GPU_EXTRA3}' >> $@
echo '${LOAD_GPU_ENV}' >> $@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
echo 'pwd' >> $@
echo 'echo "Starting at `date`"' >> $@
echo 'srun ${MAKE} -j ${GPUJOB_HPC_JOBS} ${MAKEARGS} ${@:.submit=}' >> $@
echo 'echo "Finishing at `date`"' >> $@
sbatch ${SBATCH_ARGS} $@
echo '${HPC_EXTRA}' >> ${TMPWORKDIR}/$@
echo '${HPC_EXTRA1}' >> ${TMPWORKDIR}/$@
echo '${HPC_EXTRA2}' >> ${TMPWORKDIR}/$@
echo '${HPC_EXTRA3}' >> ${TMPWORKDIR}/$@
echo '${HPC_GPU_EXTRA1}' >> ${TMPWORKDIR}/$@
echo '${HPC_GPU_EXTRA2}' >> ${TMPWORKDIR}/$@
echo '${HPC_GPU_EXTRA3}' >> ${TMPWORKDIR}/$@
echo '${LOAD_GPU_ENV}' >> ${TMPWORKDIR}/$@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> ${TMPWORKDIR}/$@
echo 'pwd' >> ${TMPWORKDIR}/$@
echo 'echo "Starting at `date`"' >> ${TMPWORKDIR}/$@
echo 'srun ${MAKE} -j ${GPUJOB_HPC_JOBS} ${MAKEARGS} ${@:.submit=}' >> ${TMPWORKDIR}/$@
echo 'echo "Finishing at `date`"' >> ${TMPWORKDIR}/$@
sbatch ${SBATCH_ARGS} ${TMPWORKDIR}/$@
mkdir -p ${WORKDIR}
mv $@ ${WORKDIR}/$@
mv ${TMPWORKDIR}/$@ ${WORKDIR}/$@
# echo 'srun ${MAKE} NR=${NR} MODELTYPE=${MODELTYPE} DATASET=${DATASET} SRC=${SRC} TRG=${TRG} PRE_SRC=${PRE_SRC} PRE_TRG=${PRE_TRG} ${MAKEARGS} ${@:.submit=}' >> $@
@ -88,38 +89,39 @@ CPUJOB_HPC_JOBS ?= ${CPUJOB_HPC_THREADS}
%.submitcpu:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submitcpu=}"' >>$@
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submitcpu=}.out.%j' >> $@
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submitcpu=}.err.%j' >> $@
mkdir -p ${dir ${TMPWORKDIR}/$@}
echo '#!/bin/bash -l' > ${TMPWORKDIR}/$@
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submitcpu=}"' >>${TMPWORKDIR}/$@
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submitcpu=}.out.%j' >> ${TMPWORKDIR}/$@
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submitcpu=}.err.%j' >> ${TMPWORKDIR}/$@
ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@
echo '#SBATCH --mail-user=${EMAIL}' >> $@
echo '#SBATCH --mail-type=END' >> ${TMPWORKDIR}/$@
echo '#SBATCH --mail-user=${EMAIL}' >> ${TMPWORKDIR}/$@
endif
echo '#SBATCH --mem=${CPUJOB_HPC_MEM}' >> $@
echo '#SBATCH -n ${CPUJOB_HPC_CORES}' >> $@
echo '#SBATCH -N ${CPUJOB_HPC_NODES}' >> $@
echo '#SBATCH -p ${CPUJOB_HPC_QUEUE}' >> $@
echo '#SBATCH -t ${CPUJOB_HPC_TIME}:00' >> $@
echo '#SBATCH --mem=${CPUJOB_HPC_MEM}' >> ${TMPWORKDIR}/$@
echo '#SBATCH -n ${CPUJOB_HPC_CORES}' >> ${TMPWORKDIR}/$@
echo '#SBATCH -N ${CPUJOB_HPC_NODES}' >> ${TMPWORKDIR}/$@
echo '#SBATCH -p ${CPUJOB_HPC_QUEUE}' >> ${TMPWORKDIR}/$@
echo '#SBATCH -t ${CPUJOB_HPC_TIME}:00' >> ${TMPWORKDIR}/$@
ifdef BROKEN_NODES
echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
echo '#SBATCH --exclude=${BROKEN_NODES}' >> ${TMPWORKDIR}/$@
endif
echo '${HPC_EXTRA}' >> $@
echo '${HPC_EXTRA1}' >> $@
echo '${HPC_EXTRA2}' >> $@
echo '${HPC_EXTRA3}' >> $@
echo '${HPC_CPU_EXTRA1}' >> $@
echo '${HPC_CPU_EXTRA2}' >> $@
echo '${HPC_CPU_EXTRA3}' >> $@
echo '${LOAD_CPU_ENV}' >> $@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
echo 'pwd' >> $@
echo 'echo "Starting at `date`"' >> $@
echo '${MAKE} -j ${CPUJOB_HPC_JOBS} ${MAKEARGS} ${@:.submitcpu=}' >> $@
echo 'echo "Finishing at `date`"' >> $@
sbatch ${SBATCH_ARGS} $@
echo '${HPC_EXTRA}' >> ${TMPWORKDIR}/$@
echo '${HPC_EXTRA1}' >> ${TMPWORKDIR}/$@
echo '${HPC_EXTRA2}' >> ${TMPWORKDIR}/$@
echo '${HPC_EXTRA3}' >> ${TMPWORKDIR}/$@
echo '${HPC_CPU_EXTRA1}' >> ${TMPWORKDIR}/$@
echo '${HPC_CPU_EXTRA2}' >> ${TMPWORKDIR}/$@
echo '${HPC_CPU_EXTRA3}' >> ${TMPWORKDIR}/$@
echo '${LOAD_CPU_ENV}' >> ${TMPWORKDIR}/$@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> ${TMPWORKDIR}/$@
echo 'pwd' >> ${TMPWORKDIR}/$@
echo 'echo "Starting at `date`"' >> ${TMPWORKDIR}/$@
echo '${MAKE} -j ${CPUJOB_HPC_JOBS} ${MAKEARGS} ${@:.submitcpu=}' >> ${TMPWORKDIR}/$@
echo 'echo "Finishing at `date`"' >> ${TMPWORKDIR}/$@
sbatch ${SBATCH_ARGS} ${TMPWORKDIR}/$@
mkdir -p ${WORKDIR}
mv $@ ${WORKDIR}/$@
mv ${TMPWORKDIR}/$@ ${WORKDIR}/$@
# echo '${MAKE} -j ${HPC_CORES} DATASET=${DATASET} SRC=${SRC} TRG=${TRG} PRE_SRC=${PRE_SRC} PRE_TRG=${PRE_TRG} ${MAKEARGS} ${@:.submitcpu=}' >> $@

View File

@ -100,3 +100,42 @@ endif
-e 's/&/&/g' |\
sed 'n;n;G;' > $@
rm -f $@.1 $@.2 $@.3
# print-bleu-scores:
# grep BLEU ${WORKHOME}/*/*.eval |\
# sed 's#^${WORKHOME}/##' |\
# sed 's/\.\([^\.]*\)\.\([^\.]*\)\.\([^\.]*\)\.eval[^ ]* = \([0-9\.]*\).*$$/ \1 \2-\3 \4/' |\
# sed 's#^\([^/]*\)/\([^\.]*\)\.[^ ]* #\1 \2 #'
## Print a tab-separated table built from all lines containing "BLEU" in
## ${WORKHOME}/*/*.eval.
##
## The first perl substitution parses the "path:line" output of grep:
##   $1 = sub-directory of WORKHOME, $2 = file name up to the first dot,
##   $3 = optional dot-terminated chunk containing a hyphen (discarded),
##   $4 = the next three dot-separated name parts,
##   $5/$6 = the two name parts right before ".eval"
##           (presumably source and target language -- verify against the eval file naming),
##   $7 = the number following " = " on the score line.
## Output columns: "$5-$6", score, $2, $1, $4.
## The second perl step replaces column 1 by column 4 (the directory name)
## whenever column 1 contains "multi".
## Rows are finally sorted by column 3, then column 1, then score (numeric, descending).
print-bleu-scores:
grep BLEU ${WORKHOME}/*/*.eval |\
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
sort -k3,3 -k1,1 -k2,2nr
## Same table as print-bleu-scores, rendered in aligned columns.
##
## Pipeline:
##   1. grep all "BLEU" lines from ${WORKHOME}/*/*.eval
##   2. rewrite each "path:line" into five tab-separated columns:
##      "<part5>-<part6>" of the file name, score, file-name prefix,
##      WORKHOME sub-directory, three-part name chunk
##   3. replace column 1 by column 4 whenever column 1 contains "multi"
##   4. sort by column 3, then column 1, then score (numeric, descending)
##   5. print with fixed widths; the last column keeps the line's own newline,
##      so the format string needs no explicit "\n"
## The width spec of the 4th column is "%-15s" (a stray extra "s" used to print
## a literal "s" after every directory name).
pretty-print-bleu-scores:
	grep BLEU ${WORKHOME}/*/*.eval |\
	perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
	perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
	sort -k3,3 -k1,1 -k2,2nr |\
	perl -e 'while (<>){@a=split(/\t/);printf "%15s %5.2f %-25s %-15s %s",@a;}'
## Variant of the BLEU table with six tab-separated columns, for eval file names
## that carry one more dot-separated part.
##
## Captures of the first perl substitution:
##   $1 = sub-directory of WORKHOME, $2 = file name up to the first dot,
##   $3 = optional dot-terminated chunk containing a hyphen (discarded),
##   $4 = next name part, followed by one uncaptured part, $5 = the part after that,
##   $6/$7 = the two name parts right before ".eval"
##           (presumably source and target language -- verify against the eval file naming),
##   $8 = the number following " = " on the score line.
## Output columns: "$6-$7", score, $2, $1, $4, $5.
## Column 1 is replaced by column 4 when it contains "multi"; rows are sorted by
## column 3, then column 1, then score (numeric, descending).
print-bleu-scores2:
grep BLEU ${WORKHOME}/*/*.eval |\
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+)\.[^\.]+\.([^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$6-$$7\t$$8\t$$2\t$$1\t$$4\t$$5#' |\
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
sort -k3,3 -k1,1 -k2,2nr
## Aligned-column rendering of the six-column BLEU table:
## grep the "BLEU" lines from ${WORKHOME}/*/*.eval, rewrite each into
## "<part6>-<part7>", score, file-name prefix, WORKHOME sub-directory and two
## further name parts (tab-separated), replace column 1 by column 4 when it
## contains "multi", sort by column 3 / column 1 / score (numeric, descending),
## and print with fixed widths. The last column still carries the input line's
## newline, which is why the printf format has no "\n".
pretty-print-bleu-scores2:
grep BLEU ${WORKHOME}/*/*.eval |\
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+)\.[^\.]+\.([^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$6-$$7\t$$8\t$$2\t$$1\t$$4\t$$5#' |\
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
sort -k3,3 -k1,1 -k2,2nr |\
perl -e 'while (<>){@a=split(/\t/);printf "%15s %5.2f %-25s %-15s %-25s %s",@a;}'

View File

@ -18,7 +18,7 @@ include ${REPOHOME}lib/slurm.mk
MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
MODEL_DISTS := ${shell wget -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$'}
MODEL_DISTS := ${shell wget -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | grep -v '.eval.zip$$'}
MODEL_DIST = ${firstword ${MODEL_DISTS}}
MODEL = ${MODEL_DIST:.zip=}
MODEL_LANGPAIR = ${firstword ${subst /, ,${MODEL_DIST}}}
@ -34,11 +34,14 @@ WORK_DIR = ${WORK_HOME}/${MODEL}
## model directory (for test results)
## model score file and zipfile with evaluation results
MODEL_HOME = ${REPOHOME}models-tatoeba
# MODEL_HOME = ${REPOHOME}models-tatoeba
MODEL_HOME = ${REPOHOME}tatoeba/models
MODEL_DIR = ${MODEL_HOME}/${MODEL}
MODEL_SCORES = ${MODEL_DIR}.scores.txt
MODEL_EVALZIP = ${MODEL_DIR}.eval.zip
LEADERBOARD_DIR = ${REPOHOME}scores
## all zip files with benchmark results
MODEL_EVALZIPS := ${patsubst %.zip,${MODEL_HOME}/%.eval.zip,${MODEL_DISTS}}
@ -63,8 +66,13 @@ ${MODEL_EVALZIPS}: ${TESTSET_INDEX}
fi
${MAKE} MODEL_DIST=${patsubst ${MODEL_HOME}/%.eval.zip,%.zip,$@} eval-model
## evaluate the model with all benchmarks
## register the scores and update the leaderboard
## final cleanup
.PHONY: eval-model
eval-model: ${MODEL_SCORES}
${MAKE} register-scores
${MAKE} sort-leaderboards
if [ -d ${MODEL_DIR} ]; then \
cd ${MODEL_DIR} && zip ${MODEL_EVALZIP} *.eval *.compare; \
rm -f ${MODEL_DIR}/*.eval; \
@ -278,3 +286,49 @@ ${MODEL_SCORES}: ${TESTSET_INDEX}
$@.urls $@.nrlines $@.nrwords; \
fi
##-------------------------------------------------------------------
## update leaderboards with scores from score files
## LEADERBOARDS = list of leader boards that need to be sorted
## SCOREFILES = all score files in the model directories
## SCOREFILES_DONE = a flag that shows that the scores are registered
##-------------------------------------------------------------------
SCOREFILES := ${wildcard ${MODEL_HOME}/*/*.scores.txt}
SCOREFILES_DONE = ${SCOREFILES:.txt=.registered}
SCOREFILE_DONE = ${MODEL_SCORES:.txt=.registered}
## update all leader boards with all scores:
## first make sure every score file listed in SCOREFILES_DONE has its
## .registered flag, then re-sort the affected leaderboards in a sub-make
## (a sub-make is used so that the list of directories with unsorted scores
## is computed after the registration step has written them)
## declared phony: this is a command, not a file to be created
.PHONY: update-leaderboards
update-leaderboards: ${SCOREFILES_DONE}
	${MAKE} sort-leaderboards
## register the scores for the current model
## (scores will be added to some temporary files sorted by language pair and benchmark)
## the actual work is done by the rule for ${SCOREFILE_DONE}, the .registered
## flag that belongs to ${MODEL_SCORES}
## declared phony: this is a command, not a file to be created
.PHONY: register-scores
register-scores: ${SCOREFILE_DONE}
## Static pattern rule: build <model>.scores.registered from <model>.scores.txt.
##
## Every line of the score file is split on tabs. For each line the perl script
##   - creates the directory ${LEADERBOARD_DIR}/<col0>/<col1>
##     (per the comment on register-scores: language pair and benchmark)
##   - appends "<col3>\t<col4>" to bleu-scores.<model>.unsorted.txt
##   - appends "<col2>\t<col4>" to chrf-scores.<model>.unsorted.txt
## where <model> is the score file path relative to MODEL_HOME with "/" replaced by ".".
## NOTE(review): col2/col3 are presumably the chrF and BLEU values and col4 a
## model identifier -- confirm against the score file format.
## The files are opened in append mode and the open calls are not checked for errors.
## Finally the .registered flag file is touched so the scores are not registered twice.
${SCOREFILES_DONE}: %.registered: %.txt
@echo "register scores from ${patsubst ${MODEL_HOME}/%,%,$<}"
@cat $< | perl -e 'while (<>){ @a=split(/\t/); system "mkdir -p ${LEADERBOARD_DIR}/$$a[0]/$$a[1]"; open B,">>${LEADERBOARD_DIR}/$$a[0]/$$a[1]/bleu-scores.$(subst /,.,${patsubst ${MODEL_HOME}/%,%,$<}).unsorted.txt"; open C,">>${LEADERBOARD_DIR}/$$a[0]/$$a[1]/chrf-scores.$(subst /,.,${patsubst ${MODEL_HOME}/%,%,$<}).unsorted.txt"; print B "$$a[3]\t$$a[4]\n"; print C "$$a[2]\t$$a[4]\n"; close B; close C; }'
touch $@
UPDATE_SCORE_DIRS := $(sort $(dir ${wildcard ${LEADERBOARD_DIR}/*/*/*.unsorted.txt}))
LEADERBOARDS_BLEU := $(patsubst %,%bleu-scores.txt,${UPDATE_SCORE_DIRS})
LEADERBOARDS_CHRF := $(patsubst %,%chrf-scores.txt,${UPDATE_SCORE_DIRS})
## sort all leaderboards for which we have new unsorted scores
## (the BLEU and chrF leaderboard files of every directory that currently
## contains *.unsorted.txt files)
## declared phony: this is a command, not a file to be created
.PHONY: sort-leaderboards
sort-leaderboards: ${LEADERBOARDS_BLEU} ${LEADERBOARDS_CHRF}
## Rebuild a BLEU leaderboard file (<dir>/bleu-scores.txt).
##
## All bleu-scores*.txt files of the directory (the existing leaderboard, if
## any, plus the per-model *.unsorted.txt files) are merged and sorted by
## score (column 1, numeric, descending). The result is written to a
## temporary $@.sorted first, because the inputs -- including the old
## leaderboard itself -- are deleted before the new file is moved into place.
##
## The sort uses the rest of the line as a second key: with "-u" and only the
## score key, sort would treat all lines with the same score as duplicates and
## keep just one of them, silently dropping other models with an equal score.
## With both keys only truly repeated entries are removed.
${LEADERBOARDS_BLEU}: ${UPDATE_SCORE_DIRS}
	@echo "sort ${patsubst ${LEADERBOARD_DIR}/%,%,$@}"
	@cat $(dir $@)bleu-scores*.txt | sort -u -k1,1nr -k2 > $@.sorted
	@rm -f $(dir $@)bleu-scores*.txt
	@mv $@.sorted $@
## Rebuild a chrF leaderboard file (<dir>/chrf-scores.txt).
##
## All chrf-scores*.txt files of the directory (the existing leaderboard, if
## any, plus the per-model *.unsorted.txt files) are merged and sorted by
## score (column 1, numeric, descending). The result is written to a
## temporary $@.sorted first, because the inputs -- including the old
## leaderboard itself -- are deleted before the new file is moved into place.
##
## The sort uses the rest of the line as a second key: with "-u" and only the
## score key, sort would treat all lines with the same score as duplicates and
## keep just one of them, silently dropping other models with an equal score.
## With both keys only truly repeated entries are removed.
${LEADERBOARDS_CHRF}: ${UPDATE_SCORE_DIRS}
	@echo "sort ${patsubst ${LEADERBOARD_DIR}/%,%,$@}"
	@cat $(dir $@)chrf-scores*.txt | sort -u -k1,1nr -k2 > $@.sorted
	@rm -f $(dir $@)chrf-scores*.txt
	@mv $@.sorted $@