mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-27 11:03:13 +03:00
elg models
This commit is contained in:
parent
a7edf81254
commit
d6461550be
@ -285,7 +285,7 @@ OPUSLANGS := ${call get-opus-langs}
|
||||
## - DEVSET is the first of the potential devsets that exists with sufficient size
|
||||
## TODO: what do we do if there is no devset?
|
||||
|
||||
POTENTIAL_DEVSETS = Tatoeba GlobalVoices infopankki JW300 bible-uedin
|
||||
POTENTIAL_DEVSETS = Tatoeba GlobalVoices infopankki wikimedia JW300 bible-uedin
|
||||
BIGGER_BITEXTS := ${call get-bigger-bitexts,${SRC},${TRG},${DEVSMALLSIZE}}
|
||||
SMALLER_BITEXTS := ${call get-bigger-bitexts,${SRC},${TRG},${DEVMINSIZE}}
|
||||
DEVSET ?= ${firstword ${filter ${POTENTIAL_DEVSETS},${BIGGER_BITEXTS}} \
|
||||
|
3
lib/env/puhti.mk
vendored
3
lib/env/puhti.mk
vendored
@ -8,7 +8,8 @@ DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=5
|
||||
DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=4 CPUJOB_HPC_JOBS=2 CPUJOB_HPC_MEM=64g CPUJOB_HPC_DISK=500
|
||||
|
||||
|
||||
CSCPROJECT = project_2002688
|
||||
# CSCPROJECT = project_2002688
|
||||
CSCPROJECT = project_2002982
|
||||
WORKHOME = ${shell realpath ${PWD}/work}
|
||||
GPU = v100
|
||||
HPC_QUEUE = small
|
||||
|
@ -29,17 +29,76 @@
|
||||
|
||||
ELG_EU_LANGIDS = eng deu swe fin nld dan spa ces fra pol por lav ron est bul ell slk ita mlt slv hrv lit gle hun
|
||||
|
||||
ELG_EU_SELECTED = gmq nld spa fra pol por lav ron est bul ell ita mlt slv hbs lit cel hun glg eus zle zls zlw tur ara heb sqi deu fin
|
||||
ELG_EU_SELECTED_MULTILANG = "ces slk" "cat oci" "fry ltz nds afr"
|
||||
ELG_EU_SELECTED = gmq nld pol por lav ron est bul ell ita mlt slv hbs lit cel hun glg eus zle zls zlw tur ara heb sqi fin
|
||||
ELG_EU_SELECTED_MULTILANG = "ces slk" "cat oci spa" "por glg"
|
||||
ELG_EU_SELECTED_BIG = spa fra deu
|
||||
|
||||
# "fry ltz nds afr"
|
||||
# "cat oci"
|
||||
|
||||
|
||||
elg-eng2all:
|
||||
for l in ${ELG_EU_SELECTED_MULTILANG}; do \
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="$$l" tatoeba-job-bt; \
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="$$l" \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
tatoeba-job-bt; \
|
||||
done
|
||||
for l in ${ELG_EU_SELECTED}; do \
|
||||
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-trainjob-bt; \
|
||||
done
|
||||
for l in ${ELG_EU_SELECTED_BIG}; do \
|
||||
${MAKE} MODELTYPE=transformer-big \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
tatoeba-eng2$${l}-trainjob-bt; \
|
||||
done
|
||||
|
||||
|
||||
## Start the X->eng jobs (MODELTYPE=transformer-big) in three passes:
##  1. one tatoeba-job-bt run per entry of ELG_EU_SELECTED_MULTILANG
##     (each entry is a quoted list of source languages), with the larger
##     DATA_PREPARE_HPCPARAMS disk setting
##  2. tatoeba-<lang>2eng-trainjob-bt for every entry of ELG_EU_SELECTED
##     except hun and mlt
##  3. tatoeba-<lang>2eng-trainjob-bt for every entry of ELG_EU_SELECTED_BIG,
##     again with the larger DATA_PREPARE_HPCPARAMS disk setting
elg-all2eng:
	for srcs in $(ELG_EU_SELECTED_MULTILANG); do \
	  $(MAKE) SRCLANGS="$$srcs" TRGLANGS=eng MODELTYPE=transformer-big \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-job-bt; \
	done
	for src in $(filter-out hun mlt,$(ELG_EU_SELECTED)); do \
	  $(MAKE) MODELTYPE=transformer-big tatoeba-$${src}2eng-trainjob-bt; \
	done
	for src in $(ELG_EU_SELECTED_BIG); do \
	  $(MAKE) MODELTYPE=transformer-big \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-$${src}2eng-trainjob-bt; \
	done
|
||||
|
||||
|
||||
## Run tatoeba-job-bt with MODELTYPE=transformer-big for
## SRCLANGS="cat oci spa" and TRGLANGS=eng, overriding
## DATA_PREPARE_HPCPARAMS (CPUJOB_HPC_DISK=1000).
## NOTE(review): the target is called "all2spa" but the target language
## passed here is eng, not spa -- confirm which direction is intended.
elg-all2spa:
|
||||
${MAKE} MODELTYPE=transformer-big TRGLANGS=eng SRCLANGS="cat oci spa" \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
tatoeba-job-bt
|
||||
|
||||
|
||||
|
||||
|
||||
## For every entry of ELG_EU_SELECTED_MULTILANG (a quoted list of target
## languages) submit three evaluation jobs for the eng->X transformer-big
## model, each with WALLTIME=1:
##   tatoeba-sublang-eval-bt, tatoeba-eval-bt and
##   tatoeba-eval-testsets-bt (the last one with GPUJOB_HPC_MEM=20g)
elg-eng2all-eval1:
	for trgs in $(ELG_EU_SELECTED_MULTILANG); do \
	  $(MAKE) SRCLANGS=eng TRGLANGS="$$trgs" MODELTYPE=transformer-big WALLTIME=1 \
		tatoeba-sublang-eval-bt.submit; \
	  $(MAKE) SRCLANGS=eng TRGLANGS="$$trgs" MODELTYPE=transformer-big WALLTIME=1 \
		tatoeba-eval-bt.submit; \
	  $(MAKE) SRCLANGS=eng TRGLANGS="$$trgs" MODELTYPE=transformer-big WALLTIME=1 \
		GPUJOB_HPC_MEM=20g \
		tatoeba-eval-testsets-bt.submit; \
	done
|
||||
|
||||
## Submit tatoeba-eng2<lang>-evalall-bt for every language in
## ELG_EU_SELECTED that already has at least one *.npz file in
## work/eng-<lang>/.
##
## The existence check must be done by the shell inside the loop:
## a make-level wildcard call is expanded once, before the recipe runs,
## when the shell loop variable has no value yet, so it can never match
## and the resulting "[ -e ]" test is always true.
## The ';' after 'fi' is required because the backslash-continued lines
## are joined into a single shell command line.
elg-eng2all-eval2:
	for l in ${ELG_EU_SELECTED}; do \
	  if ls work/eng-$$l/*.npz >/dev/null 2>&1; then \
	    ${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-evalall-bt.submit; \
	  fi; \
	done
|
||||
|
||||
|
||||
|
||||
|
||||
## Submit three individual evaluation jobs (transformer-big, WALLTIME=2):
##   - sub-language evaluation for eng -> "fry ltz nds afr"
##   - tatoeba-eng2cel-multieval-bt
##   - tatoeba-eng2lit-eval-testsets-bt (with GPUJOB_HPC_MEM=32g)
elg-eng2xxx-eval:
	$(MAKE) SRCLANGS=eng TRGLANGS="fry ltz nds afr" \
		MODELTYPE=transformer-big WALLTIME=2 \
		tatoeba-sublang-eval-bt.submit
	$(MAKE) MODELTYPE=transformer-big WALLTIME=2 \
		tatoeba-eng2cel-multieval-bt.submit
	$(MAKE) MODELTYPE=transformer-big WALLTIME=2 GPUJOB_HPC_MEM=32g \
		tatoeba-eng2lit-eval-testsets-bt.submit
|
||||
|
||||
|
||||
elg-eng2missing:
|
||||
for l in est lav ron hbs sqi spa fra ita por zlw ara heb deu fin; do \
|
||||
@ -51,7 +110,33 @@ elg-eng2slv:
|
||||
|
||||
|
||||
elg-missing:
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="ces slk" tatoeba-job-bt
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS=zlw TRGLANGS=gmq tatoeba-job-bt
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS=zlw TRGLANGS=gmq tatoeba-zlw2gmq-trainjob-bt-pivotlang
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS=zlw TRGLANGS=gmq \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
tatoeba-zlw2gmq-trainjob-bt-pivotlang
|
||||
|
||||
elg-missing2:
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="ces slk" tatoeba-job-bt
|
||||
${MAKE} MODELTYPE=transformer-big SRCLANGS=zlw TRGLANGS=gmq \
|
||||
DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
|
||||
tatoeba-zlw2gmq-trainjob-bt
|
||||
|
||||
|
||||
## Run tatoeba-job-bt for eng -> fra (transformer-big), passing
## --no-restore-corpus to Marian via MARIAN_EXTRA and using the larger
## DATA_PREPARE_HPCPARAMS disk setting.
elg-eng2fra:
	$(MAKE) SRCLANGS=eng TRGLANGS=fra \
		MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-job-bt
|
||||
|
||||
## Run tatoeba-job-bt for eng -> zls (transformer-big), passing
## --no-restore-corpus to Marian via MARIAN_EXTRA.
elg-eng2zls:
	$(MAKE) SRCLANGS=eng TRGLANGS=zls \
		MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		tatoeba-job-bt
|
||||
|
||||
## Run tatoeba-job-bt for eng -> heb (transformer-big), passing
## --no-restore-corpus to Marian via MARIAN_EXTRA.
elg-eng2heb:
	$(MAKE) SRCLANGS=eng TRGLANGS=heb \
		MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		tatoeba-job-bt
|
||||
|
||||
## Run tatoeba-job-bt for eng -> spa (transformer-big), passing
## --no-restore-corpus to Marian via MARIAN_EXTRA and using the larger
## DATA_PREPARE_HPCPARAMS disk setting.
elg-eng2spa:
	$(MAKE) SRCLANGS=eng TRGLANGS=spa \
		MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-job-bt
|
||||
|
112
lib/slurm.mk
112
lib/slurm.mk
@ -38,39 +38,40 @@ SLURM_JOBNAME ?= $(subst -,,${LANGPAIRSTR})
|
||||
|
||||
%.submit:
|
||||
mkdir -p ${WORKDIR}
|
||||
echo '#!/bin/bash -l' > $@
|
||||
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submit=}"' >>$@
|
||||
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submit=}.out.%j' >> $@
|
||||
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submit=}.err.%j' >> $@
|
||||
mkdir -p ${dir ${TMPWORKDIR}/$@}
|
||||
echo '#!/bin/bash -l' > ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submit=}"' >>${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submit=}.out.%j' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submit=}.err.%j' >> ${TMPWORKDIR}/$@
|
||||
ifdef EMAIL
|
||||
echo '#SBATCH --mail-type=END' >> $@
|
||||
echo '#SBATCH --mail-user=${EMAIL}' >> $@
|
||||
echo '#SBATCH --mail-type=END' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH --mail-user=${EMAIL}' >> ${TMPWORKDIR}/$@
|
||||
endif
|
||||
echo '#SBATCH --mem=${GPUJOB_HPC_MEM}' >> $@
|
||||
echo '#SBATCH -n ${GPUJOB_HPC_CORES}' >> $@
|
||||
echo '#SBATCH -N ${GPUJOB_HPC_NODES}' >> $@
|
||||
echo '#SBATCH -t ${GPUJOB_HPC_TIME}:00' >> $@
|
||||
echo '#SBATCH -p ${GPUJOB_HPC_QUEUE}' >> $@
|
||||
echo '#SBATCH ${HPC_GPU_ALLOCATION}' >> $@
|
||||
echo '#SBATCH --mem=${GPUJOB_HPC_MEM}' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -n ${GPUJOB_HPC_CORES}' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -N ${GPUJOB_HPC_NODES}' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -t ${GPUJOB_HPC_TIME}:00' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -p ${GPUJOB_HPC_QUEUE}' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH ${HPC_GPU_ALLOCATION}' >> ${TMPWORKDIR}/$@
|
||||
ifdef BROKEN_NODES
|
||||
echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
|
||||
echo '#SBATCH --exclude=${BROKEN_NODES}' >> ${TMPWORKDIR}/$@
|
||||
endif
|
||||
echo '${HPC_EXTRA}' >> $@
|
||||
echo '${HPC_EXTRA1}' >> $@
|
||||
echo '${HPC_EXTRA2}' >> $@
|
||||
echo '${HPC_EXTRA3}' >> $@
|
||||
echo '${HPC_GPU_EXTRA1}' >> $@
|
||||
echo '${HPC_GPU_EXTRA2}' >> $@
|
||||
echo '${HPC_GPU_EXTRA3}' >> $@
|
||||
echo '${LOAD_GPU_ENV}' >> $@
|
||||
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
|
||||
echo 'pwd' >> $@
|
||||
echo 'echo "Starting at `date`"' >> $@
|
||||
echo 'srun ${MAKE} -j ${GPUJOB_HPC_JOBS} ${MAKEARGS} ${@:.submit=}' >> $@
|
||||
echo 'echo "Finishing at `date`"' >> $@
|
||||
sbatch ${SBATCH_ARGS} $@
|
||||
echo '${HPC_EXTRA}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_EXTRA1}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_EXTRA2}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_EXTRA3}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_GPU_EXTRA1}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_GPU_EXTRA2}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_GPU_EXTRA3}' >> ${TMPWORKDIR}/$@
|
||||
echo '${LOAD_GPU_ENV}' >> ${TMPWORKDIR}/$@
|
||||
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> ${TMPWORKDIR}/$@
|
||||
echo 'pwd' >> ${TMPWORKDIR}/$@
|
||||
echo 'echo "Starting at `date`"' >> ${TMPWORKDIR}/$@
|
||||
echo 'srun ${MAKE} -j ${GPUJOB_HPC_JOBS} ${MAKEARGS} ${@:.submit=}' >> ${TMPWORKDIR}/$@
|
||||
echo 'echo "Finishing at `date`"' >> ${TMPWORKDIR}/$@
|
||||
sbatch ${SBATCH_ARGS} ${TMPWORKDIR}/$@
|
||||
mkdir -p ${WORKDIR}
|
||||
mv $@ ${WORKDIR}/$@
|
||||
mv ${TMPWORKDIR}/$@ ${WORKDIR}/$@
|
||||
|
||||
# echo 'srun ${MAKE} NR=${NR} MODELTYPE=${MODELTYPE} DATASET=${DATASET} SRC=${SRC} TRG=${TRG} PRE_SRC=${PRE_SRC} PRE_TRG=${PRE_TRG} ${MAKEARGS} ${@:.submit=}' >> $@
|
||||
|
||||
@ -88,38 +89,39 @@ CPUJOB_HPC_JOBS ?= ${CPUJOB_HPC_THREADS}
|
||||
|
||||
%.submitcpu:
|
||||
mkdir -p ${WORKDIR}
|
||||
echo '#!/bin/bash -l' > $@
|
||||
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submitcpu=}"' >>$@
|
||||
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submitcpu=}.out.%j' >> $@
|
||||
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submitcpu=}.err.%j' >> $@
|
||||
mkdir -p ${dir ${TMPWORKDIR}/$@}
|
||||
echo '#!/bin/bash -l' > ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submitcpu=}"' >>${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submitcpu=}.out.%j' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submitcpu=}.err.%j' >> ${TMPWORKDIR}/$@
|
||||
ifdef EMAIL
|
||||
echo '#SBATCH --mail-type=END' >> $@
|
||||
echo '#SBATCH --mail-user=${EMAIL}' >> $@
|
||||
echo '#SBATCH --mail-type=END' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH --mail-user=${EMAIL}' >> ${TMPWORKDIR}/$@
|
||||
endif
|
||||
echo '#SBATCH --mem=${CPUJOB_HPC_MEM}' >> $@
|
||||
echo '#SBATCH -n ${CPUJOB_HPC_CORES}' >> $@
|
||||
echo '#SBATCH -N ${CPUJOB_HPC_NODES}' >> $@
|
||||
echo '#SBATCH -p ${CPUJOB_HPC_QUEUE}' >> $@
|
||||
echo '#SBATCH -t ${CPUJOB_HPC_TIME}:00' >> $@
|
||||
echo '#SBATCH --mem=${CPUJOB_HPC_MEM}' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -n ${CPUJOB_HPC_CORES}' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -N ${CPUJOB_HPC_NODES}' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -p ${CPUJOB_HPC_QUEUE}' >> ${TMPWORKDIR}/$@
|
||||
echo '#SBATCH -t ${CPUJOB_HPC_TIME}:00' >> ${TMPWORKDIR}/$@
|
||||
ifdef BROKEN_NODES
|
||||
echo '#SBATCH --exclude=${BROKEN_NODES}' >> $@
|
||||
echo '#SBATCH --exclude=${BROKEN_NODES}' >> ${TMPWORKDIR}/$@
|
||||
endif
|
||||
echo '${HPC_EXTRA}' >> $@
|
||||
echo '${HPC_EXTRA1}' >> $@
|
||||
echo '${HPC_EXTRA2}' >> $@
|
||||
echo '${HPC_EXTRA3}' >> $@
|
||||
echo '${HPC_CPU_EXTRA1}' >> $@
|
||||
echo '${HPC_CPU_EXTRA2}' >> $@
|
||||
echo '${HPC_CPU_EXTRA3}' >> $@
|
||||
echo '${LOAD_CPU_ENV}' >> $@
|
||||
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
|
||||
echo 'pwd' >> $@
|
||||
echo 'echo "Starting at `date`"' >> $@
|
||||
echo '${MAKE} -j ${CPUJOB_HPC_JOBS} ${MAKEARGS} ${@:.submitcpu=}' >> $@
|
||||
echo 'echo "Finishing at `date`"' >> $@
|
||||
sbatch ${SBATCH_ARGS} $@
|
||||
echo '${HPC_EXTRA}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_EXTRA1}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_EXTRA2}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_EXTRA3}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_CPU_EXTRA1}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_CPU_EXTRA2}' >> ${TMPWORKDIR}/$@
|
||||
echo '${HPC_CPU_EXTRA3}' >> ${TMPWORKDIR}/$@
|
||||
echo '${LOAD_CPU_ENV}' >> ${TMPWORKDIR}/$@
|
||||
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> ${TMPWORKDIR}/$@
|
||||
echo 'pwd' >> ${TMPWORKDIR}/$@
|
||||
echo 'echo "Starting at `date`"' >> ${TMPWORKDIR}/$@
|
||||
echo '${MAKE} -j ${CPUJOB_HPC_JOBS} ${MAKEARGS} ${@:.submitcpu=}' >> ${TMPWORKDIR}/$@
|
||||
echo 'echo "Finishing at `date`"' >> ${TMPWORKDIR}/$@
|
||||
sbatch ${SBATCH_ARGS} ${TMPWORKDIR}/$@
|
||||
mkdir -p ${WORKDIR}
|
||||
mv $@ ${WORKDIR}/$@
|
||||
mv ${TMPWORKDIR}/$@ ${WORKDIR}/$@
|
||||
|
||||
|
||||
# echo '${MAKE} -j ${HPC_CORES} DATASET=${DATASET} SRC=${SRC} TRG=${TRG} PRE_SRC=${PRE_SRC} PRE_TRG=${PRE_TRG} ${MAKEARGS} ${@:.submitcpu=}' >> $@
|
||||
|
39
lib/test.mk
39
lib/test.mk
@ -100,3 +100,42 @@ endif
|
||||
-e 's/&/&/g' |\
|
||||
sed 'n;n;G;' > $@
|
||||
rm -f $@.1 $@.2 $@.3
|
||||
|
||||
|
||||
# print-bleu-scores:
|
||||
# grep BLEU ${WORKHOME}/*/*.eval |\
|
||||
# sed 's#^${WORKHOME}/##' |\
|
||||
# sed 's/\.\([^\.]*\)\.\([^\.]*\)\.\([^\.]*\)\.eval[^ ]* = \([0-9\.]*\).*$$/ \1 \2-\3 \4/' |\
|
||||
# sed 's#^\([^/]*\)/\([^\.]*\)\.[^ ]* #\1 \2 #'
|
||||
|
||||
|
||||
|
||||
## Print a tab-separated table of BLEU scores found in ${WORKHOME}/*/*.eval.
## Pipeline:
##  1. grep the BLEU lines from all *.eval files
##  2. first perl call: rewrite "path:... = score ..." into five
##     tab-separated fields built from the regex groups
##     ($5-$6, score $7, $2, work sub-directory $1, $4)
##  3. second perl call: if the first field contains "multi", replace it
##     with the fourth field (the work sub-directory)
##  4. sort by field 3, then field 1, then field 2 numerically descending
print-bleu-scores:
|
||||
grep BLEU ${WORKHOME}/*/*.eval |\
|
||||
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
|
||||
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
|
||||
sort -k3,3 -k1,1 -k2,2nr
|
||||
|
||||
|
||||
|
||||
## Same pipeline as print-bleu-scores (grep BLEU lines from
## ${WORKHOME}/*/*.eval, rewrite them into five tab-separated fields,
## replace a first field containing "multi" with the fourth field, sort by
## field 3, field 1 and descending score), followed by a printf step that
## lays the five fields out in fixed-width columns.
## The last field still carries the newline of the input line, so the
## format string does not add one.
## The fourth column uses %-15s; a former "%-15ss" printed a stray
## literal "s" after that column.
.PHONY: pretty-print-bleu-scores
pretty-print-bleu-scores:
	grep BLEU ${WORKHOME}/*/*.eval |\
	perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
	perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
	sort -k3,3 -k1,1 -k2,2nr |\
	perl -e 'while (<>){@a=split(/\t/);printf "%15s %5.2f %-25s %-15s %s",@a;}'
|
||||
|
||||
|
||||
## Variant of print-bleu-scores whose regex splits the file name into more
## parts and emits six tab-separated fields per BLEU line
## ($6-$7, score $8, $2, work sub-directory $1, $4, $5).
## As in print-bleu-scores, a first field containing "multi" is replaced by
## the fourth field, and the result is sorted by field 3, field 1 and
## descending score (field 2).
print-bleu-scores2:
|
||||
grep BLEU ${WORKHOME}/*/*.eval |\
|
||||
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+)\.[^\.]+\.([^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$6-$$7\t$$8\t$$2\t$$1\t$$4\t$$5#' |\
|
||||
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
|
||||
sort -k3,3 -k1,1 -k2,2nr
|
||||
|
||||
## Same pipeline as print-bleu-scores2 (six tab-separated fields per BLEU
## line, sorted by field 3, field 1 and descending score), followed by a
## printf step that prints the six fields in fixed-width columns.
## The last field keeps the newline of the input line, so the format
## string does not add one.
pretty-print-bleu-scores2:
|
||||
grep BLEU ${WORKHOME}/*/*.eval |\
|
||||
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+)\.[^\.]+\.([^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$6-$$7\t$$8\t$$2\t$$1\t$$4\t$$5#' |\
|
||||
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
|
||||
sort -k3,3 -k1,1 -k2,2nr |\
|
||||
perl -e 'while (<>){@a=split(/\t/);printf "%15s %5.2f %-25s %-15s %-25s %s",@a;}'
|
||||
|
||||
|
@ -18,7 +18,7 @@ include ${REPOHOME}lib/slurm.mk
|
||||
|
||||
|
||||
MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
|
||||
MODEL_DISTS := ${shell wget -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$'}
|
||||
MODEL_DISTS := ${shell wget -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | grep -v '.eval.zip$$'}
|
||||
MODEL_DIST = ${firstword ${MODEL_DISTS}}
|
||||
MODEL = ${MODEL_DIST:.zip=}
|
||||
MODEL_LANGPAIR = ${firstword ${subst /, ,${MODEL_DIST}}}
|
||||
@ -34,11 +34,14 @@ WORK_DIR = ${WORK_HOME}/${MODEL}
|
||||
|
||||
## model directory (for test results)
|
||||
## model score file and zipfile with evaluation results
|
||||
MODEL_HOME = ${REPOHOME}models-tatoeba
|
||||
# MODEL_HOME = ${REPOHOME}models-tatoeba
|
||||
MODEL_HOME = ${REPOHOME}tatoeba/models
|
||||
MODEL_DIR = ${MODEL_HOME}/${MODEL}
|
||||
MODEL_SCORES = ${MODEL_DIR}.scores.txt
|
||||
MODEL_EVALZIP = ${MODEL_DIR}.eval.zip
|
||||
|
||||
LEADERBOARD_DIR = ${REPOHOME}scores
|
||||
|
||||
## all zip files with benchmark results
|
||||
MODEL_EVALZIPS := ${patsubst %.zip,${MODEL_HOME}/%.eval.zip,${MODEL_DISTS}}
|
||||
|
||||
@ -63,8 +66,13 @@ ${MODEL_EVALZIPS}: ${TESTSET_INDEX}
|
||||
fi
|
||||
${MAKE} MODEL_DIST=${patsubst ${MODEL_HOME}/%.eval.zip,%.zip,$@} eval-model
|
||||
|
||||
## evaluate the model with all benchmarks
|
||||
## register the scores and update the leaderboard
|
||||
## final cleanup
|
||||
.PHONY: eval-model
|
||||
eval-model: ${MODEL_SCORES}
|
||||
${MAKE} register-scores
|
||||
${MAKE} sort-leaderboards
|
||||
if [ -d ${MODEL_DIR} ]; then \
|
||||
cd ${MODEL_DIR} && zip ${MODEL_EVALZIP} *.eval *.compare; \
|
||||
rm -f ${MODEL_DIR}/*.eval; \
|
||||
@ -278,3 +286,49 @@ ${MODEL_SCORES}: ${TESTSET_INDEX}
|
||||
$@.urls $@.nrlines $@.nrwords; \
|
||||
fi
|
||||
|
||||
|
||||
|
||||
##-------------------------------------------------------------------
|
||||
## update leader boards with scores from score files
|
||||
## LEADERBOARDS = list of leader boards that need to be sorted
|
||||
## SCOREFILES = all score files in the model directories
|
||||
## SCOREFILES_DONE = a flag that shows that the scores are registered
|
||||
##-------------------------------------------------------------------
|
||||
|
||||
SCOREFILES := ${wildcard ${MODEL_HOME}/*/*.scores.txt}
|
||||
SCOREFILES_DONE = ${SCOREFILES:.txt=.registered}
|
||||
SCOREFILE_DONE = ${MODEL_SCORES:.txt=.registered}
|
||||
|
||||
## update all leader boards with all scores
|
||||
update-leaderboards: ${SCOREFILES_DONE}
|
||||
${MAKE} sort-leaderboards
|
||||
|
||||
## register the scores for the current model
|
||||
## (scores will be added to some temporary files sorted by language pair and benchmark)
|
||||
register-scores: ${SCOREFILE_DONE}
|
||||
|
||||
## Static pattern rule: <model>.scores.registered is made from
## <model>.scores.txt.
## The perl one-liner reads the score file line by line, splits each line
## on tabs and, for every line:
##   - creates the directory ${LEADERBOARD_DIR}/<field 0>/<field 1>
##   - appends "<field 3>\t<field 4>" to bleu-scores.<id>.unsorted.txt
##   - appends "<field 2>\t<field 4>" to chrf-scores.<id>.unsorted.txt
## where <id> is the path of the score file relative to ${MODEL_HOME}
## with every "/" replaced by ".".
## Finally the .registered file is touched as a flag that the scores of
## this file have been registered.
${SCOREFILES_DONE}: %.registered: %.txt
|
||||
@echo "register scores from ${patsubst ${MODEL_HOME}/%,%,$<}"
|
||||
@cat $< | perl -e 'while (<>){ @a=split(/\t/); system "mkdir -p ${LEADERBOARD_DIR}/$$a[0]/$$a[1]"; open B,">>${LEADERBOARD_DIR}/$$a[0]/$$a[1]/bleu-scores.$(subst /,.,${patsubst ${MODEL_HOME}/%,%,$<}).unsorted.txt"; open C,">>${LEADERBOARD_DIR}/$$a[0]/$$a[1]/chrf-scores.$(subst /,.,${patsubst ${MODEL_HOME}/%,%,$<}).unsorted.txt"; print B "$$a[3]\t$$a[4]\n"; print C "$$a[2]\t$$a[4]\n"; close B; close C; }'
|
||||
|
||||
touch $@
|
||||
|
||||
|
||||
UPDATE_SCORE_DIRS := $(sort $(dir ${wildcard ${LEADERBOARD_DIR}/*/*/*.unsorted.txt}))
|
||||
LEADERBOARDS_BLEU := $(patsubst %,%bleu-scores.txt,${UPDATE_SCORE_DIRS})
|
||||
LEADERBOARDS_CHRF := $(patsubst %,%chrf-scores.txt,${UPDATE_SCORE_DIRS})
|
||||
|
||||
## sort all leaderboards for which we have new unsorted scores
|
||||
sort-leaderboards: ${LEADERBOARDS_BLEU} ${LEADERBOARDS_CHRF}
|
||||
|
||||
## Rebuild the BLEU leaderboard of one score directory:
## concatenate every bleu-scores*.txt file in that directory (an existing
## bleu-scores.txt plus the bleu-scores.*.unsorted.txt files written when
## scores are registered), sort by the score in column 1 in descending
## numeric order, then replace all of those files with the merged
## bleu-scores.txt.
##
## The second key (-k2, i.e. the rest of the line) matters for -u:
## sort -u treats lines as duplicates when all given keys compare equal,
## so with -k1,1nr alone only one line per distinct score would survive
## and other entries with the same score would be dropped.
## The result is written to $@.sorted first because $@ itself matches the
## bleu-scores*.txt glob that is read and then removed.
${LEADERBOARDS_BLEU}: ${UPDATE_SCORE_DIRS}
	@echo "sort ${patsubst ${LEADERBOARD_DIR}/%,%,$@}"
	@cat $(dir $@)bleu-scores*.txt | sort -u -k1,1nr -k2 > $@.sorted
	@rm -f $(dir $@)bleu-scores*.txt
	@mv $@.sorted $@
|
||||
|
||||
## Rebuild the chrF leaderboard of one score directory:
## concatenate every chrf-scores*.txt file in that directory (an existing
## chrf-scores.txt plus the chrf-scores.*.unsorted.txt files written when
## scores are registered), sort by the score in column 1 in descending
## numeric order, then replace all of those files with the merged
## chrf-scores.txt.
##
## The second key (-k2, i.e. the rest of the line) matters for -u:
## sort -u treats lines as duplicates when all given keys compare equal,
## so with -k1,1nr alone only one line per distinct score would survive
## and other entries with the same score would be dropped.
## The result is written to $@.sorted first because $@ itself matches the
## chrf-scores*.txt glob that is read and then removed.
${LEADERBOARDS_CHRF}: ${UPDATE_SCORE_DIRS}
	@echo "sort ${patsubst ${LEADERBOARD_DIR}/%,%,$@}"
	@cat $(dir $@)chrf-scores*.txt | sort -u -k1,1nr -k2 > $@.sorted
	@rm -f $(dir $@)chrf-scores*.txt
	@mv $@.sorted $@
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user