comet scores

Joerg Tiedemann 2022-10-15 22:16:24 +03:00
parent 4f5bc5780f
commit a3fd40003b
16 changed files with 428 additions and 80 deletions

@@ -1 +1 @@
-Subproject commit a04b403bc9ee3d2a50be716d28453fd1d3e45896
+Subproject commit 6c08afb24684b468635e0471f92efa3d6e3def82

@@ -1 +1 @@
-Subproject commit c0247d60f9221255bdb11fbfe73d5b33336278ea
+Subproject commit 86f3589668521eef16a0a6e6435c531e91ec98ae


@@ -246,6 +246,7 @@ DEVMINSIZE ?= 250
OPUSREAD_ARGS =
##----------------------------------------------------------------------------
## resources in OPUS
##----------------------------------------------------------------------------
@@ -650,9 +651,6 @@ else
endif
## TODO: do we need to reduce workspace for decoding?
# MARIAN_DECODER_WORKSPACE = $$((${MARIAN_WORKSPACE} / 2))
MARIAN_DECODER_WORKSPACE = 10000
## weights associated with training examples
@@ -684,6 +682,11 @@ MARIAN_MAXI_BATCH = 512
# MARIAN_MAXI_BATCH = 2048
## TODO: do we need to reduce workspace for decoding?
# MARIAN_DECODER_WORKSPACE = $$((${MARIAN_WORKSPACE} / 2))
MARIAN_DECODER_WORKSPACE = 10000
ifeq ($(GPU_AVAILABLE),1)
MARIAN_SCORER_FLAGS = -n1 -d ${MARIAN_GPUS} \
--quiet-translation -w ${MARIAN_DECODER_WORKSPACE} \


@@ -121,7 +121,6 @@ TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter
## all available language pairs
## (download the file once and keep it here to get the language pairs in the release)
TATOEBA_LANGPAIRS := ${shell if [ ! -e ${RELEASED_TATOEBA_DATA_FILE} ]; then \


@@ -78,8 +78,9 @@ ifeq (${wildcard $(TEST_EVALUATION)},)
endif
if [ -e $(TEST_EVALUATION) ]; then \
if [ `grep BLEU $(TEST_EVALUATION) | cut -f3 -d ' ' | cut -f1 -d '.'` -ge ${MIN_BLEU_SCORE} ]; then \
${MAKE} MODELSHOME=${RELEASEDIR} link-latest-model; \
${MAKE} MODELSHOME=${RELEASEDIR} \
MODELS_URL=https://object.pouta.csc.fi/${MODEL_CONTAINER} \
dist; \
fi \
else \
@@ -347,15 +348,17 @@ endif
link-latest-model:
if [ `ls ${patsubst %.zip,%_*,${DIST_PACKAGE}} 2>/dev/null | wc -l` -gt 0 ]; then \
rm -f ${DIST_PACKAGE}; \
cd ${dir ${DIST_PACKAGE}}; \
ln -s `ls -t ${patsubst %.zip,%_*.zip,$(notdir ${DIST_PACKAGE})} | head -1` \
${notdir ${DIST_PACKAGE}}; \
if [ `ls ${patsubst %.yml,%_*.yml,${DIST_YML}} 2>/dev/null | wc -l` -gt 0 ]; then \
rm -f ${DIST_YML}; \
cd ${dir ${DIST_YML}}; \
ln -s `ls -t $(patsubst %.yml,%_*.yml,$(notdir ${DIST_YML})) | head -1` $(notdir ${DIST_YML}); \
if [ `ls $(patsubst %.zip,%_*.zip,$(notdir ${DIST_PACKAGE})) 2>/dev/null | wc -l` -gt 0 ]; then \
rm -f $(notdir ${DIST_PACKAGE}); \
ln -s `ls -t $(patsubst %.zip,%_*.zip,$(notdir ${DIST_PACKAGE})) | head -1` $(notdir ${DIST_PACKAGE}); \
fi; \
fi
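The ls -t | head -1 idiom in link-latest-model points a stable file name at the newest timestamped release: ls -t sorts the matching packages by modification time and head -1 picks the most recent one. A minimal standalone sketch of the pattern, with hypothetical file names:

    # refresh the stable symlink to the newest timestamped package
    cd models/deu-eng
    rm -f opus.zip
    ln -s "$(ls -t opus_*.zip | head -1)" opus.zip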
${DIST_PACKAGE}: ${MODEL_FINAL}
ifneq (${SKIP_DIST_EVAL},1)
@${MAKE} $(TEST_EVALUATION)
@@ -512,9 +515,14 @@ endif
.PHONY: upload
upload:
which a-put
if [ -e models-links.tar ]; then \
tar -xf models-links.tar; \
rm -f models-links.tar; \
fi
find ${RELEASEDIR}/ -type l | tar -cf models-links.tar -T -
find ${RELEASEDIR}/ -type l -delete
cd ${RELEASEDIR} && swift upload ${MODEL_CONTAINER} --changed --skip-identical *
-find ${RELEASEDIR}/ -type l -delete
-cd ${RELEASEDIR} && swift upload ${MODEL_CONTAINER} --changed --skip-identical *
tar -xf models-links.tar
rm -f models-links.tar
swift post ${MODEL_CONTAINER} --read-acl ".r:*"
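The tar round-trip in the upload recipe exists presumably because swift upload does not keep symbolic links as links and would push every linked package again as a full object. The links are stashed in models-links.tar, deleted for the upload, and restored afterwards; the leading '-' lets make carry on even if a step fails, so the links always come back. A minimal sketch of the same pattern, assuming hypothetical directory and container names:

    RELEASEDIR=models CONTAINER=Tatoeba-MT-models
    find "$RELEASEDIR"/ -type l | tar -cf models-links.tar -T -    # stash symlinks
    find "$RELEASEDIR"/ -type l -delete
    ( cd "$RELEASEDIR" && swift upload "$CONTAINER" --changed --skip-identical * )
    tar -xf models-links.tar && rm -f models-links.tar             # restore symlinks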


@@ -124,6 +124,9 @@ EXTRACT_LEX ?= ${shell which extract_lex 2>/dev/null || echo ${TOOLSDIR}/extr
MOSESSCRIPTS ?= ${TOOLSDIR}/moses-scripts/scripts
TMX2MOSES ?= ${shell which tmx2moses 2>/dev/null || echo ${TOOLSDIR}/OpusTools-perl/scripts/convert/tmx2moses}
GET_ISO_CODE ?= ${ISO639} -m
## marian-nmt binaries
MARIAN_TRAIN = ${MARIAN_HOME}marian

lib/env/mahti.mk

@@ -107,3 +107,5 @@ MARIAN_BUILD_OPTIONS = -DCUDNN=ON \
# LOAD_EXTRACTLEX_BUILD_ENV = cmake gcc/9.3.0 boost/1.68.0
LOAD_EXTRACTLEX_BUILD_ENV = module load cmake boost
LOAD_COMET_ENV = module load python-data &&

lib/env/puhti.mk

@@ -46,8 +46,10 @@ else
endif
CPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 python-env
GPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 python-env
# CPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 python-env
# GPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 python-env
CPU_MODULES = perl python-data cuda intel-oneapi-mkl openmpi
GPU_MODULES = perl python-data cuda intel-oneapi-mkl openmpi
LOAD_CPU_ENV = module load ${CPU_MODULES} && module list
LOAD_GPU_ENV = module load ${GPU_MODULES} && module list
@@ -70,10 +72,10 @@ endif
HPC_EXTRA1 = \#SBATCH --account=${CSCPROJECT}
BUILD_MODULES = StdEnv python-env cmake perl/5.30.0
BUILD_MODULES = StdEnv perl python-data cuda intel-oneapi-mkl openmpi cmake
LOAD_BUILD_ENV = module purge && module load ${BUILD_MODULES} && module list
MARIAN_BUILD_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 cmake/3.18.2
MARIAN_BUILD_MODULES = StdEnv perl python-data cuda intel-oneapi-mkl openmpi cmake
LOAD_MARIAN_BUILD_ENV = module purge && module load ${MARIAN_BUILD_MODULES} && module list
MARIAN_BUILD_OPTIONS = -DTcmalloc_INCLUDE_DIR=/appl/spack/install-tree/gcc-8.3.0/gperftools-2.7-5w7w2c/include \
-DTcmalloc_LIBRARY=/appl/spack/install-tree/gcc-8.3.0/gperftools-2.7-5w7w2c/lib/libtcmalloc.so \
@@ -91,3 +93,4 @@ MARIAN_BUILD_OPTIONS = -DTcmalloc_INCLUDE_DIR=/appl/spack/install-tree/gcc-8.3.
-DFBGEMM_STATIC=1
LOAD_COMET_ENV = module load pytorch &&


@@ -438,28 +438,68 @@ elg-new-bigmodels4:
done
elg-new-bigmodels5:
${MAKE} MODELTYPE=transformer-big MARIAN_EXTRA=--no-restore-corpus \
SKIP_SAME_LANG=1 \
DATA_SAMPLING_WEIGHT=0.5 \
SRCLANGS="jpn kor zho" \
TRGLANGS="jpn kor zho" tatoeba-job
elg-new-bigmodels-multieval:
for l in zls zlw; do \
-for l in ara deu fin fra gmq heb jpn por spa zho; do \
${MAKE} MODELTYPE=transformer-big tatoeba-sla2$${l}-multieval; \
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2sla-multieval; \
done
-${MAKE} MODELTYPE=transformer-big tatoeba-sla2sla-multieval
-${MAKE} MODELTYPE=transformer-big tatoeba-sla2kor-multieval-separate-spm
-${MAKE} MODELTYPE=transformer-big tatoeba-kor2sla-multieval-separate-spm
-for l in zls zlw; do \
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2fin-multieval; \
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2deu-multieval; \
${MAKE} MODELTYPE=transformer-big tatoeba-fin2$${l}-multieval; \
${MAKE} MODELTYPE=transformer-big tatoeba-deu2$${l}-multieval; \
done
for l in bat gmq; do \
-for l in bat gmq; do \
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2deu-multieval; \
${MAKE} MODELTYPE=transformer-big tatoeba-deu2$${l}-multieval; \
done
for l in bat cel gmq zle zls zlw; do \
-for l in ara bat cel eus fas gmq gmw heb sqi tur vie zho zle zls zlw; do \
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2itc-multieval; \
${MAKE} MODELTYPE=transformer-big tatoeba-itc2$${l}-multieval; \
done
${MAKE} MODELTYPE=transformer-big tatoeba-cel2deu-multieval; \
${MAKE} MODELTYPE=transformer-big tatoeba-deu2cel-multieval; \
for l in bat cel zle zls zlw; do \
-for l in ara bat cel eus fas heb sqi tur vie zho zle zls zlw; do \
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2gmq-multieval; \
${MAKE} MODELTYPE=transformer-big tatoeba-gmq2$${l}-multieval; \
done
-${MAKE} MODELTYPE=transformer-big tatoeba-cel2deu-multieval
-${MAKE} MODELTYPE=transformer-big tatoeba-deu2cel-multieval
-${MAKE} MODELTYPE=transformer-big tatoeba-bat2bat-multieval
-${MAKE} MODELTYPE=transformer-big tatoeba-cel2cel-multieval
-${MAKE} MODELTYPE=transformer-big tatoeba-gmq2gmq-multieval
-${MAKE} MODELTYPE=transformer-big tatoeba-itc2itc-multieval
-${MAKE} MODELTYPE=transformer-big tatoeba-gmw2gmw-multieval
elg-sla-train:
-for l in ara deu fin fra gmq heb jpn por spa zho; do \
${MAKE} MODELTYPE=transformer-big tatoeba-sla2$${l}-trainjob; \
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2sla-trainjob; \
done
-${MAKE} MODELTYPE=transformer-big tatoeba-sla2sla-trainjob
-${MAKE} MODELTYPE=transformer-big tatoeba-sla2kor-trainjob-separate-spm
-${MAKE} MODELTYPE=transformer-big tatoeba-kor2sla-trainjob-separate-spm
elg-sla-multieval:
-for l in ara deu fin fra gmq heb jpn por spa zho; do \
${MAKE} MODELTYPE=transformer-big tatoeba-sla2$${l}-multieval; \
${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2sla-multieval; \
done
-${MAKE} MODELTYPE=transformer-big tatoeba-sla2sla-multieval
-${MAKE} MODELTYPE=transformer-big tatoeba-sla2kor-multieval-separate-spm
-${MAKE} MODELTYPE=transformer-big tatoeba-kor2sla-multieval-separate-spm


@@ -37,6 +37,8 @@ fetch-datasets fetch-tatoeba-datasets:
for t in ${MACRO_TRGLANGS}; do \
if [ `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
echo "!!!!!!!!!!! skip language pair $$s-$$t !!!!!!!!!!!!!!!!"; \
elif [ `echo '${TATOEBA_LANGPAIRS}' | tr ' ' "\n" | egrep "$$s-$$t|$$t-$$s" | wc -l` -eq 0 ]; then \
echo ".... no package released for $$s-$$t!"; \
else \
if [ "$$s" \< "$$t" ]; then \
if [ ! -e ${TATOEBA_DATA}/${TATOEBA_TRAINSET}.$$s-$$t.clean.$$s.gz ]; then \


@@ -134,6 +134,7 @@ LEADERBOARD_DIR = ${REPOHOME}scores
compare-bleu-score-table:
@grep BLEU ${WORKHOME}/*/*.eval |\
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
grep -v '^[a-z\-]*multi' |\
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
perl -pe '@a=split(/\t/);$$a[2]=lc($$a[2]);$$a[2]=~s/^(.*)\-[a-z]{4}$$/$$1/;$$a[2]=~s/^(.*)\-[a-z]{6}$$/$$1/;$$a[2]=~s/^(news.*)\-[a-z]{4}/$$1/;if (-e "${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt"){$$b=`head -1 ${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt | cut -f1`;$$b+=0;}else{$$b=0;}$$d=$$a[1]-$$b;splice(@a,2,0,$$b,$$d);$$_=join("\t",@a);' |\
sort -k5,5 -k1,1 -k2,2nr
@@ -144,20 +145,19 @@ compare-bleu-scores:
print-improved-models:
@make -s compare-bleu-scores |\
grep -v ' 0.00' | grep -v ' -[0-9]'
grep -v ' 0.00 [a-z]' | grep -v ' -[0-9]'
print-decreased-models:
@make -s compare-bleu-scores |\
grep ' -[0-9]'
## compare BLEU scores for the current model
compare-model-bleu-score-table:
@grep BLEU ${WORKDIR}/*.eval |\
perl -pe 's#^${WORKHOME}/([^/]*)/([^\.]+)\.(.*?-.*?\.)?([^\.]+\.[^\.]+\.[^\.]+)\.([^\.]+)\.([^\.]+)\.eval:.*? = ([0-9\.]+) .*$$#$$5-$$6\t$$7\t$$2\t$$1\t$$4#' |\
grep -v '^[a-z\-]*multi' |\
perl -pe '@a=split(/\t/);if($$a[0]=~/multi/){$$a[0]=$$a[3];};$$_=join("\t",@a);' |\
perl -pe '@a=split(/\t/);$$a[2]=lc($$a[2]);$$a[2]=~s/^(.*)\-[a-z]{4}$$/$$1/;$$a[2]=~s/^(.*)\-[a-z]{6}$$/$$1/;$$a[2]=~s/^(news.*)\-[a-z]{4}$$/$$1/;if (-e "${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt"){$$b=`head -1 ${LEADERBOARD_DIR}/$$a[0]/$$a[2]/bleu-scores.txt | cut -f1`;$$b+=0;}else{$$b=0;}$$d=$$a[1]-$$b;splice(@a,2,0,$$b,$$d);$$_=join("\t",@a);' |\
sort -k5,5 -k1,1 -k2,2nr


@@ -27,3 +27,24 @@ fix-config:
rm -f decoder.yml
SCOREFILES := ${wildcard */*.scores.txt}
BLEUSCOREFILES := ${SCOREFILES:.scores.txt=.bleu-scores.txt}
CHRFSCOREFILES := ${SCOREFILES:.scores.txt=.chrf-scores.txt}
create-score-files: ${BLEUSCOREFILES} ${CHRFSCOREFILES}
%.bleu-scores.txt: %.scores.txt
cut -f1,2,4 $< | \
sed 's/\(news.*[0-9][0-9][0-9][0-9]\)\-[a-z][a-z][a-z][a-z] /\1 /' |\
sed -e 's/\(news.*2021\)\.[a-z][a-z]\-[a-z][a-z] /\1 /' |\
rev | uniq -f1 | rev > $@
%.chrf-scores.txt: %.scores.txt
cut -f1,2,3 $< |\
sed 's/\(news.*[0-9][0-9][0-9][0-9]\)\-[a-z][a-z][a-z][a-z] /\1 /' |\
sed -e 's/\(news.*2021\)\.[a-z][a-z]\-[a-z][a-z] /\1 /' |\
rev | uniq -f1 | rev > $@
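The rev | uniq -f1 | rev step deduplicates on everything except the score column: uniq can only skip leading fields, so each line is reversed to move the trailing score to the front, consecutive lines with the same remaining fields (test set and language pair) are collapsed, and the line is reversed back. A small demonstration with made-up values:

    printf 'deu-eng\tflores\t40.1\ndeu-eng\tflores\t38.9\n' | rev | uniq -f1 | rev
    # -> deu-eng  flores  40.1   (first entry per test set and language pair wins)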


@@ -395,14 +395,14 @@ print-langgroups:
show-improved-models:
make -s compare-bleu-score-table-tatoeba | \
grep -v ' 0 ' | grep -v ' -[0-9]' | \
grep -v ' 0 [a-z]' | grep -v ' -[0-9]' | \
cut -f6 | sort -u | xargs
release-improved-models:
for l in ${shell make -s compare-bleu-score-table-tatoeba | \
grep -v ' 0 ' | grep -v ' -[0-9]' | \
grep -v ' 0 [a-z]' | grep -v ' -[0-9]' | \
cut -f6 | sort -u | xargs}; do \
s=`echo "$$l" | cut -f1 -d-`; \
t=`echo "$$l" | cut -f2 -d-`; \
@@ -412,7 +412,7 @@ release-improved-models:
## release all models with improved scores even if they are not yet done
release-all-improved-models:
for l in ${shell make -s compare-bleu-score-table-tatoeba | \
grep -v ' 0 ' | grep -v ' -[0-9]' | \
grep -v ' 0 [a-z]' | grep -v ' -[0-9]' | \
cut -f6 | sort -u | xargs}; do \
s=`echo "$$l" | cut -f1 -d-`; \
t=`echo "$$l" | cut -f2 -d-`; \
@@ -485,12 +485,37 @@ find-trglanggroup = $(call find-langgroup,$(lastword ${subst -, ,${subst 2, ,${1
find-langgroup-pair = $(sort $(call find-srclanggroup,${1}) $(call find-trglanggroup,${1}) ${2})
## expand language groups to individual languages and language pairs
LANGGROUP_SRCLANGS := ${call find-srclanggroup,${SRCLANGGROUP},${PIVOT}}
LANGGROUP_TRGLANGS := ${call find-srclanggroup,${TRGLANGGROUP},${PIVOT}}
LANGGROUP_LANGPAIRS := $(foreach S,${LANGGROUP_SRCLANGS},$(foreach T,${LANGGROUP_TRGLANGS},${S}-${T}))
## remove non-supported language pairs (make pattern to skip those language pairs)
## also remove combinations of the same language if SKIP_SAME_LANG is set to 1
ifeq (${SKIP_SAME_LANG},1)
LANGGROUP_SAMELANG_LANGPAIRS := $(foreach L,${LANGGROUP_SRCLANGS},${L}-${L})
LANGGROUP_SKIP_LANGPAIRS := ${LANGGROUP_SAMELANG_LANGPAIRS} $(filter-out ${TATOEBA_LANGPAIRS},${LANGGROUP_LANGPAIRS})
LANGGROUP_USE_LANGPAIRS := $(filter-out ${LANGGROUP_SKIP_LANGPAIRS},${LANGGROUP_LANGPAIRS})
LANGGROUP_SKIP_LANGPAIR_PATTERN := $(subst ${SPACE},|,${LANGGROUP_SKIP_LANGPAIRS})
else
LANGGROUP_SKIP_LANGPAIRS := $(filter-out ${TATOEBA_LANGPAIRS},${LANGGROUP_LANGPAIRS})
LANGGROUP_USE_LANGPAIRS := $(filter ${TATOEBA_LANGPAIRS},${LANGGROUP_LANGPAIRS})
LANGGROUP_SKIP_LANGPAIR_PATTERN := $(subst ${SPACE},|,${LANGGROUP_SKIP_LANGPAIRS})
endif
## print languages in this set
tatoeba-%-langs:
-( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-langs,%,$@))); \
@( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-langs,%,$@))); \
t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-langs,%,$@))); \
echo "${call find-srclanggroup,${patsubst tatoeba-%-langs,%,$@},${PIVOT}}"; \
echo "${call find-trglanggroup,${patsubst tatoeba-%-langs,%,$@},${PIVOT}}"; )
@echo 'use: ${LANGGROUP_USE_LANGPAIRS}'
@echo 'skip: ${LANGGROUP_SKIP_LANGPAIR_PATTERN}'
## shortcut to start a target only if certain language group limits are met
## (maximum and minimum number of languages)


@@ -2,12 +2,20 @@
# evaluate released Tatoeba MT models
# with existing benchmarks (collected in OPUS-MT-testsets)
#
#
#
# comet-score:
# on puhti: module load pytorch && comet-score
# on mahti: module load python-data && comet-score
## set the home directory of the repository
## this is to find the included makefiles
## (important to have a trailing '/')
SHELL := bash
PWD := ${shell pwd}
REPOHOME := ${PWD}/../../
@@ -19,12 +27,20 @@ include ${REPOHOME}lib/slurm.mk
GPUJOB_HPC_MEM = 20g
MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
MODEL_DISTS := ${shell ${WGET} -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | grep -v '.eval.zip$$'}
MODEL_DIST = ${firstword ${MODEL_DISTS}}
MODEL = ${MODEL_DIST:.zip=}
MODEL_LANGPAIR = ${firstword ${subst /, ,${MODEL_DIST}}}
MODEL_URL = ${MODEL_STORAGE}/${MODEL_DIST}
MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
ifndef MODEL_DISTS
ifneq ($(wildcard models.missing),)
MODEL_DISTS := $(shell cat models.missing)
else
MODEL_DISTS := ${shell ${WGET} -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | grep -v '.eval.zip$$'}
endif
endif
MODEL_DIST = ${firstword ${MODEL_DISTS}}
MODEL = ${MODEL_DIST:.zip=}
MODEL_LANGPAIR = ${firstword ${subst /, ,${MODEL_DIST}}}
MODEL_URL = ${MODEL_STORAGE}/${MODEL_DIST}
MODEL_EVAL_URL = ${MODEL_URL:.zip=.eval.zip}
## directory with all test sets (submodule OPUS-MT-testsets)
TESTSET_HOME := ${REPOHOME}OPUS-MT-testsets/testsets
@@ -37,13 +53,46 @@ WORK_DIR = ${WORK_HOME}/${MODEL}
## model directory (for test results)
## model score file and zipfile with evaluation results
# MODEL_HOME = ${REPOHOME}models-tatoeba
MODEL_HOME = ${REPOHOME}tatoeba/models
MODEL_DIR = ${MODEL_HOME}/${MODEL}
MODEL_SCORES = ${MODEL_DIR}.scores.txt
MODEL_EVALZIP = ${MODEL_DIR}.eval.zip
LEADERBOARD_DIR = ${REPOHOME}scores
MODEL_BLEUSCORES = ${MODEL_DIR}.bleu-scores.txt
MODEL_CHRFSCORES = ${MODEL_DIR}.chrf-scores.txt
MODEL_COMETSCORES = ${MODEL_DIR}.comet-scores.txt
## fix individual score files for all models in the index!
ALL_MODEL_BLEUSCORES = ${patsubst %.zip,%.bleu-scores,${MODEL_DISTS}}
ALL_MODEL_CHRFSCORES = ${patsubst %.zip,%.chrf-scores,${MODEL_DISTS}}
all-individual-scores: ${ALL_MODEL_BLEUSCORES} ${ALL_MODEL_CHRFSCORES}
${ALL_MODEL_BLEUSCORES}:
-${MAKE} MODEL_DISTS=${@:.bleu-scores=.zip} individual-scores
# -${MAKE} MODEL_DISTS=${@:.bleu-scores=.zip} ${MODEL_HOME}/$@.txt
${ALL_MODEL_CHRFSCORES}:
-${MAKE} MODEL_DISTS=${@:.chrf-scores=.zip} ${MODEL_HOME}/$@.txt
## MODEL_NOTEVALS ... all released models that do not have an evaluation file yet
## MODEL_LOCAL ...... all model packages in the local release dir
##
## NEW: don't set those variables by default as this slows down other makefile calls
# MODEL_NOTEVALS := $(shell ${WGET} -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | \
sed 's/\.eval\.zip/.zip/' | sort | uniq -c | sed 's/^ *//' | grep '^1 ' | cut -f2 -d' ')
# MODEL_LOCAL := $(patsubst ${MODEL_HOME}/%,%,$(filter-out %.eval.zip,$(shell find ${MODEL_HOME}/ -type f -name '*.zip')))
## all zip files with benchmark results
MODEL_EVALZIPS := ${patsubst %.zip,${MODEL_HOME}/%.eval.zip,${MODEL_DISTS}}
@@ -59,38 +108,38 @@ all: ${MODEL_EVALZIPS}
first: $(firstword ${MODEL_EVALZIPS})
## check models that still need to be evaluated
## (i.e. *.eval.zip does not exist)
MODEL_EVALCHECK := ${patsubst %.zip,${MODEL_HOME}/%.eval.check,${MODEL_DISTS}}
.PHONY: print-eval-needed ${MODEL_EVALCHECK}
print-eval-needed: ${MODEL_EVALCHECK}
${MODEL_EVALCHECK}:
@if [ ! -e $(@:.check=.zip) ]; then \
echo "need to make $(@:.check=.zip)"; \
fi
print-model-list:
@echo "${MODEL_DISTS}"
@echo "number of models: ${words ${MODEL_DISTS}}"
#-------------------------------------------------
## phony targets to evaluate only new models
## or only models that exist locally
## (no dependency on testset index)
#-------------------------------------------------
MODEL_EVALNEW := ${patsubst %.zip,${MODEL_HOME}/%.eval.new,${MODEL_DISTS}}
.PHONY: eval-new eval-new-models ${MODEL_EVALNEW}
eval-new eval-new-models: ${MODEL_EVALNEW}
${MODEL_EVALNEW}:
@if [ ! -e $(@:.new=.zip) ]; then \
${MAKE} MODEL_DIST=${patsubst ${MODEL_HOME}/%.eval.new,%.zip,$@} eval-model; \
fi
## check models that still need to be evaluated
## (i.e. *.eval.zip does not exist)
.PHONY: print-eval-needed
print-eval-needed:
@echo "$(shell ${WGET} -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | \
sed 's/\.eval\.zip/.zip/' | sort | uniq -c | sed 's/^ *//' | grep '^1 ' | cut -f2 -d' ')" | \
tr ' ' "\n"
.PHONY: eval-new eval-new-models
eval-new eval-new-models:
${MAKE} MODEL_DISTS="$(shell ${WGET} -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$' | \
sed 's/\.eval\.zip/.zip/' | sort | uniq -c | sed 's/^ *//' | grep '^1 ' | cut -f2 -d' ')" all
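The index trick shared by print-eval-needed and eval-new works because every released model contributes MODEL.zip and, once evaluated, MODEL.eval.zip to index.txt: rewriting .eval.zip back to .zip makes evaluated models show up twice, so uniq -c plus grep '^1 ' keeps exactly the models that still lack an evaluation package. A small demonstration:

    printf 'a/model.zip\na/model.eval.zip\nb/model.zip\n' |
      sed 's/\.eval\.zip/.zip/' | sort | uniq -c | sed 's/^ *//' |
      grep '^1 ' | cut -f2 -d' '
    # -> b/model.zip   (released but not yet evaluated)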
## filter out all models that exist locally
MODEL_LOCAL := ${filter ${patsubst %.zip,%.eval.new,$(wildcard ${MODEL_HOME}/*/*.zip)},${MODEL_EVALNEW}}
eval-local: ${MODEL_LOCAL}
.PHONY: print-eval-local
print-eval-local:
@echo ${MODEL_LOCAL} | tr ' ' "\n"
@echo "$(patsubst ${MODEL_HOME}/%,%,$(filter-out %.eval.zip,$(shell find ${MODEL_HOME}/ -type f -name '*.zip')))" | tr ' ' "\n"
.PHONY: eval-local
eval-local:
${MAKE} MODEL_DISTS="$(patsubst ${MODEL_HOME}/%,%,$(filter-out %.eval.zip,$(shell find ${MODEL_HOME}/ -type f -name '*.zip')))" all
#-------------------------------------------------
## create zip-files with all evaluation files
@@ -103,7 +152,7 @@ ${MODEL_EVALZIPS}: ${TESTSET_INDEX}
mkdir -p ${@:.eval.zip=}; \
unzip -d ${@:.eval.zip=} $@; \
fi
${MAKE} MODEL_DIST=${patsubst ${MODEL_HOME}/%.eval.zip,%.zip,$@} eval-model
-${MAKE} MODEL_DISTS=${patsubst ${MODEL_HOME}/%.eval.zip,%.zip,$@} eval-model
#-------------------------------------------------
@@ -118,13 +167,16 @@ eval-model: ${MODEL_SCORES}
${MAKE} sort-leaderboards; \
fi
if [ -d ${MODEL_DIR} ]; then \
cd ${MODEL_DIR} && zip ${MODEL_EVALZIP} *.eval *.compare; \
cd ${MODEL_DIR} && zip ${MODEL_EVALZIP} *.*; \
rm -f ${MODEL_DIR}/*.eval; \
rm -f ${MODEL_DIR}/*.compare; \
rm -f ${MODEL_DIR}/*.comet; \
rm -f ${MODEL_DIR}.done; \
rmdir ${MODEL_DIR}; \
fi
# cd ${MODEL_DIR} && zip ${MODEL_EVALZIP} *.eval *.compare;
## temporary directory with all benchmark results
${MODEL_DIR}.done:
@@ -148,15 +200,46 @@ cleanup:
## fetch translation model
.PHONY: fetch
fetch: ${WORK_DIR}/model/decoder.yml
fetch: ${WORK_DIR}/model/decoder.yml ${MODEL_DIR}
## prepare the model evaluation file directory
## fetch already existing evaluations
${MODEL_DIR}:
mkdir -p $@
-if [ -e ${MODEL_EVALZIP} ]; then \
cd ${MODEL_DIR}; \
unzip -n ${MODEL_EVALZIP}; \
fi
-${WGET} -q -O ${MODEL_DIR}/eval.zip ${MODEL_EVAL_URL}
-if [ -e ${MODEL_DIR}/eval.zip ]; then \
cd ${MODEL_DIR}; \
unzip -n eval.zip; \
rm -f eval.zip; \
fi
localmodel:
if [ -e ${MODEL_HOME}/${MODEL_DIST} ]; then \
echo "local model found: ${MODEL_HOME}/${MODEL_DIST}"; \
else \
echo "${MODEL_URL}"; \
fi
## fetch the model (either from local release dir or from the model storage)
${WORK_DIR}/model/decoder.yml:
mkdir -p ${dir $@}
${WGET} -q -O ${dir $@}model.zip ${MODEL_URL}
if [ -e ${MODEL_HOME}/${MODEL_DIST} ]; then \
cp ${MODEL_HOME}/${MODEL_DIST} ${dir $@}model.zip; \
else \
${WGET} -q -O ${dir $@}model.zip ${MODEL_URL}; \
fi
unzip -d ${dir $@} ${dir $@}model.zip
## fix an old problem with the pre-process script
mv ${dir $@}preprocess.sh ${dir $@}preprocess-old.sh
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
sed -e 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
-e 's#/projappl/project_2001569#$${HOME}/projappl#' \
-e 's#SPMENCODE=.*$$#SPMENCODE=`which spm_encode || echo "$${PWD}/tools/marian-dev/build/spm_encode"`#' \
< ${dir $@}preprocess-old.sh > ${dir $@}preprocess.sh
chmod +x ${dir $@}preprocess.sh
@@ -194,6 +277,33 @@ TESTSETS = ${notdir ${basename ${wildcard ${TESTSET_DIR}/*.${SRC}}}}
TESTSET = ${firstword ${TESTSETS}}
MODEL_EVAL_MISSING = $(patsubst %,%.missing,${ALL_LANGPAIRS})
METRICS = bleu chrf comet
.PHONY: find-missing
find-missing: models.missing
models.missing: ${MODEL_EVAL_MISSING}
find . -name '*.missing' | xargs cat | cut -f1 | sort -u > $@
${MODEL_EVAL_MISSING}:
if [ -e ${LEADERBOARD_DIR}/$(@:.missing=)/model-list.txt ]; then \
for m in `grep 'Tatoeba-MT-models' ${LEADERBOARD_DIR}/$(@:.missing=)/model-list.txt`; do\
for t in $(sort $(basename $(filter-out %.labels,$(notdir $(wildcard ${TESTSET_HOME}/$(@:.missing=)/*.*))))); do \
for b in ${METRICS}; do \
if [ ! -f ${LEADERBOARD_DIR}/$(@:.missing=)/$$t/$$b-scores.txt ]; then \
echo "$$m $$t $$b" | sed 's#^.*MT-models/##' >> $@; \
elif [ `grep "$$m" ${LEADERBOARD_DIR}/$(@:.missing=)/$$t/$$b-scores.txt | wc -l` -eq 0 ]; then \
echo "$$m $$t $$b" | sed 's#^.*MT-models/##' >> $@; \
fi \
done \
done \
done \
fi
# for t in `find ${LEADERBOARD_DIR}/$$l -mindepth 1 -maxdepth 1 -type d -printf " %f"`; do \
## eval all language pairs
.PHONY: eval-langpairs
eval-langpairs:
@@ -205,9 +315,50 @@ eval-langpairs:
.PHONY: eval-testsets
eval-testsets:
for t in ${TESTSETS}; do \
${MAKE} TESTSET=$$t eval; \
${MAKE} TESTSET=$$t eval comet-eval; \
done
## make score files for individual metrics
## (more convenient to read and extend with new metrics)
## TODO: make them by default and create proper dependencies
individual-scores: ${MODEL_BLEUSCORES} ${MODEL_CHRFSCORES} ${MODEL_COMETSCORES}
${MODEL_BLEUSCORES}: ${MODEL_SCORES}
cut -f1,2,4 ${MODEL_SCORES} | \
sed 's/\(news.*[0-9][0-9][0-9][0-9]\)\-[a-z][a-z][a-z][a-z] /\1 /' |\
sed -e 's/\(news.*2021\)\.[a-z][a-z]\-[a-z][a-z] /\1 /' |\
rev | uniq -f1 | rev > $@
${MODEL_CHRFSCORES}: ${MODEL_SCORES}
cut -f1,2,3 ${MODEL_SCORES} |\
sed 's/\(news.*[0-9][0-9][0-9][0-9]\)\-[a-z][a-z][a-z][a-z] /\1 /' |\
sed -e 's/\(news.*2021\)\.[a-z][a-z]\-[a-z][a-z] /\1 /' |\
rev | uniq -f1 | rev > $@
EVAL_FILES = ${wildcard ${MODEL_DIR}/*.eval}
COMET_EVAL_FILES = ${wildcard ${MODEL_DIR}/*.comet}
${MODEL_COMETSCORES}: ${COMET_EVAL_FILES}
if [ -d ${MODEL_DIR} ]; then \
mkdir -p $(dir $@); \
grep -H COMET ${MODEL_DIR}/*eval | sort > $@.comet; \
cut -f1 -d: $@.comet | rev | cut -f2 -d. | rev > $@.langs; \
cut -f1 -d: $@.comet | rev | cut -f1 -d/ | cut -f3- -d. | rev > $@.testsets; \
cat $@.comet | rev | cut -f1 -d' ' | rev > $@.comet-scores; \
paste $@.langs $@.testsets $@.comet-scores >> $@; \
cat $@ |\
sed -e 's/\(news.*[0-9][0-9][0-9][0-9]\)-[a-z][a-z][a-z][a-z] /\1 /' | \
sed -e 's/\(news.*2021\)\.[a-z][a-z]\-[a-z][a-z] /\1 /' |\
rev | uniq -f1 | rev > $@.sorted; \
mv -f $@.sorted $@; \
rm -f $@.comet $@.langs $@.testsets $@.comet-scores; \
fi
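The cut/rev pipeline above reconstructs the language-pair and test-set columns from the file paths in the grep -H prefix, assuming evaluation files are named TESTSET.SRC-TRG.eval inside the model directory. A worked example with a hypothetical path:

    f='work/model/flores101-devtest.deu-eng.eval'
    echo "$f" | rev | cut -f2 -d. | rev                   # -> deu-eng
    echo "$f" | rev | cut -f1 -d/ | cut -f3- -d. | rev    # -> flores101-devtest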
#-------------------------------------------------
# create input file for translation
#-------------------------------------------------
@ -224,6 +375,9 @@ else
USE_TARGET_LABELS = 0
endif
ifneq (${wildcard ${WORK_DIR}/model/preprocess.sh},)
## double-check whether the preprocessing script
## requires both language IDs or not
ifeq (${shell grep 'source-langid target-langid' ${WORK_DIR}/model/preprocess.sh 2>/dev/null | wc -l},1)
@ -237,6 +391,8 @@ else
PREPROCESS = ${WORK_DIR}/model/preprocess.sh ${SRC} ${WORK_DIR}/model/source.spm
endif
endif
${WORK_DIR}/${TESTSET}.${LANGPAIR}.input: ${TESTSET_DIR}/${TESTSET}.${SRC}
${PREPROCESS} < $< > $@
@@ -306,14 +462,71 @@ ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval:
fi
## make the comet score
.PHONY: comet
comet:
${MAKE} fetch
${MAKE} comet-langpairs
${MAKE} ${MODEL_COMETSCORES}
comet-score-file: ${MODEL_COMETSCORES}
comet-register-scores: ${MODEL_COMETSCORES:.txt=.registered}
bleu-register-scores: ${MODEL_BLEUSCORES:.txt=.registered}
chrf-register-scores: ${MODEL_CHRFSCORES:.txt=.registered}
.PHONY: comet-langpairs
comet-langpairs:
for l in ${LANGPAIRS}; do \
${MAKE} LANGPAIR=$$l comet-testsets; \
done
.PHONY: comet-testsets
comet-testsets:
for t in ${TESTSETS}; do \
${MAKE} TESTSET=$$t comet-eval; \
done
.PHONY: comet-eval
comet-eval: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.comet
ifneq (${GPU_AVAILABLE},1)
COMET_PARAM += --gpus 0
endif
${MODEL_DIR}/${TESTSET}.${LANGPAIR}.comet: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval
mkdir -p ${dir $@}
sed -n '1~4p' $(<:.eval=.compare) > $@.src
sed -n '2~4p' $(<:.eval=.compare) > $@.ref
sed -n '3~4p' $(<:.eval=.compare) > $@.hyp
${LOAD_COMET_ENV} comet-score ${COMET_PARAM} \
-s $@.src -r $@.ref -t $@.hyp | cut -f2,3 > $@
tail -1 $@ | sed 's/^.*score:/COMET+default =/' >> $<
rm -f $@.src $@.ref $@.hyp
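The sed steps rely on the apparent layout of the .compare files: repeating four-line blocks holding source, reference, hypothesis and a separator line. GNU sed's first~step addressing slices those blocks into the parallel files that comet-score expects (hypothetical file names):

    sed -n '1~4p' model.compare > model.src    # lines 1, 5, 9, ...  (source)
    sed -n '2~4p' model.compare > model.ref    # lines 2, 6, 10, ... (reference)
    sed -n '3~4p' model.compare > model.hyp    # lines 3, 7, 11, ... (hypothesis)
    comet-score -s model.src -r model.ref -t model.hyp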
#-------------------------------------------------
# collect all scores in a file
#-------------------------------------------------
#
# updating scores for models that already have some scores registered
# - need to fetch eval file package
# - avoid re-running things that are already done
# - ingest the new evaluation scores
#
.PHONY: scores
scores: ${MODEL_SCORES}
${MODEL_SCORES}: ${TESTSET_INDEX}
${MODEL_SCORES}: ${TESTSET_INDEX} ${MODEL_COMETSCORES}
-if [ ! -e $@ ]; then \
mkdir -p $(dir $@); \
wget -qq -O $@ ${MODEL_STORAGE}/${MODEL}.scores.txt; \
fi
${MAKE} ${MODEL_DIR}.done
if [ -d ${MODEL_DIR} ]; then \
grep -H BLEU ${MODEL_DIR}/*eval | sort > $@.bleu; \
@@ -330,11 +543,17 @@ ${MODEL_SCORES}: ${TESTSET_INDEX}
cat $@.bleu | rev | cut -f1 -d' ' | rev | cut -f1 -d')' > $@.nrwords; \
paste $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords > $@; \
$@.urls $@.nrlines $@.nrwords >> $@; \
cat $@ | \
sed -e 's/\(news.*[0-9][0-9][0-9][0-9]\)-[a-z][a-z][a-z][a-z] /\1 /' | \
sed -e 's/\(news.*2021\)\.[a-z][a-z]\-[a-z][a-z] /\1 /' |\
rev | uniq -f5 | rev | sort -u > $@.sorted; \
mv -f $@.sorted $@; \
rm -f $@.bleu $@.chrf $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords; \
fi
${MAKE} individual-scores
@@ -348,6 +567,11 @@ SCOREFILES := ${wildcard ${MODEL_HOME}/*/*.scores.txt}
SCOREFILES_DONE = ${SCOREFILES:.txt=.registered}
SCOREFILE_DONE = ${MODEL_SCORES:.txt=.registered}
BLEUSCOREFILE_DONE = ${MODEL_BLEUSCORES:.txt=.registered}
CHRFSCOREFILE_DONE = ${MODEL_CHRFSCORES:.txt=.registered}
COMETSCOREFILE_DONE = ${MODEL_COMETSCORES:.txt=.registered}
## update all leader boards with all scores
update-leaderboards: ${SCOREFILES_DONE}
${MAKE} sort-leaderboards
@@ -355,13 +579,21 @@ update-leaderboards: ${SCOREFILES_DONE}
## register the scores for the current model
## (scores will be added to some temporary files sorted by language pair and benchmark)
## NOTE: this removes langIDs from newstest sets to avoid confusion and duplicates
register-scores: ${SCOREFILE_DONE}
# register-scores: ${SCOREFILE_DONE}
# register-scores: ${BLEUSCOREFILE_DONE} ${CHRFSCOREFILE_DONE} ${COMETSCOREFILE_DONE}
register-scores: ${SCOREFILE_DONE} ${COMETSCOREFILE_DONE}
${SCOREFILES_DONE}: %.registered: %.txt
@echo "register scores from ${patsubst ${MODEL_HOME}/%,%,$<}"
@cat $< | perl -e 'while (<>){ @a=split(/\t/); $$a[1]=~s/^(news.*)\-[a-z]{4}/$$1/; system "mkdir -p ${LEADERBOARD_DIR}/$$a[0]/$$a[1]"; open B,">>${LEADERBOARD_DIR}/$$a[0]/$$a[1]/bleu-scores.$(subst /,.,${patsubst ${MODEL_HOME}/%,%,$<}).unsorted.txt"; open C,">>${LEADERBOARD_DIR}/$$a[0]/$$a[1]/chrf-scores.$(subst /,.,${patsubst ${MODEL_HOME}/%,%,$<}).unsorted.txt"; print B "$$a[3]\t$$a[4]\n"; print C "$$a[2]\t$$a[4]\n"; close B; close C; }'
touch $@
${MODEL_DIR}.%-scores.registered: ${MODEL_DIR}.%-scores.txt
@echo "register scores from ${patsubst ${MODEL_HOME}/%,%,$<}"
@cat $< | perl -e 'while (<>){ chomp; @a=split(/\t/);system "mkdir -p ${LEADERBOARD_DIR}/$$a[0]/$$a[1]"; open C,">>${LEADERBOARD_DIR}/$$a[0]/$$a[1]/$(patsubst ${MODEL_DIR}.%-scores.txt,%-scores,$<).$(subst /,.,${patsubst ${MODEL_HOME}/%,%,$<}).unsorted.txt"; print C "$$a[2]\t${MODEL_URL}\n"; close C; }'
touch $@
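Registration appends one score-plus-URL line per row to an unsorted per-benchmark file under ${LEADERBOARD_DIR}/LANGPAIR/TESTSET/, which the sort-leaderboards targets below then merge and sort. A sketch of what a single registered row turns into, with hypothetical values:

    mkdir -p scores/deu-eng/flores101-devtest
    printf '0.8312\thttps://example.org/opus-2022-10-15.zip\n' \
      >> scores/deu-eng/flores101-devtest/comet-scores.opus-2022-10-15.unsorted.txt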
##-------------------------------------------------------------------
## UPDATE_SCORE_DIRS = directory that contains new scores
@@ -369,13 +601,17 @@ ${SCOREFILES_DONE}: %.registered: %.txt
## LEADERBOARDS_BLEU = list of chr-F leader boards that need to be sorted
##-------------------------------------------------------------------
UPDATE_SCORE_DIRS := $(sort $(dir ${wildcard ${LEADERBOARD_DIR}/*/*/*.unsorted.txt}))
LEADERBOARDS_BLEU := $(patsubst %,%bleu-scores.txt,${UPDATE_SCORE_DIRS})
LEADERBOARDS_CHRF := $(patsubst %,%chrf-scores.txt,${UPDATE_SCORE_DIRS})
LEADERBOARDS_COMET := $(patsubst %,%comet-scores.txt,${UPDATE_SCORE_DIRS})
## sort all leaderboards for which we have new unsorted scores
.PHONY: sort-leaderboards
sort-leaderboards: ${LEADERBOARDS_BLEU} ${LEADERBOARDS_CHRF}
.PHONY: sort-leaderboards sort-bleu-leaderboards sort-chrf-leaderboards sort-comet-leaderboards
sort-leaderboards: ${LEADERBOARDS_BLEU} ${LEADERBOARDS_CHRF} ${LEADERBOARDS_COMET}
sort-bleu-leaderboards: ${LEADERBOARDS_BLEU}
sort-chrf-leaderboards: ${LEADERBOARDS_CHRF}
sort-comet-leaderboards: ${LEADERBOARDS_COMET}
${LEADERBOARDS_BLEU}: ${UPDATE_SCORE_DIRS}
@echo "sort ${patsubst ${LEADERBOARD_DIR}/%,%,$@}"
@@ -389,3 +625,9 @@ ${LEADERBOARDS_CHRF}: ${UPDATE_SCORE_DIRS}
@rm -f $(dir $@)chrf-scores*.txt
@mv $@.sorted $@
${LEADERBOARDS_COMET}: ${UPDATE_SCORE_DIRS}
@echo "sort ${patsubst ${LEADERBOARD_DIR}/%,%,$@}"
@cat $(dir $@)comet-scores*.txt | grep '^[0-9]' | sort -k1,1nr | uniq -f1 > $@.sorted
@rm -f $(dir $@)comet-scores*.txt
@mv $@.sorted $@
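After the descending numeric sort on the score column, uniq -f1 skips the score field and compares the rest of the line (the model URL), so duplicate entries for the same model collapse into the highest-scoring one when they land next to each other. A small demonstration with made-up scores:

    printf '0.79\thttps://example.org/a.zip\n0.81\thttps://example.org/a.zip\n' |
      sort -k1,1nr | uniq -f1
    # -> 0.81  https://example.org/a.zip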

@@ -1 +1 @@
-Subproject commit f9afa950e26f5d548d955f92e83e6b8e10cc8438
+Subproject commit cff5336ec71b6fee396a95bb0e4bea365e0cd1e8

@@ -1 +1 @@
-Subproject commit 95720ae19fa21b1726787fb2db57535cafba84fa
+Subproject commit e27da623938b84f9abe600774af6fad4fd5f1dd6