Evaluation of all released Tatoeba models

This commit is contained in:
Joerg Tiedemann 2021-11-28 15:55:34 +02:00
parent 0017e83e6b
commit 2abd203be7
8 changed files with 264 additions and 30 deletions

3
.gitmodules vendored
View File

@ -28,3 +28,6 @@
[submodule "tools/jq"]
path = tools/jq
url = https://github.com/stedolan/jq.git
[submodule "OPUS-MT-testsets"]
path = OPUS-MT-testsets
url = https://github.com/Helsinki-NLP/OPUS-MT-testsets.git

1
OPUS-MT-testsets Submodule

@ -0,0 +1 @@
Subproject commit 3716e2e47ba13c9a90184570b2e1b80457951f2d

View File

@ -57,7 +57,7 @@ fetch-data:
if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \
b=OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}; \
cd $(dir $@); \
a-put -b $$b --nc --follow-links --override $(notdir $<); \
a-put -t ${TMPDIR} -b $$b --nc --follow-links --override $(notdir $<); \
if [ "`swift list $$b | grep '$(notdir $<).tar$$'`" == "$(notdir $<).tar" ]; then \
rm -fr $(notdir $<); \
touch $(notdir $@); \
@ -68,6 +68,7 @@ fetch-data:
fi \
fi
# -t /scratch/project_2001194
## fetch work data from allas (now with wget instead of a-get)
## advantage of wget: don't need to login

View File

@ -282,7 +282,7 @@ TUNE_GPUJOB_SUBMIT ?=
## existing projects in WORKHOME
ALL_LANG_PAIRS := ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
ALL_LANG_PAIRS := ${shell ls ${WORKHOME} 2>/dev/null | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
ALL_MULTILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -- '\+'}
@ -470,10 +470,10 @@ MARIAN_ATT_HEADS ?= 8
MARIAN_DIM_EMB ?= 512
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--mini-batch 100 --maxi-batch 200 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--mini-batch 8 --maxi-batch 100 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}

View File

@ -710,8 +710,8 @@ endif
include lib/preprocess.mk
include lib/bpe.mk
include lib/sentencepiece.mk
include ${REPOHOME}lib/preprocess.mk
include ${REPOHOME}lib/bpe.mk
include ${REPOHOME}lib/sentencepiece.mk

View File

@ -7,7 +7,6 @@
SHELL := /bin/bash
# job-specific settings (overwrite if necessary)
# HPC_EXTRA: additional SBATCH commands
@ -40,15 +39,15 @@ LOAD_MARIAN_BUILD_ENV = echo "nothing to load"
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
include lib/env/mahti.mk
include ${REPOHOME}lib/env/mahti.mk
else ifeq (${shell hostname},dx6-ibs-p2)
include lib/env/dx6.mk
include ${REPOHOME}lib/env/dx6.mk
else ifeq (${shell hostname},dx7-nkiel-4gpu)
include lib/env/dx7.mk
include ${REPOHOME}lib/env/dx7.mk
else ifneq ($(wildcard /wrk/tiedeman/research),)
include lib/env/taito.mk
include ${REPOHOME}lib/env/taito.mk
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
include lib/env/puhti.mk
include ${REPOHOME}lib/env/puhti.mk
endif

View File

@ -5,24 +5,24 @@
#
include lib/projects/celtic.mk
include lib/projects/finland.mk
include lib/projects/fiskmo.mk
include lib/projects/memad.mk
include lib/projects/multilingual.mk
include lib/projects/opus.mk
include lib/projects/romance.mk
include lib/projects/russian.mk
include lib/projects/sami.mk
include lib/projects/finno-ugric.mk
include lib/projects/wikimedia.mk
include lib/projects/wikimatrix.mk
include ${REPOHOME}lib/projects/celtic.mk
include ${REPOHOME}lib/projects/finland.mk
include ${REPOHOME}lib/projects/fiskmo.mk
include ${REPOHOME}lib/projects/memad.mk
include ${REPOHOME}lib/projects/multilingual.mk
include ${REPOHOME}lib/projects/opus.mk
include ${REPOHOME}lib/projects/romance.mk
include ${REPOHOME}lib/projects/russian.mk
include ${REPOHOME}lib/projects/sami.mk
include ${REPOHOME}lib/projects/finno-ugric.mk
include ${REPOHOME}lib/projects/wikimedia.mk
include ${REPOHOME}lib/projects/wikimatrix.mk
include lib/projects/doclevel.mk
include lib/projects/simplify.mk
include ${REPOHOME}lib/projects/doclevel.mk
include ${REPOHOME}lib/projects/simplify.mk
include lib/projects/tatoeba.mk
include ${REPOHOME}lib/projects/tatoeba.mk
include lib/projects/americasnlp2021.mk
include ${REPOHOME}lib/projects/americasnlp2021.mk
include lib/projects/distill.mk
include ${REPOHOME}lib/projects/distill.mk

230
tatoeba/eval/Makefile Normal file
View File

@ -0,0 +1,230 @@
#
# Evaluate released Tatoeba MT models against the benchmarks
# collected in the OPUS-MT-testsets submodule.
#

## Repository root (the trailing '/' is required) — used to locate
## the shared makefile library included below.
SHELL := bash
PWD := $(shell pwd)
REPOHOME := $(PWD)/../../

include $(REPOHOME)lib/env.mk
include $(REPOHOME)lib/config.mk
include $(REPOHOME)lib/slurm.mk

## Object storage with all released models; index.txt lists every
## distributed zip file. MODEL_DIST defaults to the first entry and
## is overridden per model by recursive make calls, so the derived
## MODEL* variables use deferred '=' assignment.
MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
MODEL_DISTS := $(shell wget -q -O - $(MODEL_STORAGE)/index.txt | grep '.zip$$')
MODEL_DIST = $(firstword $(MODEL_DISTS))
MODEL = $(MODEL_DIST:.zip=)
MODEL_LANGPAIR = $(firstword $(subst /, ,$(MODEL_DIST)))
MODEL_URL = $(MODEL_STORAGE)/$(MODEL_DIST)

## benchmark test sets (git submodule OPUS-MT-testsets)
TESTSET_HOME := $(REPOHOME)OPUS-MT-testsets/testsets
## scratch area where each model is unpacked and translations are kept
WORK_HOME = $(PWD)
WORK_DIR = $(WORK_HOME)/$(MODEL)

## per-model result locations: a directory with the raw evaluation
## files, a score summary file, and a zip of the evaluation output
MODEL_HOME = $(REPOHOME)models-tatoeba
MODEL_DIR = $(MODEL_HOME)/$(MODEL)
MODEL_SCORES = $(MODEL_DIR).scores.txt
MODEL_EVALZIP = $(MODEL_DIR).eval.zip

## one evaluation zip file per released model distribution
MODEL_EVALZIPS := $(patsubst %.zip,$(MODEL_HOME)/%.eval.zip,$(MODEL_DISTS))
## default goal: build the evaluation zip-file for every released model
.PHONY: all
all: ${MODEL_EVALZIPS}
## test target: build only the first evaluation zip-file
.PHONY: first
first: $(firstword ${MODEL_EVALZIPS})
## zip-files with all evaluation files
## each target re-invokes make with MODEL_DIST set to the matching
## model distribution, so that all derived variables (MODEL, WORK_DIR,
## MODEL_DIR, ...) refer to that one model
${MODEL_EVALZIPS}:
${MAKE} MODEL_DIST=${patsubst ${MODEL_HOME}/%.eval.zip,%.zip,$@} eval-model
## evaluate one model: build its score file first, then pack the
## per-testset *.eval and *.compare files into the eval zip and
## remove the (now empty) result directory
.PHONY: eval-model
eval-model: ${MODEL_SCORES}
cd ${MODEL_DIR} && zip ${MODEL_EVALZIP} *.eval *.compare
rm -f ${MODEL_DIR}/*.eval
rm -f ${MODEL_DIR}/*.compare
rmdir ${MODEL_DIR}
## temporary directory with all benchmark results:
## fetch the model, evaluate all supported language pairs, clean up
${MODEL_DIR}:
${MAKE} fetch
${MAKE} eval-langpairs
${MAKE} cleanup
## cleanup some additional workfiles:
## remove translations and the unpacked model from the scratch area
## (plain rmdir: fails loudly if anything unexpected is left behind)
.PHONY: cleanup
cleanup:
rm -f ${WORK_DIR}/*.*
rm -f ${WORK_DIR}/model/*
rmdir ${WORK_DIR}/model
rmdir ${WORK_DIR}
#-------------------------------------------------
# fetch model and get supported languages
#-------------------------------------------------
## fetch translation model
.PHONY: fetch
fetch: ${WORK_DIR}/model/decoder.yml
## download and unpack the model package; decoder.yml acts as the
## stamp file marking a completely fetched model
${WORK_DIR}/model/decoder.yml:
mkdir -p ${dir $@}
wget -q -O ${dir $@}model.zip ${MODEL_URL}
unzip -d ${dir $@} ${dir $@}model.zip
## source/target languages supported by the model, read from the
## README shipped inside the model package (empty until the model is
## fetched, hence the deferred '=' assignments in this whole section)
SRCLANGS = $(shell grep 'source language(s)' $(WORK_DIR)/model/README.md 2>/dev/null | cut -f2 -d: | xargs)
TRGLANGS = $(shell grep 'target language(s)' $(WORK_DIR)/model/README.md 2>/dev/null | cut -f2 -d: | xargs)

## multilingual target side --> input needs target-language labels
ifneq ($(words $(TRGLANGS)),1)
USE_TARGET_LABELS = 1
else
USE_TARGET_LABELS = 0
endif

## cross-product of supported source and target languages
## (plus the language pair taken from the release path)
MODEL_LANGPAIRS = $(MODEL_LANGPAIR) \
	$(shell for s in $(SRCLANGS); do for t in $(TRGLANGS); do echo "$$s-$$t"; done done)

## restrict to language pairs that actually have benchmark data
ALL_LANGPAIRS = $(notdir $(wildcard $(TESTSET_HOME)/*))
LANGPAIRS = $(sort $(filter $(ALL_LANGPAIRS),$(MODEL_LANGPAIRS)))
LANGPAIR = $(firstword $(LANGPAIRS))
LANGPAIRSTR = $(LANGPAIRS)
SRC = $(firstword $(subst -, ,$(LANGPAIR)))
TRG = $(lastword $(subst -, ,$(LANGPAIR)))

## benchmarks available for the current language pair
TESTSET_DIR = $(TESTSET_HOME)/$(LANGPAIR)
TESTSETS = $(notdir $(basename $(wildcard $(TESTSET_DIR)/*.$(SRC))))
TESTSET = $(firstword $(TESTSETS))
## eval all language pairs supported by the model
## (recursive make sets LANGPAIR so SRC/TRG/TESTSETS follow along)
.PHONY: eval-langpairs
eval-langpairs:
for l in ${LANGPAIRS}; do \
${MAKE} LANGPAIR=$$l eval-testsets; \
done
## eval all testsets for the current langpair
.PHONY: eval-testsets
eval-testsets:
for t in ${TESTSETS}; do \
${MAKE} TESTSET=$$t eval; \
done
#-------------------------------------------------
# create input file for translation
#-------------------------------------------------
## preprocess the benchmark source file with the preprocess script and
## sentencepiece model shipped inside the downloaded model package
.PHONY: input
input: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.input
${WORK_DIR}/${TESTSET}.${LANGPAIR}.input: ${TESTSET_DIR}/${TESTSET}.${SRC}
## check whether we need to specify the target language with labels
ifeq (${USE_TARGET_LABELS},1)
${WORK_DIR}/model/preprocess.sh \
${SRC} ${TRG} \
${WORK_DIR}/model/source.spm \
< $< > $@
## replace default label if language labels are given:
## strip the first field (the global TRG label) and paste the
## per-sentence labels from the testset's .labels file instead
ifneq (${wildcard ${TESTSET_DIR}/${TESTSET}.${TRG}.labels},)
cut -f2- -d' ' $@ > $@.tmp1
sed 's/^/>>/;s/$$/<</' < ${TESTSET_DIR}/${TESTSET}.${TRG}.labels > $@.tmp2
paste -d' ' $@.tmp2 $@.tmp1 > $@
rm -f $@.tmp2 $@.tmp1
endif
else
${WORK_DIR}/model/preprocess.sh ${SRC} \
${WORK_DIR}/model/source.spm \
< $< > $@
endif
#-------------------------------------------------
# create output file (translation)
#-------------------------------------------------
## translate the preprocessed input with the downloaded model and undo
## the sentencepiece segmentation (drop spaces, turn '▁' markers back
## into spaces, trim leading/trailing blanks) to get detokenized text
.PHONY: output
output: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output
${WORK_DIR}/${TESTSET}.${LANGPAIR}.output: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.input
${LOAD_ENV} && ${MARIAN_DECODER} -i $< \
-c ${WORK_DIR}/model/decoder.yml \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' > $@
#-------------------------------------------------
# evaluation
#-------------------------------------------------
## score the system output with sacrebleu (BLEU, then chrF appended to
## the same .eval file) and write a human-readable .compare file with
## source / reference / hypothesis triples: XML entities are unescaped
## and 'sed n;n;G' inserts a blank line after every third line
.PHONY: eval
eval: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval
${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval:
${MAKE} ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output
mkdir -p ${dir $@}
cat ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output | \
sacrebleu ${TESTSET_DIR}/${TESTSET}.${TRG} > $@
cat ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output | \
sacrebleu --metrics=chrf --width=3 ${TESTSET_DIR}/${TESTSET}.${TRG} >> $@
paste -d "\n" \
${TESTSET_DIR}/${TESTSET}.${SRC} \
${TESTSET_DIR}/${TESTSET}.${TRG} \
${WORK_DIR}/${TESTSET}.${LANGPAIR}.output |\
sed -e "s/&apos;/'/g" \
-e 's/&quot;/"/g' \
-e 's/&lt;/</g' \
-e 's/&gt;/>/g' \
-e 's/&amp;/&/g' |\
sed 'n;n;G;' > ${@:.eval=.compare}
#-------------------------------------------------
# collect all scores in a file
#-------------------------------------------------
## assemble a tab-separated summary over all .eval files with the
## columns: langpair, testset, chrF score, BLEU score, model download
## URL, number of lines, number of words; the intermediate $@.* files
## hold one column each and are pasted together at the end.
## NOTE(review): the .compare files contain 4 lines per segment
## (src/ref/hyp/blank), hence the perl division by 4 for line counts.
.PHONY: scores
scores: ${MODEL_SCORES}
${MODEL_SCORES}:
${MAKE} ${MODEL_DIR}
grep -H BLEU ${MODEL_DIR}/*eval | sort > $@.bleu
grep -H chrF ${MODEL_DIR}/*eval | sort > $@.chrf
cut -f1 -d: $@.bleu | rev | cut -f2 -d. | rev > $@.langs
cut -f1 -d: $@.bleu | rev | cut -f1 -d/ | cut -f3- -d. | rev > $@.testsets
cat $@.chrf | rev | cut -f1 -d' ' | rev > $@.chrf-scores
cut -f2 -d= $@.bleu | cut -f2 -d' ' > $@.bleu-scores
cut -f1 -d: $@.bleu | rev | cut -f2,3 -d/ | \
rev | sed 's#^#${MODEL_STORAGE}/#' | sed 's/$$/.zip/' > $@.urls
cut -f1 -d: $@.bleu | sed 's/.eval$$/.compare/' | \
xargs wc -l | grep -v '[0-9] total' | \
perl -pe '$$_/=4;print "\n"' | tail -n +2 > $@.nrlines
cat $@.bleu | rev | cut -f1 -d' ' | rev | cut -f1 -d')' > $@.nrwords
paste $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords > $@
rm -f $@.bleu $@.chrf $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords