Evaluation of all released Tatoeba models

This commit is contained in:
Joerg Tiedemann 2021-11-28 15:55:34 +02:00
parent 0017e83e6b
commit 2abd203be7
8 changed files with 264 additions and 30 deletions

3
.gitmodules vendored
View File

@ -28,3 +28,6 @@
[submodule "tools/jq"]
path = tools/jq
url = https://github.com/stedolan/jq.git
[submodule "OPUS-MT-testsets"]
path = OPUS-MT-testsets
url = https://github.com/Helsinki-NLP/OPUS-MT-testsets.git

1
OPUS-MT-testsets Submodule

@ -0,0 +1 @@
Subproject commit 3716e2e47ba13c9a90184570b2e1b80457951f2d

View File

@ -57,7 +57,7 @@ fetch-data:
if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \
b=OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}; \
cd $(dir $@); \
a-put -b $$b --nc --follow-links --override $(notdir $<); \
a-put -t ${TMPDIR} -b $$b --nc --follow-links --override $(notdir $<); \
if [ "`swift list $$b | grep '$(notdir $<).tar$$'`" == "$(notdir $<).tar" ]; then \
rm -fr $(notdir $<); \
touch $(notdir $@); \
@ -68,6 +68,7 @@ fetch-data:
fi \
fi
# -t /scratch/project_2001194
## fetch work data from allas (now with wget instead of a-get)
## advantage of wget: don't need to login

View File

@ -282,7 +282,7 @@ TUNE_GPUJOB_SUBMIT ?=
## existing projects in WORKHOME
ALL_LANG_PAIRS := ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
ALL_LANG_PAIRS := ${shell ls ${WORKHOME} 2>/dev/null | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
ALL_MULTILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -- '\+'}
@ -470,10 +470,10 @@ MARIAN_ATT_HEADS ?= 8
MARIAN_DIM_EMB ?= 512
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--mini-batch 100 --maxi-batch 200 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--mini-batch 8 --maxi-batch 100 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}

View File

@ -710,8 +710,8 @@ endif
include lib/preprocess.mk
include lib/bpe.mk
include lib/sentencepiece.mk
include ${REPOHOME}lib/preprocess.mk
include ${REPOHOME}lib/bpe.mk
include ${REPOHOME}lib/sentencepiece.mk

View File

@ -7,7 +7,6 @@
SHELL := /bin/bash
# job-specific settings (overwrite if necessary)
# HPC_EXTRA: additional SBATCH commands
@ -40,15 +39,15 @@ LOAD_MARIAN_BUILD_ENV = echo "nothing to load"
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
include lib/env/mahti.mk
include ${REPOHOME}lib/env/mahti.mk
else ifeq (${shell hostname},dx6-ibs-p2)
include lib/env/dx6.mk
include ${REPOHOME}lib/env/dx6.mk
else ifeq (${shell hostname},dx7-nkiel-4gpu)
include lib/env/dx7.mk
include ${REPOHOME}lib/env/dx7.mk
else ifneq ($(wildcard /wrk/tiedeman/research),)
include lib/env/taito.mk
include ${REPOHOME}lib/env/taito.mk
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
include lib/env/puhti.mk
include ${REPOHOME}lib/env/puhti.mk
endif

View File

@ -5,24 +5,24 @@
#
include lib/projects/celtic.mk
include lib/projects/finland.mk
include lib/projects/fiskmo.mk
include lib/projects/memad.mk
include lib/projects/multilingual.mk
include lib/projects/opus.mk
include lib/projects/romance.mk
include lib/projects/russian.mk
include lib/projects/sami.mk
include lib/projects/finno-ugric.mk
include lib/projects/wikimedia.mk
include lib/projects/wikimatrix.mk
include ${REPOHOME}lib/projects/celtic.mk
include ${REPOHOME}lib/projects/finland.mk
include ${REPOHOME}lib/projects/fiskmo.mk
include ${REPOHOME}lib/projects/memad.mk
include ${REPOHOME}lib/projects/multilingual.mk
include ${REPOHOME}lib/projects/opus.mk
include ${REPOHOME}lib/projects/romance.mk
include ${REPOHOME}lib/projects/russian.mk
include ${REPOHOME}lib/projects/sami.mk
include ${REPOHOME}lib/projects/finno-ugric.mk
include ${REPOHOME}lib/projects/wikimedia.mk
include ${REPOHOME}lib/projects/wikimatrix.mk
include lib/projects/doclevel.mk
include lib/projects/simplify.mk
include ${REPOHOME}lib/projects/doclevel.mk
include ${REPOHOME}lib/projects/simplify.mk
include lib/projects/tatoeba.mk
include ${REPOHOME}lib/projects/tatoeba.mk
include lib/projects/americasnlp2021.mk
include ${REPOHOME}lib/projects/americasnlp2021.mk
include lib/projects/distill.mk
include ${REPOHOME}lib/projects/distill.mk

230
tatoeba/eval/Makefile Normal file
View File

@ -0,0 +1,230 @@
#
# Evaluate released Tatoeba MT models against the benchmarks
# collected in the OPUS-MT-testsets submodule.
#

## Repository root (the trailing '/' is required) — used to locate
## the shared makefile library included below.
SHELL := bash
PWD := $(shell pwd)
REPOHOME := $(PWD)/../../

include $(REPOHOME)lib/env.mk
include $(REPOHOME)lib/config.mk
include $(REPOHOME)lib/slurm.mk

## Object storage with all released models; index.txt lists every
## distributed zip file. MODEL_DIST defaults to the first entry and
## is overridden per model by recursive make calls, so the derived
## MODEL* variables use deferred '=' assignment.
MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
MODEL_DISTS := $(shell wget -q -O - $(MODEL_STORAGE)/index.txt | grep '.zip$$')
MODEL_DIST = $(firstword $(MODEL_DISTS))
MODEL = $(MODEL_DIST:.zip=)
MODEL_LANGPAIR = $(firstword $(subst /, ,$(MODEL_DIST)))
MODEL_URL = $(MODEL_STORAGE)/$(MODEL_DIST)

## benchmark test sets (git submodule OPUS-MT-testsets)
TESTSET_HOME := $(REPOHOME)OPUS-MT-testsets/testsets
## scratch area where each model is unpacked and translations are kept
WORK_HOME = $(PWD)
WORK_DIR = $(WORK_HOME)/$(MODEL)

## per-model result locations: a directory with the raw evaluation
## files, a score summary file, and a zip of the evaluation output
MODEL_HOME = $(REPOHOME)models-tatoeba
MODEL_DIR = $(MODEL_HOME)/$(MODEL)
MODEL_SCORES = $(MODEL_DIR).scores.txt
MODEL_EVALZIP = $(MODEL_DIR).eval.zip

## one evaluation zip file per released model distribution
MODEL_EVALZIPS := $(patsubst %.zip,$(MODEL_HOME)/%.eval.zip,$(MODEL_DISTS))
## default goal: build the evaluation zip-file for every released model
.PHONY: all
all: ${MODEL_EVALZIPS}
## test target: build only the first evaluation zip-file
.PHONY: first
first: $(firstword ${MODEL_EVALZIPS})
## zip-files with all evaluation files
## each target re-invokes make with MODEL_DIST set to the matching
## model distribution, so that all derived variables (MODEL, WORK_DIR,
## MODEL_DIR, ...) refer to that one model
${MODEL_EVALZIPS}:
${MAKE} MODEL_DIST=${patsubst ${MODEL_HOME}/%.eval.zip,%.zip,$@} eval-model
## evaluate one model: build its score file first, then pack the
## per-testset *.eval and *.compare files into the eval zip and
## remove the (now empty) result directory
.PHONY: eval-model
eval-model: ${MODEL_SCORES}
cd ${MODEL_DIR} && zip ${MODEL_EVALZIP} *.eval *.compare
rm -f ${MODEL_DIR}/*.eval
rm -f ${MODEL_DIR}/*.compare
rmdir ${MODEL_DIR}
## temporary directory with all benchmark results:
## fetch the model, evaluate all supported language pairs, clean up
${MODEL_DIR}:
${MAKE} fetch
${MAKE} eval-langpairs
${MAKE} cleanup
## cleanup some additional workfiles:
## remove translations and the unpacked model from the scratch area
## (plain rmdir: fails loudly if anything unexpected is left behind)
.PHONY: cleanup
cleanup:
rm -f ${WORK_DIR}/*.*
rm -f ${WORK_DIR}/model/*
rmdir ${WORK_DIR}/model
rmdir ${WORK_DIR}
#-------------------------------------------------
# fetch model and get supported languages
#-------------------------------------------------
## fetch translation model
.PHONY: fetch
fetch: ${WORK_DIR}/model/decoder.yml
## download and unpack the model package; decoder.yml acts as the
## stamp file marking a completely fetched model
${WORK_DIR}/model/decoder.yml:
mkdir -p ${dir $@}
wget -q -O ${dir $@}model.zip ${MODEL_URL}
unzip -d ${dir $@} ${dir $@}model.zip
## source/target languages supported by the model, read from the
## README shipped inside the model package (empty until the model is
## fetched, hence the deferred '=' assignments in this whole section)
SRCLANGS = $(shell grep 'source language(s)' $(WORK_DIR)/model/README.md 2>/dev/null | cut -f2 -d: | xargs)
TRGLANGS = $(shell grep 'target language(s)' $(WORK_DIR)/model/README.md 2>/dev/null | cut -f2 -d: | xargs)

## multilingual target side --> input needs target-language labels
ifneq ($(words $(TRGLANGS)),1)
USE_TARGET_LABELS = 1
else
USE_TARGET_LABELS = 0
endif

## cross-product of supported source and target languages
## (plus the language pair taken from the release path)
MODEL_LANGPAIRS = $(MODEL_LANGPAIR) \
	$(shell for s in $(SRCLANGS); do for t in $(TRGLANGS); do echo "$$s-$$t"; done done)

## restrict to language pairs that actually have benchmark data
ALL_LANGPAIRS = $(notdir $(wildcard $(TESTSET_HOME)/*))
LANGPAIRS = $(sort $(filter $(ALL_LANGPAIRS),$(MODEL_LANGPAIRS)))
LANGPAIR = $(firstword $(LANGPAIRS))
LANGPAIRSTR = $(LANGPAIRS)
SRC = $(firstword $(subst -, ,$(LANGPAIR)))
TRG = $(lastword $(subst -, ,$(LANGPAIR)))

## benchmarks available for the current language pair
TESTSET_DIR = $(TESTSET_HOME)/$(LANGPAIR)
TESTSETS = $(notdir $(basename $(wildcard $(TESTSET_DIR)/*.$(SRC))))
TESTSET = $(firstword $(TESTSETS))
## eval all language pairs supported by the model
## (recursive make sets LANGPAIR so SRC/TRG/TESTSETS follow along)
.PHONY: eval-langpairs
eval-langpairs:
for l in ${LANGPAIRS}; do \
${MAKE} LANGPAIR=$$l eval-testsets; \
done
## eval all testsets for the current langpair
.PHONY: eval-testsets
eval-testsets:
for t in ${TESTSETS}; do \
${MAKE} TESTSET=$$t eval; \
done
#-------------------------------------------------
# create input file for translation
#-------------------------------------------------
## preprocess the benchmark source file with the preprocess script and
## sentencepiece model shipped inside the downloaded model package
.PHONY: input
input: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.input
${WORK_DIR}/${TESTSET}.${LANGPAIR}.input: ${TESTSET_DIR}/${TESTSET}.${SRC}
## check whether we need to specify the target language with labels
ifeq (${USE_TARGET_LABELS},1)
${WORK_DIR}/model/preprocess.sh \
${SRC} ${TRG} \
${WORK_DIR}/model/source.spm \
< $< > $@
## replace default label if language labels are given:
## strip the first field (the global TRG label) and paste the
## per-sentence labels from the testset's .labels file instead
ifneq (${wildcard ${TESTSET_DIR}/${TESTSET}.${TRG}.labels},)
cut -f2- -d' ' $@ > $@.tmp1
sed 's/^/>>/;s/$$/<</' < ${TESTSET_DIR}/${TESTSET}.${TRG}.labels > $@.tmp2
paste -d' ' $@.tmp2 $@.tmp1 > $@
rm -f $@.tmp2 $@.tmp1
endif
else
${WORK_DIR}/model/preprocess.sh ${SRC} \
${WORK_DIR}/model/source.spm \
< $< > $@
endif
#-------------------------------------------------
# create output file (translation)
#-------------------------------------------------
## translate the preprocessed input with the downloaded model and undo
## the sentencepiece segmentation (drop spaces, turn '▁' markers back
## into spaces, trim leading/trailing blanks) to get detokenized text
.PHONY: output
output: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output
${WORK_DIR}/${TESTSET}.${LANGPAIR}.output: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.input
${LOAD_ENV} && ${MARIAN_DECODER} -i $< \
-c ${WORK_DIR}/model/decoder.yml \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' > $@
#-------------------------------------------------
# evaluation
#-------------------------------------------------
## score the system output with sacrebleu (BLEU, then chrF appended to
## the same .eval file) and write a human-readable .compare file with
## source / reference / hypothesis triples: XML entities are unescaped
## and 'sed n;n;G' inserts a blank line after every third line
.PHONY: eval
eval: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval
${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval:
${MAKE} ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output
mkdir -p ${dir $@}
cat ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output | \
sacrebleu ${TESTSET_DIR}/${TESTSET}.${TRG} > $@
cat ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output | \
sacrebleu --metrics=chrf --width=3 ${TESTSET_DIR}/${TESTSET}.${TRG} >> $@
paste -d "\n" \
${TESTSET_DIR}/${TESTSET}.${SRC} \
${TESTSET_DIR}/${TESTSET}.${TRG} \
${WORK_DIR}/${TESTSET}.${LANGPAIR}.output |\
sed -e "s/&apos;/'/g" \
-e 's/&quot;/"/g' \
-e 's/&lt;/</g' \
-e 's/&gt;/>/g' \
-e 's/&amp;/&/g' |\
sed 'n;n;G;' > ${@:.eval=.compare}
#-------------------------------------------------
# collect all scores in a file
#-------------------------------------------------
## assemble a tab-separated summary over all .eval files with the
## columns: langpair, testset, chrF score, BLEU score, model download
## URL, number of lines, number of words; the intermediate $@.* files
## hold one column each and are pasted together at the end.
## NOTE(review): the .compare files contain 4 lines per segment
## (src/ref/hyp/blank), hence the perl division by 4 for line counts.
.PHONY: scores
scores: ${MODEL_SCORES}
${MODEL_SCORES}:
${MAKE} ${MODEL_DIR}
grep -H BLEU ${MODEL_DIR}/*eval | sort > $@.bleu
grep -H chrF ${MODEL_DIR}/*eval | sort > $@.chrf
cut -f1 -d: $@.bleu | rev | cut -f2 -d. | rev > $@.langs
cut -f1 -d: $@.bleu | rev | cut -f1 -d/ | cut -f3- -d. | rev > $@.testsets
cat $@.chrf | rev | cut -f1 -d' ' | rev > $@.chrf-scores
cut -f2 -d= $@.bleu | cut -f2 -d' ' > $@.bleu-scores
cut -f1 -d: $@.bleu | rev | cut -f2,3 -d/ | \
rev | sed 's#^#${MODEL_STORAGE}/#' | sed 's/$$/.zip/' > $@.urls
cut -f1 -d: $@.bleu | sed 's/.eval$$/.compare/' | \
xargs wc -l | grep -v '[0-9] total' | \
perl -pe '$$_/=4;print "\n"' | tail -n +2 > $@.nrlines
cat $@.bleu | rev | cut -f1 -d' ' | rev | cut -f1 -d')' > $@.nrwords
paste $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords > $@
rm -f $@.bleu $@.chrf $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords