mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-05 16:47:21 +03:00
more metrics
This commit is contained in:
parent
83abeb05eb
commit
8da7f7579f
@ -1 +1 @@
|
||||
Subproject commit 6c08afb24684b468635e0471f92efa3d6e3def82
|
||||
Subproject commit 7d96d730f001e6f634856cf973f1cad83ed64a9d
|
@ -43,18 +43,35 @@ ELG_EU_SELECTED_BIG = gmq zle zls zlw spa fra deu
|
||||
# "cat oci"
|
||||
|
||||
|
||||
## Umbrella target: runs the sla2eng, ine2eng and mul2eng targets.
## All work is done through the prerequisites. There is deliberately no
## recipe that calls ${MAKE} on the same targets again, because that would
## run each of them a second time.
## rus2eng and ukr2eng are not part of this target at the moment; the
## complete list would be:
# raul-timo: rus2eng ukr2eng sla2eng ine2eng mul2eng

.PHONY: raul-timo
raul-timo: sla2eng ine2eng mul2eng
|
||||
|
||||
## Bilingual setups: for each of the targets (the target name is used as
## the language-pair part of the tatoeba target names)
##   1. run tatoeba-<target>-data with FIT_DATA_SIZE=1000000
##   2. run tatoeba-<target>-data-5m
## Sub-makes are started with ${MAKE} (not a literal 'make') so that
## command-line flags such as -n, -k and the -j job server are passed on.
.PHONY: rus2eng ukr2eng
rus2eng ukr2eng:
	${MAKE} FIT_DATA_SIZE=1000000 MODELTYPE=transformer tatoeba-$@-data
	${MAKE} MODELTYPE=transformer tatoeba-$@-data-5m
|
||||
|
||||
## Multilingual setups: for each of the targets (the target name is used as
## the language-pair part of the tatoeba target names)
##   1. run tatoeba-<target>-data with DATA_SAMPLING_WEIGHT=0.5 and
##      MAX_DATA_SIZE=1000000
##   2. run tatoeba-<target>-data-5m0.5temp
## Sub-makes are started with ${MAKE} (not a literal 'make') so that
## command-line flags such as -n, -k and the -j job server are passed on.
.PHONY: sla2eng ine2eng mul2eng
sla2eng ine2eng mul2eng:
	${MAKE} DATA_SAMPLING_WEIGHT=0.5 MAX_DATA_SIZE=1000000 MODELTYPE=transformer tatoeba-$@-data
	${MAKE} MODELTYPE=transformer tatoeba-$@-data-5m0.5temp
|
||||
|
||||
|
||||
## Pattern rule: '<target>-5m' re-runs make for '<target>' (the stem of
## the pattern) with both FIT_DATA_SIZE and LANGGROUP_FIT_DATA_SIZE set to
## 5000000 and with '5m' appended to the DATASET name.
%-5m:
	${MAKE} \
		DATASET=${DATASET}5m \
		FIT_DATA_SIZE=5000000 \
		LANGGROUP_FIT_DATA_SIZE=5000000 \
		$*
|
||||
|
||||
## Pattern rule: '<target>-5m0.5temp' re-runs make for '<target>' (the
## stem of the pattern) with DATA_SAMPLING_WEIGHT=0.5, MAX_DATA_SIZE=5000000
## and with '5m' appended to the DATASET name.
%-5m0.5temp:
	${MAKE} \
		DATASET=${DATASET}5m \
		MAX_DATA_SIZE=5000000 \
		DATA_SAMPLING_WEIGHT=0.5 \
		$*
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -727,6 +727,12 @@ tatoeba-dist-4m:
|
||||
MARIAN_VALID_FREQ=10000 \
|
||||
${@:-4m=}
|
||||
|
||||
## Pattern rule: '<target>-5m' re-runs make for '<target>' (the stem of
## the pattern) with both FIT_DATA_SIZE and LANGGROUP_FIT_DATA_SIZE set to
## 5000000 and with '5m' appended to the DATASET name.
%-5m:
	${MAKE} \
		DATASET=${DATASET}5m \
		FIT_DATA_SIZE=5000000 \
		LANGGROUP_FIT_DATA_SIZE=5000000 \
		$*
|
||||
|
||||
|
||||
|
||||
## evaluate and create dist packages
|
||||
|
@ -20,6 +20,30 @@ redo-chrf:
|
||||
first: $(firstword ${MODEL_EVALZIPS})
|
||||
|
||||
|
||||
|
||||
|
||||
## do things in reverse order
|
||||
## (just to start another job)
|
||||
|
||||
## reverse: return the words of a list in reverse order
##   usage: $(call reverse,a b c)  -->  c b a
## Recursive definition: as long as the list has a second word, reverse
## everything after the first word and append the first word to the result;
## a list with fewer than two words is returned unchanged.
reverse = $(if $(word 2,$1),$(call reverse,$(wordlist 2,$(words $1),$1)) $(firstword $1),$1)

## the evaluation zip files in reverse order
MODEL_EVALZIPS_REVERSE = $(call reverse,${MODEL_EVALZIPS})

## same set of files as listed in MODEL_EVALZIPS, but requested
## starting from the end of the list
all-reverse: ${MODEL_EVALZIPS_REVERSE}
|
||||
|
||||
|
||||
|
||||
|
||||
## Re-run the all-reverse target with METRICS restricted to "bleu chrf"
## and REDO_CHRF_SCORES=1.
## The sub-make is started with ${MAKE} (not a literal 'make') so that
## command-line flags such as -n, -k and the -j job server are passed on.
.PHONY: redo-chrf-reverse
redo-chrf-reverse:
	${MAKE} METRICS="bleu chrf" REDO_CHRF_SCORES=1 all-reverse
|
||||
|
||||
## Run target 'all' with 16 parallel jobs and METRICS restricted to
## "spbleu chrf++".
## Declared phony because the target is a command, not a file; a file
## called 'add-new-metrics' must not stop it from running.
.PHONY: add-new-metrics
add-new-metrics:
	${MAKE} -j16 METRICS="spbleu chrf++" all
|
||||
|
||||
## Run the all-reverse target with METRICS restricted to "comet".
## The sub-make is started with ${MAKE} (not a literal 'make') so that
## command-line flags such as -n, -k and the -j job server are passed on.
.PHONY: all-comet-reverse
all-comet-reverse:
	${MAKE} METRICS="comet" all-reverse
|
||||
|
||||
#-------------------------------------------------
|
||||
## phony targets to evaluate only new models
|
||||
## or only models that exist locally
|
||||
@ -82,6 +106,9 @@ eval-model: ${MODEL_SCORES}
|
||||
rm -f ${MODEL_DIR}/*.bleu; \
|
||||
rm -f ${MODEL_DIR}/*.chrf; \
|
||||
rm -f ${MODEL_DIR}/*.comet; \
|
||||
for m in ${METRICS}; do \
|
||||
rm -f ${MODEL_DIR}/*.$$m; \
|
||||
done; \
|
||||
rm -f ${MODEL_DIR}.done; \
|
||||
rmdir ${MODEL_DIR}; \
|
||||
fi
|
||||
@ -104,7 +131,7 @@ cleanup:
|
||||
rm -f ${WORK_DIR}/model/*
|
||||
rmdir ${WORK_DIR}/model
|
||||
rmdir ${WORK_DIR}
|
||||
rmdir ${WORK_HOME}/${MODEL_LANGPAIR}
|
||||
-rmdir ${WORK_HOME}/${MODEL_LANGPAIR}
|
||||
|
||||
#-------------------------------------------------
|
||||
# fetch model and get supported languages
|
||||
@ -189,9 +216,7 @@ TESTSETS := ${notdir ${basename ${wildcard ${TESTSET_DIR}/*.${SRC}}}}
|
||||
TESTSET := ${firstword ${TESTSETS}}
|
||||
|
||||
|
||||
|
||||
MODEL_EVAL_MISSING := $(patsubst %,%.missing,${ALL_LANGPAIRS})
|
||||
METRICS := bleu chrf comet
|
||||
|
||||
.PHONY: find-missing
|
||||
find-missing: models.missing
|
||||
@ -251,7 +276,6 @@ ${MODEL_CHRFSCORES}: ${MODEL_SCORES}
|
||||
sed -e 's/\(news.*2021\)\.[a-z][a-z]\-[a-z][a-z] /\1 /' |\
|
||||
rev | uniq -f1 | rev > $@
|
||||
|
||||
|
||||
COMET_EVAL_FILES = ${wildcard ${MODEL_DIR}/*.comet}
|
||||
${MODEL_COMETSCORES}: ${COMET_EVAL_FILES}
|
||||
if [ -d ${MODEL_DIR} ]; then \
|
||||
@ -269,6 +293,28 @@ ${MODEL_COMETSCORES}: ${COMET_EVAL_FILES}
|
||||
rm -f $@.comet $@.langs $@.testsets $@.comet-scores; \
|
||||
fi
|
||||
|
||||
|
||||
## generic recipe for extracting scores for a metric
## (works for all sacrebleu results but not for other metrics)
##
## The stem of the pattern is the metric name, which is also the file
## extension of the result files in ${MODEL_DIR} (*.<metric>).
## If ${MODEL_DIR} exists:
##   1. collect all non-empty lines of ${MODEL_DIR}/*.<metric>, each
##      prefixed with the name of its file (grep -H)
##   2. take the second-last dot-separated part of the file name as the
##      language pair and the base name without its last two dot-separated
##      parts as the test set name
##   3. take the third space-separated field of each line as the score
##   4. append the tab-separated columns (language pair, test set, score)
##      to the target file
##   5. shorten the names of news test sets and collapse adjacent lines
##      that differ only in their last field
## The temporary files $@.all, $@.langs, $@.testsets and $@.scores are
## removed at the end. Nothing is done if ${MODEL_DIR} does not exist.
##
## NOTE: patsubst needs the text to work on as its third argument ($@);
##       without it make stops with "insufficient number of arguments".

${MODEL_DIR}.%-scores.txt: ${MODEL_SCORES} # ${MODEL_DIR}
	if [ -d ${MODEL_DIR} ]; then \
	  mkdir -p $(dir $@); \
	  grep -H . ${MODEL_DIR}/*.$(patsubst ${MODEL_DIR}.%-scores.txt,%,$@) > $@.all; \
	  cut -f1 -d: $@.all | rev | cut -f2 -d. | rev > $@.langs; \
	  cut -f1 -d: $@.all | rev | cut -f1 -d/ | cut -f3- -d. | rev > $@.testsets; \
	  cut -f3 -d ' ' $@.all > $@.scores; \
	  paste $@.langs $@.testsets $@.scores >> $@; \
	  cat $@ |\
	  sed -e 's/\(news.*[0-9][0-9][0-9][0-9]\)-[a-z][a-z][a-z][a-z]	/\1	/' | \
	  sed -e 's/\(news.*2021\)\.[a-z][a-z]\-[a-z][a-z]	/\1	/' |\
	  rev | uniq -f1 | rev > $@.sorted; \
	  mv -f $@.sorted $@; \
	  rm -f $@.all $@.langs $@.testsets $@.scores; \
	fi
|
||||
|
||||
|
||||
|
||||
#-------------------------------------------------
|
||||
# create input file for translation
|
||||
#-------------------------------------------------
|
||||
@ -393,11 +439,29 @@ ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.compare:
|
||||
fi
|
||||
|
||||
## adjust tokenisation to non-space-separated languages
##
## SACREBLEU_PARAMS is passed to sacrebleu by the scoring rules that
## reference it. It is only set here if the target language (TRG) is one of
## the languages listed below:
##   cmn, yue, zho  -->  --tokenize zh
##   jpn            -->  --tokenize ja-mecab
##   kor            -->  --tokenize ko-mecab
## Every condition is a separate, self-contained ifneq/endif block.
## NOTE(review): the MeCab-based tokenizers are optional extras of
## sacrebleu - confirm that they are installed where this is run.

ifneq ($(filter cmn yue zho,${TRG}),)
  SACREBLEU_PARAMS = --tokenize zh
endif

ifneq ($(filter jpn,${TRG}),)
  SACREBLEU_PARAMS = --tokenize ja-mecab
endif

ifneq ($(filter kor,${TRG}),)
  SACREBLEU_PARAMS = --tokenize ko-mecab
endif
|
||||
|
||||
|
||||
|
||||
## BLEU scores computed with sacrebleu's 'flores200' tokenizer
##
## The *.compare file is read in groups of four lines: the first line of
## each group is written to $@.src, the second to $@.ref and the third to
## $@.hyp. sacrebleu reads the hypotheses from stdin and the references
## from $@.ref; its text output becomes the target file. The temporary
## files are removed afterwards.
${MODEL_DIR}/${TESTSET}.${LANGPAIR}.spbleu: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.compare
	mkdir -p $(@D)
	sed -n '1~4p' $< > $@.src
	sed -n '2~4p' $< > $@.ref
	sed -n '3~4p' $< > $@.hyp
	sacrebleu -f text --metrics=bleu --tokenize flores200 $@.ref < $@.hyp > $@
	rm -f $@.src $@.ref $@.hyp
|
||||
|
||||
${MODEL_DIR}/${TESTSET}.${LANGPAIR}.bleu: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.compare
|
||||
mkdir -p ${dir $@}
|
||||
sed -n '1~4p' $< > $@.src
|
||||
@ -417,11 +481,24 @@ ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.chrf: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.co
|
||||
perl -pe 'unless (/version\:1\./){@a=split(/\s+/);$$a[-1]/=100;$$_=join(" ",@a);}' > $@
|
||||
rm -f $@.src $@.ref $@.hyp
|
||||
|
||||
## normalise to decimals from percentage (like it used to be)
|
||||
## let's not do this anymore and rather update all scores with new scores!
|
||||
##
|
||||
# perl -pe 'unless (/version\:1\./){@a=split(/\s+/);$$a[-1]/=100;$$_=join(" ",@a);}' > $@
|
||||
## chrF scores with word order 2 (--chrf-word-order 2)
##
## The *.compare file is read in groups of four lines: the first line of
## each group is written to $@.src, the second to $@.ref and the third to
## $@.hyp. sacrebleu reads the hypotheses from stdin and the references
## from $@.ref (with ${SACREBLEU_PARAMS} and 3 decimals). Unless an output
## line contains 'version:1.', the perl filter divides its last
## whitespace-separated field by 100. The temporary files are removed
## afterwards.
${MODEL_DIR}/${TESTSET}.${LANGPAIR}.chrf++: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.compare
	mkdir -p $(@D)
	sed -n '1~4p' $< > $@.src
	sed -n '2~4p' $< > $@.ref
	sed -n '3~4p' $< > $@.hyp
	sacrebleu -f text ${SACREBLEU_PARAMS} --metrics=chrf --width=3 --chrf-word-order 2 $@.ref < $@.hyp |\
	perl -pe 'unless (/version\:1\./){@a=split(/\s+/);$$a[-1]/=100;$$_=join(" ",@a);}' > $@
	rm -f $@.src $@.ref $@.hyp
|
||||
|
||||
## TER scores (sacrebleu --metrics=ter)
##
## The *.compare file is read in groups of four lines: the first line of
## each group is written to $@.src, the second to $@.ref and the third to
## $@.hyp. sacrebleu reads the hypotheses from stdin and the references
## from $@.ref (with ${SACREBLEU_PARAMS}); its text output becomes the
## target file. The temporary files are removed afterwards.
${MODEL_DIR}/${TESTSET}.${LANGPAIR}.ter: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.compare
	mkdir -p $(@D)
	sed -n '1~4p' $< > $@.src
	sed -n '2~4p' $< > $@.ref
	sed -n '3~4p' $< > $@.hyp
	sacrebleu -f text ${SACREBLEU_PARAMS} --metrics=ter $@.ref < $@.hyp > $@
	rm -f $@.src $@.ref $@.hyp
|
||||
|
||||
ifneq (${GPU_AVAILABLE},1)
|
||||
COMET_PARAM += --gpus 0
|
||||
|
@ -18,6 +18,8 @@ include ${REPOHOME}lib/slurm.mk
|
||||
GPUJOB_HPC_MEM = 20g
|
||||
|
||||
|
||||
METRICS := bleu spbleu chrf chrf++ comet
|
||||
|
||||
MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
|
||||
ifndef MODEL_DISTS
|
||||
ifneq ($(wildcard models.missing),)
|
||||
@ -56,3 +58,7 @@ MODEL_COMETSCORES = ${MODEL_DIR}.comet-scores.txt
|
||||
|
||||
## all zip files with benchmark results
|
||||
MODEL_EVALZIPS := ${patsubst %.zip,${MODEL_HOME}/%.eval.zip,${MODEL_DISTS}}
|
||||
|
||||
|
||||
## collected scores for a model using a specific metric
|
||||
MODEL_METRIC_SCORES = $(patsubst %,${MODEL_DIR}.%-scores.txt,${METRICS})
|
||||
|
Loading…
Reference in New Issue
Block a user