Merge branch 'master' of github.com:Helsinki-NLP/OPUS-MT-train

Joerg Tiedemann 2021-11-30 14:52:44 +02:00
commit b6bbca2000
11 changed files with 375 additions and 83 deletions

.gitmodules (vendored): 3 lines changed

@@ -28,3 +28,6 @@
[submodule "tools/jq"]
path = tools/jq
url = https://github.com/stedolan/jq.git
[submodule "OPUS-MT-testsets"]
path = OPUS-MT-testsets
url = https://github.com/Helsinki-NLP/OPUS-MT-testsets.git

OPUS-MT-testsets (submodule): 1 line changed

@@ -0,0 +1 @@
Subproject commit e417bd4cb2d3f1a9611c68891cae6896d7536dca


@@ -57,7 +57,7 @@ fetch-data:
if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \
b=OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}; \
cd $(dir $@); \
a-put -b $$b --nc --follow-links --override $(notdir $<); \
a-put -t ${TMPDIR} -b $$b --nc --follow-links --override $(notdir $<); \
if [ "`swift list $$b | grep '$(notdir $<).tar$$'`" == "$(notdir $<).tar" ]; then \
rm -fr $(notdir $<); \
touch $(notdir $@); \
@@ -68,6 +68,7 @@ fetch-data:
fi \
fi
# -t /scratch/project_2001194
## fetch work data from Allas (now with wget instead of a-get)
## advantage of wget: no login is needed
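A wget-based fetch could look roughly like the sketch below; the bucket naming mirrors the a-put call above, but the tarball name, the target name, and the exact URL layout on Allas are assumptions, not part of this commit:

## hypothetical sketch (bucket and tarball names assumed)
ALLAS_URL = https://object.pouta.csc.fi
%/fetch-work:
	b=OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}; \
	wget -q -O $(dir $@)work.tar ${ALLAS_URL}/$$b/work.tar; \
	tar -C $(dir $@) -xf $(dir $@)work.tar; \
	rm -f $(dir $@)work.tar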


@@ -278,7 +278,7 @@ TUNE_GPUJOB_SUBMIT ?=
## existing projects in WORKHOME
ALL_LANG_PAIRS := ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
ALL_LANG_PAIRS := ${shell ls ${WORKHOME} 2>/dev/null | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
ALL_MULTILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -- '\+'}
@@ -478,13 +478,6 @@ MARIAN_DEC_DEPTH ?= 6
MARIAN_ATT_HEADS ?= 8
MARIAN_DIM_EMB ?= 512
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
## TODO: currently marianNMT crashes with workspace > 26000
@@ -526,8 +519,6 @@ ifneq ("$(wildcard ${TRAIN_WEIGHTS})","")
MARIAN_TRAIN_WEIGHTS = --data-weighting ${TRAIN_WEIGHTS}
endif
### training a model with Marian NMT
##
## NR allows training several models for proper ensembling
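The NR variable presumably numbers independent training runs so that models trained with different seeds can later be combined into an ensemble; a hypothetical invocation (the workflow detail is an assumption, only the variable and target names come from this repository):

make SRCLANGS=de TRGLANGS=en NR=1 train
make SRCLANGS=de TRGLANGS=en NR=2 train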
@@ -543,6 +534,16 @@ else
endif
## decoder flags (CPU and GPU variants)
MARIAN_DECODER_GPU = -b 4 -n1 -d ${MARIAN_GPUS} --quiet-translation -w ${MARIAN_WORKSPACE} \
--mini-batch 768 --maxi-batch 2048 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16
MARIAN_DECODER_CPU = -b 4 -n1 --cpu-threads ${HPC_CORES} --quiet-translation \
--mini-batch ${HPC_CORES} --maxi-batch 100 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
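Both flag bundles go verbatim onto the Marian decoder command line; outside of make, the GPU variant would expand to a call along these lines, with illustrative placeholder values for ${MARIAN_GPUS}, ${MARIAN_WORKSPACE}, ${MARIAN_MAX_LENGTH} and the file names:

marian-decoder -c model/decoder.yml \
	-b 4 -n1 -d 0 --quiet-translation -w 10000 \
	--mini-batch 768 --maxi-batch 2048 --maxi-batch-sort src \
	--max-length 500 --max-length-crop --fp16 \
	< input.txt > output.txt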
## make some data size-specific configuration parameters


@@ -719,8 +719,8 @@ endif
include lib/preprocess.mk
include lib/bpe.mk
include lib/sentencepiece.mk
include ${REPOHOME}lib/preprocess.mk
include ${REPOHOME}lib/bpe.mk
include ${REPOHOME}lib/sentencepiece.mk


@@ -7,20 +7,19 @@
SHELL := /bin/bash
# job-specific settings (overwrite if necessary)
# HPC_EXTRA: additional SBATCH commands
NR_GPUS = 1
HPC_NODES = 1
HPC_DISK = 500
HPC_QUEUE = serial
NR_GPUS = 1
HPC_NODES = 1
HPC_DISK = 500
HPC_QUEUE = serial
HPC_GPUQUEUE = gpu
HPC_EXTRA =
MEM = 4g
THREADS = 1
WALLTIME = 72
MEM = 4g
THREADS = 1
WALLTIME = 72
GPUJOB_HPC_MEM ?= 4g
@@ -29,6 +28,10 @@ DEVICE = cuda
LOAD_CPU_ENV = echo "nothing to load"
LOAD_GPU_ENV = echo "nothing to load"
## default SLURM option to allocate GPU resources
HPC_GPU_ALLOCATION = --gres=gpu:${GPU}:${NR_GPUS}
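With, say, GPU=v100 and NR_GPUS=1 (the GPU type is site-specific and only an example here), this expands to the allocation line written into the job script:

#SBATCH --gres=gpu:v100:1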
WORKHOME = ${PWD}/work
@@ -40,15 +43,15 @@ LOAD_MARIAN_BUILD_ENV = echo "nothing to load"
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
include lib/env/mahti.mk
include ${REPOHOME}lib/env/mahti.mk
else ifeq (${shell hostname},dx6-ibs-p2)
include lib/env/dx6.mk
include ${REPOHOME}lib/env/dx6.mk
else ifeq (${shell hostname},dx7-nkiel-4gpu)
include lib/env/dx7.mk
include ${REPOHOME}lib/env/dx7.mk
else ifneq ($(wildcard /wrk/tiedeman/research),)
include lib/env/taito.mk
include ${REPOHOME}lib/env/taito.mk
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
include lib/env/puhti.mk
include ${REPOHOME}lib/env/puhti.mk
endif
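For these ${REPOHOME}-prefixed includes to keep working when make runs at the repository root, REPOHOME presumably defaults to the empty string there, so that ${REPOHOME}lib/env/puhti.mk resolves to plain lib/env/puhti.mk; a minimal sketch of that convention (the default itself is an assumption):

REPOHOME ?=
include ${REPOHOME}lib/env.mk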

lib/env/mahti.mk (vendored): 4 lines changed

@@ -41,6 +41,10 @@ LOAD_CPU_ENV = module load ${CPU_MODULES}
LOAD_GPU_ENV = module load ${GPU_MODULES}
## extra SLURM directives (up to 5 variables)
HPC_EXTRA1 = \#SBATCH --account=${CSCPROJECT}
## setup for compiling marian-nmt
MARIAN_BUILD_MODULES = gcc/10.3.0 cuda/11.4.2 cudnn/8.0.4.30-11.0-linux-x64 cmake/3.18.4 openblas/0.3.14-omp openmpi/4.0.5-cuda

lib/env/puhti.mk (vendored): 14 lines changed

@@ -27,12 +27,20 @@ export PATH := ${APPLHOME}/bin:${PATH}
CPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 python-env
GPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 python-env
LOAD_CPU_ENV = module load ${CPU_MODULES}
LOAD_GPU_ENV = module load ${GPU_MODULES}
LOAD_CPU_ENV = module load ${CPU_MODULES} && module list
LOAD_GPU_ENV = module load ${GPU_MODULES} && module list
ifdef HPC_DISK
HPC_GPU_ALLOCATION = --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}
HPC_CPU_EXTRA1 = \#SBATCH --gres=nvme:${HPC_DISK}
endif
## extra SLURM directives (up to 3 numbered variables)
HPC_EXTRA1 = \#SBATCH --account=${CSCPROJECT}
BUILD_MODULES = cmake perl/5.30.0
LOAD_BUILD_ENV = module load ${BUILD_MODULES}
LOAD_BUILD_ENV = module load ${BUILD_MODULES} && module list
MARIAN_BUILD_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 cmake/3.18.2
LOAD_MARIAN_BUILD_ENV = module purge && module load ${MARIAN_BUILD_MODULES}


@@ -5,24 +5,24 @@
#
include lib/projects/celtic.mk
include lib/projects/finland.mk
include lib/projects/fiskmo.mk
include lib/projects/memad.mk
include lib/projects/multilingual.mk
include lib/projects/opus.mk
include lib/projects/romance.mk
include lib/projects/russian.mk
include lib/projects/sami.mk
include lib/projects/finno-ugric.mk
include lib/projects/wikimedia.mk
include lib/projects/wikimatrix.mk
include ${REPOHOME}lib/projects/celtic.mk
include ${REPOHOME}lib/projects/finland.mk
include ${REPOHOME}lib/projects/fiskmo.mk
include ${REPOHOME}lib/projects/memad.mk
include ${REPOHOME}lib/projects/multilingual.mk
include ${REPOHOME}lib/projects/opus.mk
include ${REPOHOME}lib/projects/romance.mk
include ${REPOHOME}lib/projects/russian.mk
include ${REPOHOME}lib/projects/sami.mk
include ${REPOHOME}lib/projects/finno-ugric.mk
include ${REPOHOME}lib/projects/wikimedia.mk
include ${REPOHOME}lib/projects/wikimatrix.mk
include lib/projects/doclevel.mk
include lib/projects/simplify.mk
include ${REPOHOME}lib/projects/doclevel.mk
include ${REPOHOME}lib/projects/simplify.mk
include lib/projects/tatoeba.mk
include ${REPOHOME}lib/projects/tatoeba.mk
include lib/projects/americasnlp2021.mk
include ${REPOHOME}lib/projects/americasnlp2021.mk
include lib/projects/distill.mk
include ${REPOHOME}lib/projects/distill.mk


@@ -18,13 +18,14 @@ endif
## submit job to gpu queue
## echo '#SBATCH --exclude=r18g08' >> $@
SLURM_JOBNAME ?= $(subst -,,${LANGPAIRSTR})
%.submit:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "$(subst -,,${LANGPAIRSTR})${@:.submit=}"' >>$@
echo '#SBATCH -o $(subst -,,${LANGPAIRSTR})${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e $(subst -,,${LANGPAIRSTR})${@:.submit=}.err.%j' >> $@
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submit=}"' >>$@
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submit=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@
@@ -33,21 +34,16 @@ endif
echo '#SBATCH -n 1' >> $@
echo '#SBATCH -N 1' >> $@
echo '#SBATCH -p ${HPC_GPUQUEUE}' >> $@
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
endif
ifeq (${shell hostname --domain 2>/dev/null},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}' >> $@
else
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS}' >> $@
endif
echo '#SBATCH ${HPC_GPU_ALLOCATION}' >> $@
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
echo 'module use -a /proj/nlpl/modules' >> $@
for m in ${GPU_MODULES}; do \
echo "module load $$m" >> $@; \
done
echo 'module list' >> $@
echo '${HPC_EXTRA}' >> $@
echo '${HPC_EXTRA1}' >> $@
echo '${HPC_EXTRA2}' >> $@
echo '${HPC_EXTRA3}' >> $@
echo '${HPC_GPU_EXTRA1}' >> $@
echo '${HPC_GPU_EXTRA2}' >> $@
echo '${HPC_GPU_EXTRA3}' >> $@
echo '${LOAD_GPU_ENV}' >> $@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
echo 'pwd' >> $@
echo 'echo "Starting at `date`"' >> $@
@@ -65,31 +61,26 @@ endif
%.submitcpu:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "$(subst -,,${LANGPAIRSTR})${@:.submitcpu=}"' >>$@
echo '#SBATCH -o $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.out.%j' >> $@
echo '#SBATCH -e $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submitcpu=}"' >>$@
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submitcpu=}.out.%j' >> $@
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submitcpu=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@
echo '#SBATCH --mail-user=${EMAIL}' >> $@
endif
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
endif
ifeq (${shell hostname --domain 2>/dev/null},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
echo '#SBATCH --gres=nvme:${HPC_DISK}' >> $@
echo '#SBATCH --mail-type=END' >> $@
echo '#SBATCH --mail-user=${EMAIL}' >> $@
endif
echo '#SBATCH -n ${HPC_CORES}' >> $@
echo '#SBATCH -N ${HPC_NODES}' >> $@
echo '#SBATCH -p ${HPC_QUEUE}' >> $@
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
echo '${HPC_EXTRA}' >> $@
echo 'module use -a /proj/nlpl/modules' >> $@
for m in ${CPU_MODULES}; do \
echo "module load $$m" >> $@; \
done
echo 'module list' >> $@
echo '${HPC_EXTRA1}' >> $@
echo '${HPC_EXTRA2}' >> $@
echo '${HPC_EXTRA3}' >> $@
echo '${HPC_CPU_EXTRA1}' >> $@
echo '${HPC_CPU_EXTRA2}' >> $@
echo '${HPC_CPU_EXTRA3}' >> $@
echo '${LOAD_CPU_ENV}' >> $@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
echo 'pwd' >> $@
echo 'echo "Starting at `date`"' >> $@

tatoeba/eval/Makefile (new file): 280 lines added

@@ -0,0 +1,280 @@
#
# evaluate released Tatoeba MT models
# with existing benchmarks (collected in OPUS-MT-testsets)
#
## set the home directory of the repository
## this is needed to find the included makefiles
## (the trailing '/' is important)
SHELL := bash
PWD := ${shell pwd}
REPOHOME := ${PWD}/../../
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
MODEL_DISTS := ${shell wget -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$'}
MODEL_DIST = ${firstword ${MODEL_DISTS}}
MODEL = ${MODEL_DIST:.zip=}
MODEL_LANGPAIR = ${firstword ${subst /, ,${MODEL_DIST}}}
MODEL_URL = ${MODEL_STORAGE}/${MODEL_DIST}
## directory with all test sets (submodule OPUS-MT-testsets)
TESTSET_HOME := ${REPOHOME}OPUS-MT-testsets/testsets
TESTSET_INDEX := ${REPOHOME}OPUS-MT-testsets/index.txt
## work directory (for the temporary models)
WORK_HOME = ${PWD}
WORK_DIR = ${WORK_HOME}/${MODEL}
## model directory (for test results)
## model score file and zipfile with evaluation results
MODEL_HOME = ${REPOHOME}models-tatoeba
MODEL_DIR = ${MODEL_HOME}/${MODEL}
MODEL_SCORES = ${MODEL_DIR}.scores.txt
MODEL_EVALZIP = ${MODEL_DIR}.eval.zip
## all zip files with benchmark results
MODEL_EVALZIPS := ${patsubst %.zip,${MODEL_HOME}/%.eval.zip,${MODEL_DISTS}}
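As a concrete illustration of the naming scheme (the model name itself is invented), a distribution entry

MODEL_DIST     = eng-fin/opusTCv20210807-2021-11-29.zip

decomposes into

MODEL_LANGPAIR = eng-fin
MODEL          = eng-fin/opusTCv20210807-2021-11-29
MODEL_SCORES   = ${MODEL_HOME}/eng-fin/opusTCv20210807-2021-11-29.scores.txt
MODEL_EVALZIP  = ${MODEL_HOME}/eng-fin/opusTCv20210807-2021-11-29.eval.zip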
## make all evaluation zip-files
.PHONY: all
all: ${MODEL_EVALZIPS}
## test: make the first evaluation zip-file
.PHONY: first
first: $(firstword ${MODEL_EVALZIPS})
## zip-files with all evaluation files
## if the zip file already exists: unpack first
## to avoid re-doing things
## TODO: should also fetch from ObjectStorage if it exists there!
${MODEL_EVALZIPS}: ${TESTSET_INDEX}
if [ -e $@ ]; then \
mkdir -p ${@:.eval.zip=}; \
unzip -d ${@:.eval.zip=} $@; \
fi
${MAKE} MODEL_DIST=${patsubst ${MODEL_HOME}/%.eval.zip,%.zip,$@} eval-model
.PHONY: eval-model
eval-model: ${MODEL_SCORES}
if [ -d ${MODEL_DIR} ]; then \
cd ${MODEL_DIR} && zip ${MODEL_EVALZIP} *.eval *.compare; \
rm -f ${MODEL_DIR}/*.eval; \
rm -f ${MODEL_DIR}/*.compare; \
rm -f ${MODEL_DIR}.done; \
rmdir ${MODEL_DIR}; \
fi
## temporary directory with all benchmark results
${MODEL_DIR}.done:
${MAKE} fetch
${MAKE} eval-langpairs
${MAKE} cleanup
-touch $@
## clean up some additional work files
.PHONY: cleanup
cleanup:
rm -f ${WORK_DIR}/*.*
rm -f ${WORK_DIR}/model/*
rmdir ${WORK_DIR}/model
rmdir ${WORK_DIR}
rmdir ${WORK_HOME}/${MODEL_LANGPAIR}
#-------------------------------------------------
# fetch model and get supported languages
#-------------------------------------------------
## fetch translation model
.PHONY: fetch
fetch: ${WORK_DIR}/model/decoder.yml
${WORK_DIR}/model/decoder.yml:
mkdir -p ${dir $@}
wget -q -O ${dir $@}model.zip ${MODEL_URL}
unzip -d ${dir $@} ${dir $@}model.zip
## fix an old problem with the pre-process script
mv ${dir $@}preprocess.sh ${dir $@}preprocess-old.sh
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
< ${dir $@}preprocess-old.sh > ${dir $@}preprocess.sh
chmod +x ${dir $@}preprocess.sh
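The sed call above rewrites the perl cleanup line inside preprocess.sh: the replacement regex s/(?!\n)\p{C}/ /g maps every Unicode control or format character except the newline to a plain space, so stray control codes cannot reach the SentencePiece step. A line containing a literal tab, for instance, comes out with the tab replaced by a space.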
## get supported source and target languages
MODELINFO = ${WORK_DIR}/model/README.md
ifneq (${wildcard ${MODELINFO}},)
SRCLANGS = ${shell grep '\* source language(s)' ${MODELINFO} | cut -f2 -d: | xargs}
TRGLANGS = ${shell grep '\* valid language labels' ${MODELINFO} | cut -f2 -d: | tr '<>' ' ' | xargs}
ifeq (${words ${TRGLANGS}},0)
TRGLANGS = ${shell grep '\* target language(s)' ${MODELINFO} | cut -f2 -d: | xargs}
endif
endif
## all language pairs that the model supports
MODEL_LANGPAIRS = ${MODEL_LANGPAIR} \
${shell for s in ${SRCLANGS}; do for t in ${TRGLANGS}; do echo "$$s-$$t"; done; done}
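For example, with SRCLANGS = "deu nld" and TRGLANGS = "eng" (values invented for illustration), the loop produces deu-eng and nld-eng, which are then intersected with the available test-set directories below.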
## get language pairs for which we have test sets
ALL_LANGPAIRS := $(notdir ${wildcard ${TESTSET_HOME}/*})
LANGPAIRS = ${sort $(filter ${ALL_LANGPAIRS},${MODEL_LANGPAIRS})}
LANGPAIR = ${firstword ${LANGPAIRS}}
LANGPAIRSTR = ${LANGPAIR}
SRC = ${firstword ${subst -, ,${LANGPAIR}}}
TRG = ${lastword ${subst -, ,${LANGPAIR}}}
TESTSET_DIR = ${TESTSET_HOME}/${LANGPAIR}
TESTSETS = ${notdir ${basename ${wildcard ${TESTSET_DIR}/*.${SRC}}}}
TESTSET = ${firstword ${TESTSETS}}
## eval all language pairs
.PHONY: eval-langpairs
eval-langpairs:
for l in ${LANGPAIRS}; do \
${MAKE} LANGPAIR=$$l eval-testsets; \
done
## eval all testsets for the current langpair
.PHONY: eval-testsets
eval-testsets:
for t in ${TESTSETS}; do \
${MAKE} TESTSET=$$t eval; \
done
#-------------------------------------------------
# create input file for translation
#-------------------------------------------------
.PHONY: input
input: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.input
## more than one target language
## --> need target language labels
ifneq (${words ${TRGLANGS}},1)
USE_TARGET_LABELS = 1
else
USE_TARGET_LABELS = 0
endif
## double-check whether the preprocessing script
## requires both language IDs or not
ifeq (${shell grep 'source-langid target-langid' ${WORK_DIR}/model/preprocess.sh 2>/dev/null | wc -l},1)
USE_BOTH_LANGIDS = 1
endif
## take care of different calls to the pre-processing script
ifeq (${USE_BOTH_LANGIDS},1)
PREPROCESS = ${WORK_DIR}/model/preprocess.sh ${SRC} ${TRG} ${WORK_DIR}/model/source.spm
else
PREPROCESS = ${WORK_DIR}/model/preprocess.sh ${SRC} ${WORK_DIR}/model/source.spm
endif
${WORK_DIR}/${TESTSET}.${LANGPAIR}.input: ${TESTSET_DIR}/${TESTSET}.${SRC}
${PREPROCESS} < $< > $@
## check whether we need to replace the target language labels
ifeq (${USE_TARGET_LABELS},1)
ifneq (${wildcard ${TESTSET_DIR}/${TESTSET}.${TRG}.labels},)
cut -f2- -d' ' $@ > $@.tmp1
sed 's/^/>>/;s/$$/<</' < ${TESTSET_DIR}/${TESTSET}.${TRG}.labels > $@.tmp2
paste -d' ' $@.tmp2 $@.tmp1 > $@
rm -f $@.tmp2 $@.tmp1
endif
endif
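A worked example of the label replacement (sentence and labels invented), assuming the preprocessing step put a generic language label as the first token of each line: an input line ">>eng<< ▁hello ▁world" whose .labels entry is "fin" becomes ">>fin<< ▁hello ▁world". The cut drops the old first field, the sed wraps the label into ">>fin<<", and the paste glues label and text back together.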
#-------------------------------------------------
# create output file (translation)
#-------------------------------------------------
.PHONY: output
output: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output
${WORK_DIR}/${TESTSET}.${LANGPAIR}.output: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.input
if [ -e $< ]; then \
if [ -s $< ]; then \
${LOAD_ENV} && ${MARIAN_DECODER} -i $< \
-c ${WORK_DIR}/model/decoder.yml \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' > $@; \
fi \
fi
#-------------------------------------------------
# evaluation
#-------------------------------------------------
.PHONY: eval
eval: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval
## adjust tokenisation to non-space-separated languages
## TODO: is it correct to simply use 'zh' even for jpn or should we use 'intl'?
ifneq ($(filter cmn jpn yue zho,${TRG}),)
SACREBLEU_PARAMS = --tokenize zh
endif
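Outside of make, the two sacreBLEU calls in the recipe below amount to the following (file names are placeholders; the --tokenize zh option is only present for the target languages listed above):

sacrebleu --tokenize zh testset.zho < output.zho > result.eval
sacrebleu --tokenize zh --metrics=chrf --width=3 testset.zho < output.zho >> result.eval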
${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval:
${MAKE} ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output
if [ -e ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output ]; then \
if [ -s ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output ]; then \
mkdir -p ${dir $@}; \
cat ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output | \
sacrebleu ${SACREBLEU_PARAMS} ${TESTSET_DIR}/${TESTSET}.${TRG} > $@; \
cat ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output | \
sacrebleu ${SACREBLEU_PARAMS} --metrics=chrf --width=3 ${TESTSET_DIR}/${TESTSET}.${TRG} >> $@; \
paste -d "\n" \
${TESTSET_DIR}/${TESTSET}.${SRC} \
${TESTSET_DIR}/${TESTSET}.${TRG} \
${WORK_DIR}/${TESTSET}.${LANGPAIR}.output |\
sed -e "s/&apos;/'/g" \
-e 's/&quot;/"/g' \
-e 's/&lt;/</g' \
-e 's/&gt;/>/g' \
-e 's/&amp;/&/g' |\
sed 'n;n;G;' > ${@:.eval=.compare}; \
fi \
fi
#-------------------------------------------------
# collect all scores in a file
#-------------------------------------------------
.PHONY: scores
scores: ${MODEL_SCORES}
${MODEL_SCORES}: ${TESTSET_INDEX}
${MAKE} ${MODEL_DIR}.done
if [ -d ${MODEL_DIR} ]; then \
grep -H BLEU ${MODEL_DIR}/*eval | sort > $@.bleu; \
grep -H chrF ${MODEL_DIR}/*eval | sort > $@.chrf; \
cut -f1 -d: $@.bleu | rev | cut -f2 -d. | rev > $@.langs; \
cut -f1 -d: $@.bleu | rev | cut -f1 -d/ | cut -f3- -d. | rev > $@.testsets; \
cat $@.chrf | rev | cut -f1 -d' ' | rev > $@.chrf-scores; \
cut -f2 -d= $@.bleu | cut -f2 -d' ' > $@.bleu-scores; \
cut -f1 -d: $@.bleu | rev | cut -f2,3 -d/ | \
rev | sed 's#^#${MODEL_STORAGE}/#' | sed 's/$$/.zip/' > $@.urls; \
cut -f1 -d: $@.bleu | sed 's/.eval$$/.compare/' | \
xargs wc -l | grep -v '[0-9] total' | \
perl -pe '$$_/=4;print "\n"' | tail -n +2 > $@.nrlines; \
cat $@.bleu | rev | cut -f1 -d' ' | rev | cut -f1 -d')' > $@.nrwords; \
paste $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords > $@; \
rm -f $@.bleu $@.chrf $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords; \
fi
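A note on the line-count step above: each .compare file holds four lines per test segment (source, reference, hypothesis, plus the blank line added by sed 'n;n;G;' in the eval recipe), so the perl expression divides each wc -l count by four to recover the number of segments, e.g. a 400-line file corresponds to 100 sentences.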