Mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git (synced 2024-11-30 12:32:24 +03:00)

Commit b6bbca2000: Merge branch 'master' of github.com:Helsinki-NLP/OPUS-MT-train
.gitmodules (vendored, 3 changes)
@@ -28,3 +28,6 @@
[submodule "tools/jq"]
path = tools/jq
url = https://github.com/stedolan/jq.git
[submodule "OPUS-MT-testsets"]
path = OPUS-MT-testsets
url = https://github.com/Helsinki-NLP/OPUS-MT-testsets.git
OPUS-MT-testsets (new submodule, 1 change)
@@ -0,0 +1 @@
Subproject commit e417bd4cb2d3f1a9611c68891cae6896d7536dca
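After checking out this commit, the new test-set submodule still has to be fetched separately; a standard git invocation such as the following should do it (not part of the commit itself):

    git submodule update --init OPUS-MT-testsets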
@@ -57,7 +57,7 @@ fetch-data:
if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \
b=OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}; \
cd $(dir $@); \
a-put -b $$b --nc --follow-links --override $(notdir $<); \
a-put -t ${TMPDIR} -b $$b --nc --follow-links --override $(notdir $<); \
if [ "`swift list $$b | grep '$(notdir $<).tar$$'`" == "$(notdir $<).tar" ]; then \
rm -fr $(notdir $<); \
touch $(notdir $@); \
@@ -68,6 +68,7 @@ fetch-data:
fi \
fi

# -t /scratch/project_2001194

## fetch work data from allas (now with wget instead of a-get)
## advantage of wget: don't need to login
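The upload can also be double-checked by hand with the same swift listing the recipe uses; bucket and archive names below are only illustrative, following the recipe's OPUS-MT-train_<workdir>-<user> pattern:

    swift list OPUS-MT-train_work-eng-deu-$USER | grep 'eng-deu.tar'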
@@ -278,7 +278,7 @@ TUNE_GPUJOB_SUBMIT ?=


## existing projects in WORKHOME
ALL_LANG_PAIRS := ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
ALL_LANG_PAIRS := ${shell ls ${WORKHOME} 2>/dev/null | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
ALL_MULTILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -- '\+'}

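The added 2>/dev/null matters because this listing runs every time make parses the file; the effect can be reproduced in a plain shell (path illustrative):

    # without the redirect a missing work directory prints an error on every make run,
    # with it the listing just comes back empty
    ls /path/to/missing/work | grep -- '-' | grep -v old
    ls /path/to/missing/work 2>/dev/null | grep -- '-' | grep -v old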
@@ -478,13 +478,6 @@ MARIAN_DEC_DEPTH ?= 6
MARIAN_ATT_HEADS ?= 8
MARIAN_DIM_EMB ?= 512

MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}


## TODO: currently marianNMT crashes with workspace > 26000
@@ -526,8 +519,6 @@ ifneq ("$(wildcard ${TRAIN_WEIGHTS})","")
MARIAN_TRAIN_WEIGHTS = --data-weighting ${TRAIN_WEIGHTS}
endif



### training a model with Marian NMT
##
## NR allows to train several models for proper ensembling
@@ -543,6 +534,16 @@ else
endif


## decoder flags (CPU and GPU variants)

MARIAN_DECODER_GPU = -b 4 -n1 -d ${MARIAN_GPUS} --quiet-translation -w ${MARIAN_WORKSPACE} \
--mini-batch 768 --maxi-batch 2048 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16
MARIAN_DECODER_CPU = -b 4 -n1 --cpu-threads ${HPC_CORES} --quiet-translation \
--mini-batch ${HPC_CORES} --maxi-batch 100 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop --fp16
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}



## make some data size-specific configuration parameters
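These variables are later passed verbatim to marian-decoder (see the output rule in tatoeba/eval/Makefile further down). Expanded by hand, the GPU variant corresponds roughly to a call like the following; the workspace size and maximum length are placeholder values:

    # illustrative expansion only; 10000 (workspace MB) and 500 (max length) are made up
    marian-decoder -c model/decoder.yml -i input.txt \
        -b 4 -n1 -d 0 --quiet-translation -w 10000 \
        --mini-batch 768 --maxi-batch 2048 --maxi-batch-sort src \
        --max-length 500 --max-length-crop --fp16 > output.txt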
@@ -719,8 +719,8 @@ endif



include lib/preprocess.mk
include lib/bpe.mk
include lib/sentencepiece.mk
include ${REPOHOME}lib/preprocess.mk
include ${REPOHOME}lib/bpe.mk
include ${REPOHOME}lib/sentencepiece.mk

lib/env.mk (31 changes)
@@ -7,20 +7,19 @@

SHELL := /bin/bash


# job-specific settings (overwrite if necessary)
# HPC_EXTRA: additional SBATCH commands

NR_GPUS = 1
HPC_NODES = 1
HPC_DISK = 500
HPC_QUEUE = serial
NR_GPUS = 1
HPC_NODES = 1
HPC_DISK = 500
HPC_QUEUE = serial
HPC_GPUQUEUE = gpu
HPC_EXTRA =

MEM = 4g
THREADS = 1
WALLTIME = 72

MEM = 4g
THREADS = 1
WALLTIME = 72

GPUJOB_HPC_MEM ?= 4g

@@ -29,6 +28,10 @@ DEVICE = cuda
LOAD_CPU_ENV = echo "nothing to load"
LOAD_GPU_ENV = echo "nothing to load"

## default SLURM option to allocate GPU resources
HPC_GPU_ALLOCATION = --gres=gpu:${GPU}:${NR_GPUS}


WORKHOME = ${PWD}/work


@@ -40,15 +43,15 @@ LOAD_MARIAN_BUILD_ENV = echo "nothing to load"


ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
include lib/env/mahti.mk
include ${REPOHOME}lib/env/mahti.mk
else ifeq (${shell hostname},dx6-ibs-p2)
include lib/env/dx6.mk
include ${REPOHOME}lib/env/dx6.mk
else ifeq (${shell hostname},dx7-nkiel-4gpu)
include lib/env/dx7.mk
include ${REPOHOME}lib/env/dx7.mk
else ifneq ($(wildcard /wrk/tiedeman/research),)
include lib/env/taito.mk
include ${REPOHOME}lib/env/taito.mk
else ifeq (${shell hostname --domain 2>/dev/null},bullx)
include lib/env/puhti.mk
include ${REPOHOME}lib/env/puhti.mk
endif

lib/env/mahti.mk (vendored, 4 changes)
@@ -41,6 +41,10 @@ LOAD_CPU_ENV = module load ${CPU_MODULES}
LOAD_GPU_ENV = module load ${GPU_MODULES}


## extra SLURM directives (up to 5 variables)
HPC_EXTRA1 = \#SBATCH --account=${CSCPROJECT}


## setup for compiling marian-nmt

MARIAN_BUILD_MODULES = gcc/10.3.0 cuda/11.4.2 cudnn/8.0.4.30-11.0-linux-x64 cmake/3.18.4 openblas/0.3.14-omp openmpi/4.0.5-cuda
lib/env/puhti.mk (vendored, 14 changes)
@@ -27,12 +27,20 @@ export PATH := ${APPLHOME}/bin:${PATH}

CPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 python-env
GPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 python-env
LOAD_CPU_ENV = module load ${CPU_MODULES}
LOAD_GPU_ENV = module load ${GPU_MODULES}
LOAD_CPU_ENV = module load ${CPU_MODULES} && module list
LOAD_GPU_ENV = module load ${GPU_MODULES} && module list

ifdef HPC_DISK
HPC_GPU_ALLOCATION = --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}
HPC_CPU_EXTRA1 = \#SBATCH --gres=nvme:${HPC_DISK}
endif

## extra SLURM directives (up to 3 numbered variables)
HPC_EXTRA1 = \#SBATCH --account=${CSCPROJECT}


BUILD_MODULES = cmake perl/5.30.0
LOAD_BUILD_ENV = module load ${BUILD_MODULES}
LOAD_BUILD_ENV = module load ${BUILD_MODULES} && module list

MARIAN_BUILD_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 cmake/3.18.2
LOAD_MARIAN_BUILD_ENV = module purge && module load ${MARIAN_BUILD_MODULES}
@@ -5,24 +5,24 @@
#


include lib/projects/celtic.mk
include lib/projects/finland.mk
include lib/projects/fiskmo.mk
include lib/projects/memad.mk
include lib/projects/multilingual.mk
include lib/projects/opus.mk
include lib/projects/romance.mk
include lib/projects/russian.mk
include lib/projects/sami.mk
include lib/projects/finno-ugric.mk
include lib/projects/wikimedia.mk
include lib/projects/wikimatrix.mk
include ${REPOHOME}lib/projects/celtic.mk
include ${REPOHOME}lib/projects/finland.mk
include ${REPOHOME}lib/projects/fiskmo.mk
include ${REPOHOME}lib/projects/memad.mk
include ${REPOHOME}lib/projects/multilingual.mk
include ${REPOHOME}lib/projects/opus.mk
include ${REPOHOME}lib/projects/romance.mk
include ${REPOHOME}lib/projects/russian.mk
include ${REPOHOME}lib/projects/sami.mk
include ${REPOHOME}lib/projects/finno-ugric.mk
include ${REPOHOME}lib/projects/wikimedia.mk
include ${REPOHOME}lib/projects/wikimatrix.mk

include lib/projects/doclevel.mk
include lib/projects/simplify.mk
include ${REPOHOME}lib/projects/doclevel.mk
include ${REPOHOME}lib/projects/simplify.mk

include lib/projects/tatoeba.mk
include ${REPOHOME}lib/projects/tatoeba.mk

include lib/projects/americasnlp2021.mk
include ${REPOHOME}lib/projects/americasnlp2021.mk

include lib/projects/distill.mk
include ${REPOHOME}lib/projects/distill.mk
lib/slurm.mk (61 changes)
@@ -18,13 +18,14 @@ endif
## submit job to gpu queue
## echo '#SBATCH --exclude=r18g08' >> $@

SLURM_JOBNAME ?= $(subst -,,${LANGPAIRSTR})

%.submit:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "$(subst -,,${LANGPAIRSTR})${@:.submit=}"' >>$@
echo '#SBATCH -o $(subst -,,${LANGPAIRSTR})${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e $(subst -,,${LANGPAIRSTR})${@:.submit=}.err.%j' >> $@
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submit=}"' >>$@
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submit=}.out.%j' >> $@
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submit=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@
@@ -33,21 +34,16 @@ endif
echo '#SBATCH -n 1' >> $@
echo '#SBATCH -N 1' >> $@
echo '#SBATCH -p ${HPC_GPUQUEUE}' >> $@
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
endif
ifeq (${shell hostname --domain 2>/dev/null},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}' >> $@
else
echo '#SBATCH --gres=gpu:${GPU}:${NR_GPUS}' >> $@
endif
echo '#SBATCH ${HPC_GPU_ALLOCATION}' >> $@
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
echo 'module use -a /proj/nlpl/modules' >> $@
for m in ${GPU_MODULES}; do \
echo "module load $$m" >> $@; \
done
echo 'module list' >> $@
echo '${HPC_EXTRA}' >> $@
echo '${HPC_EXTRA1}' >> $@
echo '${HPC_EXTRA2}' >> $@
echo '${HPC_EXTRA3}' >> $@
echo '${HPC_GPU_EXTRA1}' >> $@
echo '${HPC_GPU_EXTRA2}' >> $@
echo '${HPC_GPU_EXTRA3}' >> $@
echo '${LOAD_GPU_ENV}' >> $@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
echo 'pwd' >> $@
echo 'echo "Starting at `date`"' >> $@
@@ -65,31 +61,26 @@ endif
%.submitcpu:
mkdir -p ${WORKDIR}
echo '#!/bin/bash -l' > $@
echo '#SBATCH -J "$(subst -,,${LANGPAIRSTR})${@:.submitcpu=}"' >>$@
echo '#SBATCH -o $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.out.%j' >> $@
echo '#SBATCH -e $(subst -,,${LANGPAIRSTR})${@:.submitcpu=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
echo '#SBATCH -J "$(SLURM_JOBNAME)${@:.submitcpu=}"' >>$@
echo '#SBATCH -o $(SLURM_JOBNAME)${@:.submitcpu=}.out.%j' >> $@
echo '#SBATCH -e $(SLURM_JOBNAME)${@:.submitcpu=}.err.%j' >> $@
echo '#SBATCH --mem=${HPC_MEM}' >> $@
ifdef EMAIL
echo '#SBATCH --mail-type=END' >> $@
echo '#SBATCH --mail-user=${EMAIL}' >> $@
endif
ifeq (${shell hostname -d 2>/dev/null},mahti.csc.fi)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
endif
ifeq (${shell hostname --domain 2>/dev/null},bullx)
echo '#SBATCH --account=${CSCPROJECT}' >> $@
echo '#SBATCH --gres=nvme:${HPC_DISK}' >> $@
echo '#SBATCH --mail-type=END' >> $@
echo '#SBATCH --mail-user=${EMAIL}' >> $@
endif
echo '#SBATCH -n ${HPC_CORES}' >> $@
echo '#SBATCH -N ${HPC_NODES}' >> $@
echo '#SBATCH -p ${HPC_QUEUE}' >> $@
echo '#SBATCH -t ${HPC_TIME}:00' >> $@
echo '${HPC_EXTRA}' >> $@
echo 'module use -a /proj/nlpl/modules' >> $@
for m in ${CPU_MODULES}; do \
echo "module load $$m" >> $@; \
done
echo 'module list' >> $@
echo '${HPC_EXTRA1}' >> $@
echo '${HPC_EXTRA2}' >> $@
echo '${HPC_EXTRA3}' >> $@
echo '${HPC_CPU_EXTRA1}' >> $@
echo '${HPC_CPU_EXTRA2}' >> $@
echo '${HPC_CPU_EXTRA3}' >> $@
echo '${LOAD_GPU_ENV}' >> $@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
echo 'pwd' >> $@
echo 'echo "Starting at `date`"' >> $@
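For reference, the %.submit recipe above does nothing but write a batch script. With illustrative values for LANGPAIRSTR and the HPC_* variables (eng-deu, 16g memory, one v100 GPU), the generated file looks roughly like this:

    #!/bin/bash -l
    #SBATCH -J "engdeutrain"
    #SBATCH -o engdeutrain.out.%j
    #SBATCH -e engdeutrain.err.%j
    #SBATCH --mem=16g
    #SBATCH -n 1
    #SBATCH -N 1
    #SBATCH -p gpu
    #SBATCH --gres=gpu:v100:1
    #SBATCH -t 72:00
    module use -a /proj/nlpl/modules
    module load gcc/8.3.0
    module list
    cd ${SLURM_SUBMIT_DIR:-.}
    pwd
    echo "Starting at `date`"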
tatoeba/eval/Makefile (new file, 280 lines)
@@ -0,0 +1,280 @@
#
# evaluate released Tatoeba MT models
# with existing benchmarks (collected in OPUS-MT-testsets)
#


## set the home directory of the repository
## this is to find the included makefiles
## (important to have a trailing '/')

SHELL := bash
PWD := ${shell pwd}
REPOHOME := ${PWD}/../../

include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk


MODEL_STORAGE := https://object.pouta.csc.fi/Tatoeba-MT-models
MODEL_DISTS := ${shell wget -q -O - ${MODEL_STORAGE}/index.txt | grep '.zip$$'}
MODEL_DIST = ${firstword ${MODEL_DISTS}}
MODEL = ${MODEL_DIST:.zip=}
MODEL_LANGPAIR = ${firstword ${subst /, ,${MODEL_DIST}}}
MODEL_URL = ${MODEL_STORAGE}/${MODEL_DIST}
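The model list is scraped from the index of the public object storage; entries are relative zip paths, so MODEL_LANGPAIR is just the leading directory. A quick manual peek uses the same wget call as the variable above:

    # list the first few released model packages; entries look like <langpair>/<release>.zip
    wget -q -O - https://object.pouta.csc.fi/Tatoeba-MT-models/index.txt | grep '\.zip$' | head -3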
## directory with all test sets (submodule OPUS-MT-testsets)
TESTSET_HOME := ${REPOHOME}OPUS-MT-testsets/testsets
TESTSET_INDEX := ${REPOHOME}OPUS-MT-testsets/index.txt

## work directory (for the temporary models)
WORK_HOME = ${PWD}
WORK_DIR = ${WORK_HOME}/${MODEL}

## model directory (for test results)
## model score file and zipfile with evaluation results
MODEL_HOME = ${REPOHOME}models-tatoeba
MODEL_DIR = ${MODEL_HOME}/${MODEL}
MODEL_SCORES = ${MODEL_DIR}.scores.txt
MODEL_EVALZIP = ${MODEL_DIR}.eval.zip

## all zip files with benchmark results
MODEL_EVALZIPS := ${patsubst %.zip,${MODEL_HOME}/%.eval.zip,${MODEL_DISTS}}


## make all evaluation zip-files
.PHONY: all
all: ${MODEL_EVALZIPS}

## test: make the first evaluation zip-file
.PHONY: first
first: $(firstword ${MODEL_EVALZIPS})

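The two phony targets above are the intended entry points; run from inside tatoeba/eval/, usage is roughly:

    make first        # download, evaluate and package a single model as a smoke test
    make all          # work through every released model listed in index.txt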
## zip-files with all evaluation files
## if the zip file already exists: unpack first
## to avoid re-doing things
## TODO: should also fetch from ObjectStorage if it exists there!
${MODEL_EVALZIPS}: ${TESTSET_INDEX}
if [ -e $@ ]; then \
mkdir -p ${@:.eval.zip=}; \
unzip -d ${@:.eval.zip=} $@; \
fi
${MAKE} MODEL_DIST=${patsubst ${MODEL_HOME}/%.eval.zip,%.zip,$@} eval-model

.PHONY: eval-model
eval-model: ${MODEL_SCORES}
if [ -d ${MODEL_DIR} ]; then \
cd ${MODEL_DIR} && zip ${MODEL_EVALZIP} *.eval *.compare; \
rm -f ${MODEL_DIR}/*.eval; \
rm -f ${MODEL_DIR}/*.compare; \
rm -f ${MODEL_DIR}.done; \
rmdir ${MODEL_DIR}; \
fi


## temporary directory with all benchmark results
${MODEL_DIR}.done:
${MAKE} fetch
${MAKE} eval-langpairs
${MAKE} cleanup
-touch $@

## cleanup some additional workfiles
.PHONY: cleanup
cleanup:
rm -f ${WORK_DIR}/*.*
rm -f ${WORK_DIR}/model/*
rmdir ${WORK_DIR}/model
rmdir ${WORK_DIR}
rmdir ${WORK_HOME}/${MODEL_LANGPAIR}

#-------------------------------------------------
# fetch model and get supported languages
#-------------------------------------------------

## fetch translation model
.PHONY: fetch
fetch: ${WORK_DIR}/model/decoder.yml

${WORK_DIR}/model/decoder.yml:
mkdir -p ${dir $@}
wget -q -O ${dir $@}model.zip ${MODEL_URL}
unzip -d ${dir $@} ${dir $@}model.zip
## fix an old problem with the pre-process script
mv ${dir $@}preprocess.sh ${dir $@}preprocess-old.sh
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \
< ${dir $@}preprocess-old.sh > ${dir $@}preprocess.sh
chmod +x ${dir $@}preprocess.sh


## get supported source and target languages
MODELINFO = ${WORK_DIR}/model/README.md
ifneq (${wildcard ${MODELINFO}},)
SRCLANGS = ${shell grep '\* source language(s)' ${MODELINFO} | cut -f2 -d: | xargs}
TRGLANGS = ${shell grep '\* valid language labels' ${MODELINFO} | cut -f2 -d: | tr '<>' ' ' | xargs}
ifeq (${words ${TRGLANGS}},0)
TRGLANGS = ${shell grep '\* target language(s)' ${MODELINFO} | cut -f2 -d: | xargs}
endif
endif

## all language pairs that the model supports
MODEL_LANGPAIRS = ${MODEL_LANGPAIR} \
${shell for s in ${SRCLANGS}; do for t in ${TRGLANGS}; do echo "$$s-$$t"; done; done}

## get language pairs for which we have test sets
ALL_LANGPAIRS := $(notdir ${wildcard ${TESTSET_HOME}/*})
LANGPAIRS = ${sort $(filter ${ALL_LANGPAIRS},${MODEL_LANGPAIRS})}
LANGPAIR = ${firstword ${LANGPAIRS}}
LANGPAIRSTR = ${LANGPAIR}
SRC = ${firstword ${subst -, ,${LANGPAIR}}}
TRG = ${lastword ${subst -, ,${LANGPAIR}}}
TESTSET_DIR = ${TESTSET_HOME}/${LANGPAIR}
TESTSETS = ${notdir ${basename ${wildcard ${TESTSET_DIR}/*.${SRC}}}}
TESTSET = ${firstword ${TESTSETS}}


## eval all language pairs
.PHONY: eval-langpairs
eval-langpairs:
for l in ${LANGPAIRS}; do \
${MAKE} LANGPAIR=$$l eval-testsets; \
done

## eval all testsets for the current langpair
.PHONY: eval-testsets
eval-testsets:
for t in ${TESTSETS}; do \
${MAKE} TESTSET=$$t eval; \
done

#-------------------------------------------------
# create input file for translation
#-------------------------------------------------

.PHONY: input
input: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.input


## more than one target language
## --> need target language labels
ifneq (${words ${TRGLANGS}},1)
USE_TARGET_LABELS = 1
else
USE_TARGET_LABELS = 0
endif

## double-check whether the preprocessing script
## requires both language IDs or not
ifeq (${shell grep 'source-langid target-langid' ${WORK_DIR}/model/preprocess.sh 2>/dev/null | wc -l},1)
USE_BOTH_LANGIDS = 1
endif

## take care of different calls to the pre-processing script
ifeq (${USE_BOTH_LANGIDS},1)
PREPROCESS = ${WORK_DIR}/model/preprocess.sh ${SRC} ${TRG} ${WORK_DIR}/model/source.spm
else
PREPROCESS = ${WORK_DIR}/model/preprocess.sh ${SRC} ${WORK_DIR}/model/source.spm
endif


${WORK_DIR}/${TESTSET}.${LANGPAIR}.input: ${TESTSET_DIR}/${TESTSET}.${SRC}
${PREPROCESS} < $< > $@
## check whether we need to replace the target language labels
ifeq (${USE_TARGET_LABELS},1)
ifneq (${wildcard ${TESTSET_DIR}/${TESTSET}.${TRG}.labels},)
cut -f2- -d' ' $@ > $@.tmp1
sed 's/^/>>/;s/$$/<</' < ${TESTSET_DIR}/${TESTSET}.${TRG}.labels > $@.tmp2
paste -d' ' $@.tmp2 $@.tmp1 > $@
rm -f $@.tmp2 $@.tmp1
endif
endif

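For multilingual models the input therefore carries one >>lang<< token per line. The label substitution used above can be reproduced in isolation with made-up file names and tokens:

    # hypothetical demo of the per-line label replacement
    printf 'deu\ndeu\n' > labels.tmp
    printf '>>xxx<< ▁hello ▁world\n>>xxx<< ▁good ▁morning\n' > input.tmp
    paste -d' ' <(sed 's/^/>>/;s/$/<</' labels.tmp) <(cut -f2- -d' ' input.tmp)
    # prints: >>deu<< ▁hello ▁world   (and so on)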
#-------------------------------------------------
# create output file (translation)
#-------------------------------------------------

.PHONY: output
output: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output

${WORK_DIR}/${TESTSET}.${LANGPAIR}.output: ${WORK_DIR}/${TESTSET}.${LANGPAIR}.input
if [ -e $< ]; then \
if [ -s $< ]; then \
${LOAD_ENV} && ${MARIAN_DECODER} -i $< \
-c ${WORK_DIR}/model/decoder.yml \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' > $@; \
fi \
fi


#-------------------------------------------------
# evaluation
#-------------------------------------------------

.PHONY: eval
eval: ${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval

## adjust tokenisation to non-space-separated languages
## TODO: is it correct to simply use 'zh' even for jpn or should we use 'intl'?
ifneq ($(filter cmn jpn yue zho,${TRG}),)
SACREBLEU_PARAMS = --tokenize zh
endif

${MODEL_DIR}/${TESTSET}.${LANGPAIR}.eval:
${MAKE} ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output
if [ -e ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output ]; then \
if [ -s ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output ]; then \
mkdir -p ${dir $@}; \
cat ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output | \
sacrebleu ${SACREBLEU_PARAMS} ${TESTSET_DIR}/${TESTSET}.${TRG} > $@; \
cat ${WORK_DIR}/${TESTSET}.${LANGPAIR}.output | \
sacrebleu ${SACREBLEU_PARAMS} --metrics=chrf --width=3 ${TESTSET_DIR}/${TESTSET}.${TRG} >> $@; \
paste -d "\n" \
${TESTSET_DIR}/${TESTSET}.${SRC} \
${TESTSET_DIR}/${TESTSET}.${TRG} \
${WORK_DIR}/${TESTSET}.${LANGPAIR}.output |\
sed -e "s/&apos;/'/g" \
-e 's/&quot;/"/g' \
-e 's/&lt;/</g' \
-e 's/&gt;/>/g' \
-e 's/&amp;/&/g' |\
sed 'n;n;G;' > ${@:.eval=.compare}; \
fi \
fi

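Run by hand, the two sacrebleu calls correspond to something like the following; file names are illustrative, with BLEU written first and chrF appended to the same .eval file:

    sacrebleu ref.deu < output.deu > newstest2018.eng-deu.eval
    sacrebleu --metrics=chrf --width=3 ref.deu < output.deu >> newstest2018.eng-deu.eval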
#-------------------------------------------------
# collect all scores in a file
#-------------------------------------------------

.PHONY: scores
scores: ${MODEL_SCORES}

${MODEL_SCORES}: ${TESTSET_INDEX}
${MAKE} ${MODEL_DIR}.done
if [ -d ${MODEL_DIR} ]; then \
grep -H BLEU ${MODEL_DIR}/*eval | sort > $@.bleu; \
grep -H chrF ${MODEL_DIR}/*eval | sort > $@.chrf; \
cut -f1 -d: $@.bleu | rev | cut -f2 -d. | rev > $@.langs; \
cut -f1 -d: $@.bleu | rev | cut -f1 -d/ | cut -f3- -d. | rev > $@.testsets; \
cat $@.chrf | rev | cut -f1 -d' ' | rev > $@.chrf-scores; \
cut -f2 -d= $@.bleu | cut -f2 -d' ' > $@.bleu-scores; \
cut -f1 -d: $@.bleu | rev | cut -f2,3 -d/ | \
rev | sed 's#^#${MODEL_STORAGE}/#' | sed 's/$$/.zip/' > $@.urls; \
cut -f1 -d: $@.bleu | sed 's/.eval$$/.compare/' | \
xargs wc -l | grep -v '[0-9] total' | \
perl -pe '$$_/=4;print "\n"' | tail -n +2 > $@.nrlines; \
cat $@.bleu | rev | cut -f1 -d' ' | rev | cut -f1 -d')' > $@.nrwords; \
paste $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords > $@; \
rm -f $@.bleu $@.chrf $@.langs $@.testsets \
$@.chrf-scores $@.bleu-scores \
$@.urls $@.nrlines $@.nrwords; \
fi
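The resulting .scores.txt is a plain tab-separated table with one row per test set: language pair, test set, chrF, BLEU, model URL, number of lines, number of words. A quick way to eyeball it (model name illustrative):

    column -t models-tatoeba/eng-deu/opus-2021-02-18.scores.txt | head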