language group jobs with some more documentation

This commit is contained in:
Joerg Tiedemann 2020-06-29 12:26:45 +03:00
parent 62c9414122
commit 7df91a9eaa
3 changed files with 151 additions and 135 deletions

View File

@ -117,6 +117,52 @@ Note that this can be quite a lot of language pairs!
## Working with language groups
Language groups are defined according to ISO639-5. The Perl module ISO::639::5 needs to be installed
to retrieve the language group hierarchy. Various combinations of language groups and English can be
trained using the following commands (note that this starts all combinations, see below for individual jobs):
```
make tatoeba-group2eng # start train jobs for all language groups to English
make tatoeba-eng2group # start train jobs for English to all language groups
make tatoeba-langgroup # start train jobs for bi-directional models for all language groups
```
Combine all jobs above:
```
make tatoeba-langgroups
```
Create release packages from the language group models
```
make tatoeba-group2eng-dist # make package for all trained group2eng models
make tatoeba-eng2group-dist # make package for all trained eng2group models
make tatoeba-langgroup-dist # make package for all trained langgroup models
```
Jobs for specific tasks and language groups; example task: `gmw2eng`:
```
make tatoeba-gmw2eng-train # make data and start training job
make tatoeba-gmw2eng-eval # evaluate model with multilingual test data
make tatoeba-gmw2eng-evalall # evaluate model with all individual language pairs
make tatoeba-gmw2eng-dist # create release package
```
Similar jobs can be started for any supported language group from and to English
and also as a bidirectional model for all languages in the given language group.
Replace `gmw2eng` with, for example, `eng2gem` (English to Germanic) or
`gmq` (multilingual model for North Germanic languages).
## Generate evaluation tables
Various lists and tables can be generated from the evaluated model files. Remove old files and generate new ones by running:

View File

@ -393,21 +393,28 @@ else
MARIAN_WORKSPACE = 10000
endif
## check whether we have GPUs available
## if not: use CPU mode for decoding
##
## NVIDIA_SMI has to be a simply expanded variable (:=): `ifdef` only tests
## whether the stored value is non-empty and does not expand it, so with a
## recursive `=` the literal `${shell ...}` text would always count as defined
## and the CPU fallback in the `else` branch could never be selected.
## stderr of `which` goes to /dev/null (absolute path) to keep the output clean.
NVIDIA_SMI := ${shell which nvidia-smi 2>/dev/null}
ifdef NVIDIA_SMI
## nvidia-smi is installed, but its output contains exactly one line with
## "failed": fall back to CPU decoding
## (strip removes the padding some `wc` implementations put around the count)
ifeq (${strip ${shell nvidia-smi | grep failed | wc -l}},1)
  MARIAN               = ${MARIANCPU}
  MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
  MARIAN_EXTRA         = --cpu-threads ${HPC_CORES}
endif
else
## nvidia-smi is not in the PATH: use CPU decoding
  MARIAN               = ${MARIANCPU}
  MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
  MARIAN_EXTRA         = --cpu-threads ${HPC_CORES}
endif
## weights associated with training examples:
## enable Marian data weighting only if the file named by TRAIN_WEIGHTS exists
ifneq (${strip ${wildcard ${TRAIN_WEIGHTS}}},)
  MARIAN_TRAIN_WEIGHTS = --data-weighting ${TRAIN_WEIGHTS}
endif
### training a model with Marian NMT
##
## NR allows training several models for proper ensembling

View File

@ -47,6 +47,32 @@
# make tatoeba-multilingual-distsubset-medium .... create release files
# make tatoeba-multilingual-evalsubset-medium .... eval all langpairs
#---------------------------------------------------------------------
# jobs for multilingual language group models
#
# make tatoeba-group2eng ...... start train jobs for all language groups to English
# make tatoeba-eng2group ...... start train jobs for English to all language groups
# make tatoeba-langgroup ...... start train jobs for bi-directional models for all language groups
#
# make tatoeba-langgroups ..... make all jobs from above
#
#
# make tatoeba-group2eng-dist . make package for all trained group2eng models
# make tatoeba-eng2group-dist . make package for all trained eng2group models
# make tatoeba-langgroup-dist . make package for all trained langgroup models
#
#
# jobs for specific tasks and language groups, example task: "gmw2eng"
#
# make tatoeba-gmw2eng-train .. make data and start training job
# make tatoeba-gmw2eng-eval ... evaluate model with multilingual test data
# make tatoeba-gmw2eng-evalall evaluate model with all individual language pairs
# make tatoeba-gmw2eng-dist ... create release package
#
# Similar jobs can be started for any supported language group from and to English
# and also as a bidirectional model for all languages in the given language group.
# Replace "gmw2eng" with, for example, "eng2gem" (English to Germanic) or
# "gmq" (multilingual model for North Germanic languages).
#---------------------------------------------------------------------
#
# generate evaluation tables
#
@ -57,14 +83,14 @@
## general parameters for Tatoeba models
TATOEBA_DATAURL = https://object.pouta.csc.fi/Tatoeba-Challenge
TATOEBA_RAWGIT = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_WORK = ${PWD}/work-tatoeba
TATOEBA_DATA = ${TATOEBA_WORK}/data/${PRE}
TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge
TATOEBA_RAWGIT := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
TATOEBA_WORK := ${PWD}/work-tatoeba
TATOEBA_DATA := ${TATOEBA_WORK}/data/${PRE}
TATOEBA_MODEL_CONTAINER = Tatoeba-MT-models
TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models
TATOEBA_PARAMS = TRAINSET=Tatoeba-train \
TATOEBA_PARAMS := TRAINSET=Tatoeba-train \
DEVSET=Tatoeba-dev \
TESTSET=Tatoeba-test \
TESTSET_NAME=Tatoeba-test \
@ -80,72 +106,42 @@ TATOEBA_PARAMS = TRAINSET=Tatoeba-train \
MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
ALT_MODEL_DIR=tatoeba \
SKIP_DATA_DETAILS=1 \
MIN_BLEU_SCORE=10 \
MIN_BLEU_SCORE=10
ISO639 := iso639
GET_ISO_CODE := ${ISO639} -m
## taken from the Tatoeba-Challenge Makefile
## requires local data for setting TATOEBA_LANGS
# EXCLUDE_LANGGROUPS = afa
ISO639 = iso639
GET_ISO_CODE = ${ISO639} -m
# TATOEBA_LANGS = ${sort ${patsubst %.txt.gz,%,${notdir ${wildcard ${OPUSHOME}/Tatoeba/latest/mono/*.txt.gz}}}}
# TATOEBA_LANGS3 = ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${TATOEBA_LANGS}}}}
# TATOEBA_LANGGROUPS = ${sort ${shell langgroup -p -n ${TATOEBA_LANGS3} 2>/dev/null}}
# TATOEBA_LANGGROUPS1 = ${shell langgroup -g -n ${TATOEBA_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
# TATOEBA_LANGGROUPS2 = ${shell langgroup -G -n ${TATOEBA_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
OPUS_LANGS3 = ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${OPUSLANGS}}}}
OPUS_LANG_PARENTS = ${sort ${shell langgroup -p -n ${OPUS_LANGS3} 2>/dev/null}}
OPUS_LANG_GRANDPARENTS = ${sort ${shell langgroup -p -n ${OPUS_LANG_PARENTS} 2>/dev/null}}
OPUS_LANG_GROUPS = ${sort ${OPUS_LANG_PARENTS} ${OPUS_LANG_GRANDPARENTS}}
# OPUS_LANGGROUPS1 = ${shell langgroup -g -n ${OPUS_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
# OPUS_LANGGROUPS2 = ${shell langgroup -G -n ${OPUS_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
OPUS_LANGS3 := ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${OPUSLANGS}}}}
OPUS_LANG_PARENTS := ${sort ${shell langgroup -p -n ${OPUS_LANGS3} 2>/dev/null}}
OPUS_LANG_GRANDPARENTS := ${sort ${shell langgroup -p -n ${OPUS_LANG_PARENTS} 2>/dev/null}}
OPUS_LANG_GROUPS := ${sort ${OPUS_LANG_PARENTS} ${OPUS_LANG_GRANDPARENTS}}
###########################################################################################
# language groups
###########################################################################################
## print language groups
opus-langgroups:
@echo ${OPUS_LANG_PARENTS}
@echo ${OPUS_LANG_GRANDPARENTS}
print-langgroups:
@echo ${OPUS_LANG_GROUPS}
# ## multilingual models for language groups
# tatoeba-langgroup:
# for g in ${TATOEBA_LANGGROUPS1}; do \
# l=`echo $$g | sed 's/\+/ /g'`; \
# n=`langgroup -p $$l | cut -f1 -d' '`; \
# ${MAKE} LANGPAIRSTR="$$n-$$n" TRGLANGS="$$l" SRCLANGS="$$l" \
# MODELTYPE=transformer FIT_DATA_SIZE=1000000 tatoeba-multilingual-train; \
# done
# ## models for language groups to English
# tatoeba-group2eng:
# for g in ${TATOEBA_LANGGROUPS1}; do \
# l=`echo $$g | sed 's/\+/ /g'`; \
# n=`langgroup -p $$l | cut -f1 -d' '`; \
# ${MAKE} LANGPAIRSTR="$$n-eng" SRCLANGS="$$l" TRGLANGS=eng \
# MODELTYPE=transformer FIT_DATA_SIZE=1000000 tatoeba-multilingual-train; \
# done
# ## models for English to language groups
# tatoeba-eng2group:
# for g in ${TATOEBA_LANGGROUPS1}; do \
# l=`echo $$g | sed 's/\+/ /g'`; \
# n=`langgroup -p $$l | cut -f1 -d' '`; \
# ${MAKE} LANGPAIRSTR="eng-$$n" TRGLANGS="$$l" SRCLANGS=eng \
# MODELTYPE=transformer FIT_DATA_SIZE=1000000 tatoeba-multilingual-train; \
# done
## start all jobs for all combinations of
## - language groups and English (separate in both directions)
## - languages in language groups (bi-directional)
##
## language groups include parents and grandparents
tatoeba-langgroups:
${MAKE} tatoeba-group2eng
@ -155,15 +151,15 @@ tatoeba-langgroups:
#### language-group to English
GROUP2ENG_JOB = $(patsubst %,tatoeba-%2eng-job,${OPUS_LANG_GROUPS})
GROUP2ENG_TRAIN = $(patsubst %,tatoeba-%2eng-train,${OPUS_LANG_GROUPS})
GROUP2ENG_EVAL = $(patsubst %,tatoeba-%2eng-eval,${OPUS_LANG_GROUPS})
GROUP2ENG_EVALALL = $(patsubst %,tatoeba-%2eng-evalall,${OPUS_LANG_GROUPS})
GROUP2ENG_DIST = $(patsubst %,tatoeba-%2eng-dist,${OPUS_LANG_GROUPS})
GROUP2ENG_TRAIN := $(patsubst %,tatoeba-%2eng-train,${OPUS_LANG_GROUPS})
GROUP2ENG_EVAL := $(patsubst %,tatoeba-%2eng-eval,${OPUS_LANG_GROUPS})
GROUP2ENG_EVALALL := $(patsubst %,tatoeba-%2eng-evalall,${OPUS_LANG_GROUPS})
GROUP2ENG_DIST := $(patsubst %,tatoeba-%2eng-dist,${OPUS_LANG_GROUPS})
tatoeba-group2eng: ${GROUP2ENG_JOB}
# tatoeba-group2eng-dist: ${GROUP2ENG_EVAL} ${GROUP2ENG_EVALALL}
# ${MAKE} ${GROUP2ENG_DIST}
## start all jobs for language group to English translation
tatoeba-group2eng: ${GROUP2ENG_TRAIN}
## only start this if there is a model
tatoeba-group2eng-dist:
for g in ${OPUS_LANG_GROUPS}; do \
if [ `find ${TATOEBA_WORK}/$$g-eng -name '*.npz' | wc -l` -gt 0 ]; then \
@ -173,21 +169,22 @@ tatoeba-group2eng-dist:
fi \
done
${GROUP2ENG_JOB}:
${MAKE} $(patsubst %-job,%-train,$@)
${MAKE} $(patsubst %-job,%-eval,$@)
${MAKE} $(patsubst %-job,%-evalall,$@)
${MAKE} $(patsubst %-job,%-dist,$@)
## this would be easier but does not check whether a model exists
## --> tries to build it if there is no model
# tatoeba-group2eng-dist: ${GROUP2ENG_EVAL} ${GROUP2ENG_EVALALL}
# ${MAKE} ${GROUP2ENG_DIST}
${GROUP2ENG_TRAIN}:
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-train,%,$@)-eng \
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-train,%,$@)}" \
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-train,%,$@)})" \
TRGLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
tatoeba-multilingual-train
${GROUP2ENG_EVAL}:
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-eval,%,$@)-eng \
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-eval,%,$@)}" \
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-eval,%,$@)})" \
TRGLANGS=eng \
MODELTYPE=transformer \
${TATOEBA_PARAMS} \
@ -195,13 +192,13 @@ ${GROUP2ENG_EVAL}:
${GROUP2ENG_EVALALL}:
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-evalall,%,$@)-eng \
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-evalall,%,$@)}" \
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-evalall,%,$@)})" \
TRGLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
tatoeba-multilingual-eval
${GROUP2ENG_DIST}:
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-dist,%,$@)-eng \
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-dist,%,$@)}" \
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-dist,%,$@)})" \
TRGLANGS=eng \
MODELTYPE=transformer \
${TATOEBA_PARAMS} \
@ -210,15 +207,13 @@ ${GROUP2ENG_DIST}:
#### English to language group
ENG2GROUP_JOB = $(patsubst %,tatoeba-eng2%-job,${OPUS_LANG_GROUPS})
ENG2GROUP_TRAIN = $(patsubst %,tatoeba-eng2%-train,${OPUS_LANG_GROUPS})
ENG2GROUP_EVAL = $(patsubst %,tatoeba-eng2%-eval,${OPUS_LANG_GROUPS})
ENG2GROUP_EVALALL = $(patsubst %,tatoeba-eng2%-evalall,${OPUS_LANG_GROUPS})
ENG2GROUP_DIST = $(patsubst %,tatoeba-eng2%-dist,${OPUS_LANG_GROUPS})
ENG2GROUP_TRAIN := $(patsubst %,tatoeba-eng2%-train,${OPUS_LANG_GROUPS})
ENG2GROUP_EVAL := $(patsubst %,tatoeba-eng2%-eval,${OPUS_LANG_GROUPS})
ENG2GROUP_EVALALL := $(patsubst %,tatoeba-eng2%-evalall,${OPUS_LANG_GROUPS})
ENG2GROUP_DIST := $(patsubst %,tatoeba-eng2%-dist,${OPUS_LANG_GROUPS})
tatoeba-eng2group: ${ENG2GROUP_TRAIN}
tatoeba-eng2group: ${ENG2GROUP_JOB}
# tatoeba-eng2group-dist: ${ENG2GROUP_EVAL} ${ENG2GROUP_EVALALL}
# ${MAKE} ${ENG2GROUP_DIST}
tatoeba-eng2group-dist:
for g in ${OPUS_LANG_GROUPS}; do \
if [ `find ${TATOEBA_WORK}/eng-$$g -name '*.npz' | wc -l` -gt 0 ]; then \
@ -228,45 +223,34 @@ tatoeba-eng2group-dist:
fi \
done
tatoeba-eng2group-dist2:
for g in ${OPUS_LANG_GROUPS}; do \
if [ `find ${TATOEBA_WORK}/eng-$$g -name '*.npz' | wc -l` -gt 0 ]; then \
mv models-tatoeba/eng-$$g models-tatoeba/eng-$$g-old2; \
${MAKE} tatoeba-eng2$${g}-dist; \
fi \
done
# tatoeba-eng2group-dist: ${ENG2GROUP_EVAL} ${ENG2GROUP_EVALALL}
# ${MAKE} ${ENG2GROUP_DIST}
${ENG2GROUP_JOB}:
${MAKE} $(patsubst %-job,%-train,$@)
${MAKE} $(patsubst %-job,%-eval,$@)
${MAKE} $(patsubst %-job,%-evalall,$@)
${MAKE} $(patsubst %-job,%-dist,$@)
${ENG2GROUP_TRAIN}:
${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-train,%,$@) \
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-train,%,$@)}" \
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-train,%,$@)})" \
SRCLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
tatoeba-multilingual-train
${ENG2GROUP_EVAL}:
${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-eval,%,$@) \
SRCLANGS=eng \
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-eval,%,$@)}" \
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-eval,%,$@)})" \
MODELTYPE=transformer \
${TATOEBA_PARAMS} \
compare
${ENG2GROUP_EVALALL}:
${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-evalall,%,$@) \
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-evalall,%,$@)}" \
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-evalall,%,$@)})" \
SRCLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
tatoeba-multilingual-eval
${ENG2GROUP_DIST}:
${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-dist,%,$@) \
SRCLANGS=eng \
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-dist,%,$@)}" \
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-dist,%,$@)})" \
MODELTYPE=transformer \
${TATOEBA_PARAMS} \
best-dist
@ -275,15 +259,13 @@ ${ENG2GROUP_DIST}:
#### multilingual language-group (bi-directional)
LANGGROUP_JOB = $(patsubst %,tatoeba-%-job,${OPUS_LANG_GROUPS})
LANGGROUP_TRAIN = $(patsubst %,tatoeba-%-train,${OPUS_LANG_GROUPS})
LANGGROUP_EVAL = $(patsubst %,tatoeba-%-eval,${OPUS_LANG_GROUPS})
LANGGROUP_EVALALL = $(patsubst %,tatoeba-%-evalall,${OPUS_LANG_GROUPS})
LANGGROUP_DIST = $(patsubst %,tatoeba-%-dist,${OPUS_LANG_GROUPS})
LANGGROUP_TRAIN := $(patsubst %,tatoeba-%-train,${OPUS_LANG_GROUPS})
LANGGROUP_EVAL := $(patsubst %,tatoeba-%-eval,${OPUS_LANG_GROUPS})
LANGGROUP_EVALALL := $(patsubst %,tatoeba-%-evalall,${OPUS_LANG_GROUPS})
LANGGROUP_DIST := $(patsubst %,tatoeba-%-dist,${OPUS_LANG_GROUPS})
tatoeba-langgroup: ${LANGGROUP_TRAIN}
tatoeba-langgroup: ${LANGGROUP_JOB}
# tatoeba-langgroup-dist: ${LANGGROUP_EVAL} ${LANGGROUP_EVALALL}
# ${MAKE} ${LANGGROUP_DIST}
tatoeba-langgroup-dist:
for g in ${OPUS_LANG_GROUPS}; do \
if [ `find ${TATOEBA_WORK}/$$g-$$g -name '*.npz' | wc -l` -gt 0 ]; then \
@ -293,40 +275,36 @@ tatoeba-langgroup-dist:
fi \
done
${LANGGROUP_JOB}:
${MAKE} $(patsubst %-job,%-train,$@)
${MAKE} $(patsubst %-job,%-eval,$@)
${MAKE} $(patsubst %-job,%-evalall,$@)
${MAKE} $(patsubst %-job,%-dist,$@)
# tatoeba-langgroup-dist: ${LANGGROUP_EVAL} ${LANGGROUP_EVALALL}
# ${MAKE} ${LANGGROUP_DIST}
${LANGGROUP_TRAIN}:
${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-train,%,$@)-$(patsubst tatoeba-%-train,%,$@) \
TRGLANGS="${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)}" \
SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)}" \
TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
tatoeba-multilingual-train
## Evaluate the bi-directional model of one language group on the
## multilingual test data (targets of the form tatoeba-<group>-eval).
##
## The group code is cut out of the target name; the pattern has to carry the
## "-eval" suffix of these targets, otherwise patsubst does not match and
## returns the complete target name instead of the group code.
## Group members are restricted to the languages listed in OPUS_LANGS3 and
## are used as both source and target languages of the sub-make.
${LANGGROUP_EVAL}:
	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-eval,%,$@)-$(patsubst tatoeba-%-eval,%,$@) \
		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-eval,%,$@)})" \
		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-eval,%,$@)})" \
		MODELTYPE=transformer \
		${TATOEBA_PARAMS} \
		compare
## Evaluate the bi-directional model of one language group on all individual
## language pairs (targets of the form tatoeba-<group>-evalall).
##
## The group code is cut out of the target name; the pattern has to carry the
## "-evalall" suffix of these targets, otherwise patsubst does not match and
## returns the complete target name instead of the group code.
## Group members are restricted to the languages listed in OPUS_LANGS3 and
## are used as both source and target languages of the sub-make.
${LANGGROUP_EVALALL}:
	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-evalall,%,$@)-$(patsubst tatoeba-%-evalall,%,$@) \
		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-evalall,%,$@)})" \
		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-evalall,%,$@)})" \
		MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
		tatoeba-multilingual-eval
## Create the release package for the bi-directional model of one language
## group (targets of the form tatoeba-<group>-dist).
##
## The group code is cut out of the target name; the pattern has to carry the
## "-dist" suffix of these targets, otherwise patsubst does not match and
## returns the complete target name instead of the group code.
## Group members are restricted to the languages listed in OPUS_LANGS3 and
## are used as both source and target languages of the sub-make.
${LANGGROUP_DIST}:
	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-dist,%,$@)-$(patsubst tatoeba-%-dist,%,$@) \
		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-dist,%,$@)})" \
		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-dist,%,$@)})" \
		MODELTYPE=transformer \
		${TATOEBA_PARAMS} \
		best-dist
@ -624,22 +602,7 @@ tatoeba-multilingual-testsets:
## generic target for tatoeba challenge jobs
%-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
${MAKE} TRAINSET=Tatoeba-train \
DEVSET=Tatoeba-dev \
TESTSET=Tatoeba-test \
TESTSET_NAME=Tatoeba-test \
SMALLEST_TRAINSIZE=1000 \
USE_REST_DEVDATA=0 \
HELDOUTSIZE=0 \
DEVSIZE=5000 \
TESTSIZE=10000 \
DEVMINSIZE=200 \
WORKHOME=${TATOEBA_WORK} \
MODELSHOME=${PWD}/models-tatoeba \
MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \
MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
ALT_MODEL_DIR=tatoeba \
SKIP_DATA_DETAILS=1 \
${MAKE} ${TATOEBA_PARAMS} \
LANGPAIRSTR=${LANGPAIRSTR} \
SRCLANGS="${shell cat ${word 1,$^} | sed 's/ *$$//;s/^ *//'}" \
TRGLANGS="${shell cat ${word 2,$^} | sed 's/ *$$//;s/^ *//'}" \