language group jobs with some more documentation

2024-09-11 20:27:19 +03:00 · 2020-06-29 12:26:45 +03:00 · 2020-06-29 12:26:45 +03:00 · 7df91a9eaa
commit 7df91a9eaa
parent 62c9414122
3 changed files with 151 additions and 135 deletions
--- a/doc/TatoebaChallenge.md
+++ b/doc/TatoebaChallenge.md
@ -117,6 +117,52 @@ Note that this can be quite a lot of language pairs!



+## Working with language groups
+
+Language groups are defined according to ISO639-5. The Perl module ISO::639::5 needs to be installed 
+to retrieve the language group hierarchy. Various combinations of language groups and English can be
+trained using the following commands (note that this starts all combinations, see below for individual jobs):
+
+```
+make tatoeba-group2eng   # start train jobs for all language groups to English
+make tatoeba-eng2group   # start train jobs for English to all language groups
+make tatoeba-langgroup   # start train jobs for bi-directional models for all language groups
+```
+
+Combine all jobs above:
+
+```
+make tatoeba-langgroups
+```
+
+
+Create release packages from the language group models
+
+```
+make tatoeba-group2eng-dist  # make package for all trained group2eng models
+make tatoeba-eng2group-dist  # make package for all trained eng2group models
+make tatoeba-langgroup-dist  # make package for all trained langgroup models
+```
+
+
+Jobs for specific tasks and language groups; example task: `gmw2eng`:
+
+```
+make tatoeba-gmw2eng-train    # make data and start training job
+make tatoeba-gmw2eng-eval     # evaluate model with multilingual test data
+make tatoeba-gmw2eng-evalall  # evaluate model with all individual language pairs
+make tatoeba-gmw2eng-dist     # create release package
+```
+
+Similar jobs can be started for any supported language group from and to English
+and also as a bidirectional model for all languages in the given language group.
+Replace `gmw2eng` with, for example, `eng2gem` (English to Germanic) or 
+`gmq` (multilingual model for North Germanic languages).
+
+
+
+
+
 ## Generate evaluation tables

 Various lists and tables can be generated from the evaluated model files. Remove old files and generate new ones by running:
--- a/lib/config.mk
+++ b/lib/config.mk
@ -393,21 +393,28 @@ else
  MARIAN_WORKSPACE = 10000
 endif

-
+## check whether we have GPUs available
+## if not: use CPU mode for decoding
+##
+## NVIDIA_SMI is assigned with ':=' on purpose: 'ifdef' does not expand
+## a recursive variable, so with '=' the test below would always be true
+## and the CPU fallback in the 'else' branch could never be reached.
+## stderr of 'which' is discarded via /dev/null (absolute path).
+NVIDIA_SMI := ${shell which nvidia-smi 2>/dev/null}
+ifdef NVIDIA_SMI
 ifeq (${shell nvidia-smi | grep failed | wc -l},1)
  MARIAN = ${MARIANCPU}
  MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
  MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
 endif
+else
+  MARIAN = ${MARIANCPU}
+  MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
+  MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
+endif

-
-
-
+## weights associated with training examples
 ifneq ("$(wildcard ${TRAIN_WEIGHTS})","")
 	MARIAN_TRAIN_WEIGHTS = --data-weighting ${TRAIN_WEIGHTS}
 endif


+
 ### training a model with Marian NMT
 ##
 ## NR allows to train several models for proper ensembling
--- a/lib/models/tatoeba.mk
+++ b/lib/models/tatoeba.mk
@ -47,6 +47,32 @@
 #   make tatoeba-multilingual-distsubset-medium .... create release files
 #   make tatoeba-multilingual-evalsubset-medium .... eval all langpairs
 #---------------------------------------------------------------------
+# jobs for multilingual language group models
+#
+#   make tatoeba-group2eng ...... start train jobs for all language groups to English
+#   make tatoeba-eng2group ...... start train jobs for English to all language groups
+#   make tatoeba-langgroup ...... start train jobs for bi-directional models for all language groups
+#
+#   make tatoeba-langgroups ..... make all jobs from above
+#
+#
+#   make tatoeba-group2eng-dist . make package for all trained group2eng models
+#   make tatoeba-eng2group-dist . make package for all trained eng2group models
+#   make tatoeba-langgroup-dist . make package for all trained langgroup models
+#
+#
+# jobs for specific tasks and language groups, example task: "gmw2eng"
+#
+#   make tatoeba-gmw2eng-train .. make data and start training job
+#   make tatoeba-gmw2eng-eval ... evaluate model with multilingual test data
+#   make tatoeba-gmw2eng-evalall  evaluate model with all individual language pairs
+#   make tatoeba-gmw2eng-dist ... create release package
+#
+# Similar jobs can be started for any supported language group from and to English
+# and also as a bidirectional model for all languages in the given language group.
+# Replace "gmw2eng" with, for example, "eng2gem" (English to Germanic) or 
+# "gmq" (multilingual model for North Germanic languages).
+#---------------------------------------------------------------------
 #
 # generate evaluation tables
 #
@ -57,14 +83,14 @@

 ## general parameters for Tatoeba models

-TATOEBA_DATAURL = https://object.pouta.csc.fi/Tatoeba-Challenge
-TATOEBA_RAWGIT  = https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
-TATOEBA_WORK    = ${PWD}/work-tatoeba
-TATOEBA_DATA    = ${TATOEBA_WORK}/data/${PRE}
+TATOEBA_DATAURL := https://object.pouta.csc.fi/Tatoeba-Challenge
+TATOEBA_RAWGIT  := https://raw.githubusercontent.com/Helsinki-NLP/Tatoeba-Challenge/master
+TATOEBA_WORK    := ${PWD}/work-tatoeba
+TATOEBA_DATA    := ${TATOEBA_WORK}/data/${PRE}

-TATOEBA_MODEL_CONTAINER = Tatoeba-MT-models
+TATOEBA_MODEL_CONTAINER := Tatoeba-MT-models

-TATOEBA_PARAMS = TRAINSET=Tatoeba-train \
+TATOEBA_PARAMS := TRAINSET=Tatoeba-train \
 		DEVSET=Tatoeba-dev \
 		TESTSET=Tatoeba-test \
 		TESTSET_NAME=Tatoeba-test \
@ -80,72 +106,42 @@ TATOEBA_PARAMS = TRAINSET=Tatoeba-train \
 		MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
 		ALT_MODEL_DIR=tatoeba \
 		SKIP_DATA_DETAILS=1 \
-		MIN_BLEU_SCORE=10 \
+		MIN_BLEU_SCORE=10



+
+ISO639         := iso639
+GET_ISO_CODE   := ${ISO639} -m
+
 ## taken from the Tatoeba-Challenge Makefile
 ## requires local data for setting TATOEBA_LANGS

-# EXCLUDE_LANGGROUPS = afa
-
-ISO639         = iso639
-GET_ISO_CODE   = ${ISO639} -m
-
 # TATOEBA_LANGS       = ${sort ${patsubst %.txt.gz,%,${notdir ${wildcard ${OPUSHOME}/Tatoeba/latest/mono/*.txt.gz}}}}
 # TATOEBA_LANGS3      = ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${TATOEBA_LANGS}}}}
 # TATOEBA_LANGGROUPS  = ${sort ${shell langgroup -p -n ${TATOEBA_LANGS3} 2>/dev/null}}
 # TATOEBA_LANGGROUPS1 = ${shell langgroup -g -n ${TATOEBA_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
 # TATOEBA_LANGGROUPS2 = ${shell langgroup -G -n ${TATOEBA_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}

-OPUS_LANGS3            = ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${OPUSLANGS}}}}
-OPUS_LANG_PARENTS      = ${sort ${shell langgroup -p -n ${OPUS_LANGS3} 2>/dev/null}}
-OPUS_LANG_GRANDPARENTS = ${sort ${shell langgroup -p -n ${OPUS_LANG_PARENTS} 2>/dev/null}}
-OPUS_LANG_GROUPS       = ${sort ${OPUS_LANG_PARENTS} ${OPUS_LANG_GRANDPARENTS}}
-
-# OPUS_LANGGROUPS1 = ${shell langgroup -g -n ${OPUS_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
-# OPUS_LANGGROUPS2 = ${shell langgroup -G -n ${OPUS_LANGS3} 2>/dev/null | tr " " "\n" | grep '+'}
-
+OPUS_LANGS3            := ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${OPUSLANGS}}}}
+OPUS_LANG_PARENTS      := ${sort ${shell langgroup -p -n ${OPUS_LANGS3} 2>/dev/null}}
+OPUS_LANG_GRANDPARENTS := ${sort ${shell langgroup -p -n ${OPUS_LANG_PARENTS} 2>/dev/null}}
+OPUS_LANG_GROUPS       := ${sort ${OPUS_LANG_PARENTS} ${OPUS_LANG_GRANDPARENTS}}


 ###########################################################################################
 # language groups
 ###########################################################################################

-## print language groups
-opus-langgroups:
-	@echo ${OPUS_LANG_PARENTS}
-	@echo ${OPUS_LANG_GRANDPARENTS}
+print-langgroups:
 	@echo ${OPUS_LANG_GROUPS}

-# ## multilingual models for language groups
-# tatoeba-langgroup:
-# 	for g in ${TATOEBA_LANGGROUPS1}; do \
-# 	  l=`echo $$g | sed 's/\+/ /g'`; \
-# 	  n=`langgroup -p $$l | cut -f1 -d' '`; \
-# 	  ${MAKE} LANGPAIRSTR="$$n-$$n" TRGLANGS="$$l" SRCLANGS="$$l" \
-# 		MODELTYPE=transformer FIT_DATA_SIZE=1000000 tatoeba-multilingual-train; \
-# 	done
-
-# ## models for language groups to English
-# tatoeba-group2eng:
-# 	for g in ${TATOEBA_LANGGROUPS1}; do \
-# 	  l=`echo $$g | sed 's/\+/ /g'`; \
-# 	  n=`langgroup -p $$l | cut -f1 -d' '`; \
-# 	  ${MAKE} LANGPAIRSTR="$$n-eng" SRCLANGS="$$l" TRGLANGS=eng \
-# 		MODELTYPE=transformer FIT_DATA_SIZE=1000000 tatoeba-multilingual-train; \
-# 	done
-
-# ## models for English to language groups
-# tatoeba-eng2group:
-# 	for g in ${TATOEBA_LANGGROUPS1}; do \
-# 	  l=`echo $$g | sed 's/\+/ /g'`; \
-# 	  n=`langgroup -p $$l | cut -f1 -d' '`; \
-# 	  ${MAKE} LANGPAIRSTR="eng-$$n" TRGLANGS="$$l" SRCLANGS=eng \
-# 		MODELTYPE=transformer FIT_DATA_SIZE=1000000 tatoeba-multilingual-train; \
-# 	done
-

+## start all jobs for all combinations of 
+## - language groups and English (separate in both directions)
+## - languages in language groups (bi-directional)
+##
+## language groups include parents and grandparents

 tatoeba-langgroups: 
 	${MAKE} tatoeba-group2eng
@ -155,15 +151,15 @@ tatoeba-langgroups:

 #### language-group to English

-GROUP2ENG_JOB     = $(patsubst %,tatoeba-%2eng-job,${OPUS_LANG_GROUPS})
-GROUP2ENG_TRAIN   = $(patsubst %,tatoeba-%2eng-train,${OPUS_LANG_GROUPS})
-GROUP2ENG_EVAL    = $(patsubst %,tatoeba-%2eng-eval,${OPUS_LANG_GROUPS})
-GROUP2ENG_EVALALL = $(patsubst %,tatoeba-%2eng-evalall,${OPUS_LANG_GROUPS})
-GROUP2ENG_DIST    = $(patsubst %,tatoeba-%2eng-dist,${OPUS_LANG_GROUPS})
+GROUP2ENG_TRAIN   := $(patsubst %,tatoeba-%2eng-train,${OPUS_LANG_GROUPS})
+GROUP2ENG_EVAL    := $(patsubst %,tatoeba-%2eng-eval,${OPUS_LANG_GROUPS})
+GROUP2ENG_EVALALL := $(patsubst %,tatoeba-%2eng-evalall,${OPUS_LANG_GROUPS})
+GROUP2ENG_DIST    := $(patsubst %,tatoeba-%2eng-dist,${OPUS_LANG_GROUPS})

-tatoeba-group2eng: ${GROUP2ENG_JOB}
-# tatoeba-group2eng-dist: ${GROUP2ENG_EVAL} ${GROUP2ENG_EVALALL}
-# 	${MAKE} ${GROUP2ENG_DIST}
+## start all jobs for language group to English translation
+tatoeba-group2eng: ${GROUP2ENG_TRAIN}
+
+## only start this if there is a model
 tatoeba-group2eng-dist:
 	for g in ${OPUS_LANG_GROUPS}; do \
 	  if [ `find ${TATOEBA_WORK}/$$g-eng -name '*.npz' | wc -l` -gt 0 ]; then \
@ -173,21 +169,22 @@ tatoeba-group2eng-dist:
 	  fi \
 	done

-${GROUP2ENG_JOB}:
-	${MAKE} $(patsubst %-job,%-train,$@)
-	${MAKE} $(patsubst %-job,%-eval,$@)
-	${MAKE} $(patsubst %-job,%-evalall,$@)
-	${MAKE} $(patsubst %-job,%-dist,$@)
+## this would be easier but does not check whether a model exists
+## --> tries to build it if there is no model
+
+# tatoeba-group2eng-dist: ${GROUP2ENG_EVAL} ${GROUP2ENG_EVALALL}
+# 	${MAKE} ${GROUP2ENG_DIST}
+

 ${GROUP2ENG_TRAIN}:
 	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-train,%,$@)-eng \
-		SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-train,%,$@)}" \
+		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-train,%,$@)})" \
 		TRGLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
 	tatoeba-multilingual-train

 ${GROUP2ENG_EVAL}:
 	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-eval,%,$@)-eng \
-		SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-eval,%,$@)}" \
+		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-eval,%,$@)})" \
 		TRGLANGS=eng \
 		MODELTYPE=transformer \
 		${TATOEBA_PARAMS} \
@ -195,13 +192,13 @@ ${GROUP2ENG_EVAL}:

 ${GROUP2ENG_EVALALL}:
 	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-evalall,%,$@)-eng \
-		SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-evalall,%,$@)}" \
+		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-evalall,%,$@)})" \
 		TRGLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
 	tatoeba-multilingual-eval

 ${GROUP2ENG_DIST}:
 	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%2eng-dist,%,$@)-eng \
-		SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%2eng-dist,%,$@)}" \
+		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%2eng-dist,%,$@)})" \
 		TRGLANGS=eng \
 		MODELTYPE=transformer \
 		${TATOEBA_PARAMS} \
@ -210,15 +207,13 @@ ${GROUP2ENG_DIST}:

 #### English to language group

-ENG2GROUP_JOB     = $(patsubst %,tatoeba-eng2%-job,${OPUS_LANG_GROUPS})
-ENG2GROUP_TRAIN   = $(patsubst %,tatoeba-eng2%-train,${OPUS_LANG_GROUPS})
-ENG2GROUP_EVAL    = $(patsubst %,tatoeba-eng2%-eval,${OPUS_LANG_GROUPS})
-ENG2GROUP_EVALALL = $(patsubst %,tatoeba-eng2%-evalall,${OPUS_LANG_GROUPS})
-ENG2GROUP_DIST    = $(patsubst %,tatoeba-eng2%-dist,${OPUS_LANG_GROUPS})
+ENG2GROUP_TRAIN   := $(patsubst %,tatoeba-eng2%-train,${OPUS_LANG_GROUPS})
+ENG2GROUP_EVAL    := $(patsubst %,tatoeba-eng2%-eval,${OPUS_LANG_GROUPS})
+ENG2GROUP_EVALALL := $(patsubst %,tatoeba-eng2%-evalall,${OPUS_LANG_GROUPS})
+ENG2GROUP_DIST    := $(patsubst %,tatoeba-eng2%-dist,${OPUS_LANG_GROUPS})
+
+tatoeba-eng2group: ${ENG2GROUP_TRAIN}

-tatoeba-eng2group: ${ENG2GROUP_JOB}
-# tatoeba-eng2group-dist: ${ENG2GROUP_EVAL} ${ENG2GROUP_EVALALL}
-#	${MAKE} ${ENG2GROUP_DIST}
 tatoeba-eng2group-dist:
 	for g in ${OPUS_LANG_GROUPS}; do \
 	  if [ `find ${TATOEBA_WORK}/eng-$$g -name '*.npz' | wc -l` -gt 0 ]; then \
@ -228,45 +223,34 @@ tatoeba-eng2group-dist:
 	  fi \
 	done

-tatoeba-eng2group-dist2:
-	for g in ${OPUS_LANG_GROUPS}; do \
-	  if [ `find ${TATOEBA_WORK}/eng-$$g -name '*.npz' | wc -l` -gt 0 ]; then \
-	    mv models-tatoeba/eng-$$g models-tatoeba/eng-$$g-old2; \
-	    ${MAKE} tatoeba-eng2$${g}-dist; \
-	  fi \
-	done
+# tatoeba-eng2group-dist: ${ENG2GROUP_EVAL} ${ENG2GROUP_EVALALL}
+#	${MAKE} ${ENG2GROUP_DIST}


-${ENG2GROUP_JOB}:
-	${MAKE} $(patsubst %-job,%-train,$@)
-	${MAKE} $(patsubst %-job,%-eval,$@)
-	${MAKE} $(patsubst %-job,%-evalall,$@)
-	${MAKE} $(patsubst %-job,%-dist,$@)
-
 ${ENG2GROUP_TRAIN}:
 	${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-train,%,$@) \
-		TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-train,%,$@)}" \
+		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-train,%,$@)})" \
 		SRCLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
 	tatoeba-multilingual-train

 ${ENG2GROUP_EVAL}:
 	${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-eval,%,$@) \
 		SRCLANGS=eng \
-		TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-eval,%,$@)}" \
+		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-eval,%,$@)})" \
 		MODELTYPE=transformer \
 		${TATOEBA_PARAMS} \
 	compare

 ${ENG2GROUP_EVALALL}:
 	${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-evalall,%,$@) \
-		TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-evalall,%,$@)}" \
+		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-evalall,%,$@)})" \
 		SRCLANGS=eng MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
 	tatoeba-multilingual-eval

 ${ENG2GROUP_DIST}:
 	${MAKE} LANGPAIRSTR=eng-$(patsubst tatoeba-eng2%-dist,%,$@) \
 		SRCLANGS=eng \
-		TRGLANGS="${shell langgroup -n $(patsubst tatoeba-eng2%-dist,%,$@)}" \
+		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-eng2%-dist,%,$@)})" \
 		MODELTYPE=transformer \
 		${TATOEBA_PARAMS} \
 	best-dist
@ -275,15 +259,13 @@ ${ENG2GROUP_DIST}:

 #### multilingual language-group (bi-directional)

-LANGGROUP_JOB     = $(patsubst %,tatoeba-%-job,${OPUS_LANG_GROUPS})
-LANGGROUP_TRAIN   = $(patsubst %,tatoeba-%-train,${OPUS_LANG_GROUPS})
-LANGGROUP_EVAL    = $(patsubst %,tatoeba-%-eval,${OPUS_LANG_GROUPS})
-LANGGROUP_EVALALL = $(patsubst %,tatoeba-%-evalall,${OPUS_LANG_GROUPS})
-LANGGROUP_DIST    = $(patsubst %,tatoeba-%-dist,${OPUS_LANG_GROUPS})
+LANGGROUP_TRAIN   := $(patsubst %,tatoeba-%-train,${OPUS_LANG_GROUPS})
+LANGGROUP_EVAL    := $(patsubst %,tatoeba-%-eval,${OPUS_LANG_GROUPS})
+LANGGROUP_EVALALL := $(patsubst %,tatoeba-%-evalall,${OPUS_LANG_GROUPS})
+LANGGROUP_DIST    := $(patsubst %,tatoeba-%-dist,${OPUS_LANG_GROUPS})
+
+tatoeba-langgroup: ${LANGGROUP_TRAIN}

-tatoeba-langgroup: ${LANGGROUP_JOB}
-# tatoeba-langgroup-dist: ${LANGGROUP_EVAL} ${LANGGROUP_EVALALL}
-#	${MAKE} ${LANGGROUP_DIST}
 tatoeba-langgroup-dist:
 	for g in ${OPUS_LANG_GROUPS}; do \
 	  if [ `find ${TATOEBA_WORK}/$$g-$$g -name '*.npz' | wc -l` -gt 0 ]; then \
@ -293,40 +275,36 @@ tatoeba-langgroup-dist:
 	  fi \
 	done

-
-${LANGGROUP_JOB}:
-	${MAKE} $(patsubst %-job,%-train,$@)
-	${MAKE} $(patsubst %-job,%-eval,$@)
-	${MAKE} $(patsubst %-job,%-evalall,$@)
-	${MAKE} $(patsubst %-job,%-dist,$@)
+# tatoeba-langgroup-dist: ${LANGGROUP_EVAL} ${LANGGROUP_EVALALL}
+#	${MAKE} ${LANGGROUP_DIST}


 ${LANGGROUP_TRAIN}:
 	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-train,%,$@)-$(patsubst tatoeba-%-train,%,$@) \
-		TRGLANGS="${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)}" \
-		SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)}" \
+		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
+		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
 		MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
 	tatoeba-multilingual-train

 ${LANGGROUP_EVAL}:
 	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-train,%,$@)-$(patsubst tatoeba-%-eval,%,$@) \
-		SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%-eval,%,$@)}" \
-		TRGLANGS="${shell langgroup -n $(patsubst tatoeba-%-eval,%,$@)}" \
+		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
+		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
 		MODELTYPE=transformer \
 		${TATOEBA_PARAMS} \
 	compare

 ${LANGGROUP_EVALALL}:
 	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-train,%,$@)-$(patsubst tatoeba-%-evalall,%,$@) \
-		TRGLANGS="${shell langgroup -n $(patsubst tatoeba-%-evalall,%,$@)}" \
-		SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%-evalall,%,$@)}" \
+		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
+		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
 		MODELTYPE=transformer FIT_DATA_SIZE=1000000 \
 	tatoeba-multilingual-eval

 ${LANGGROUP_DIST}:
 	${MAKE} LANGPAIRSTR=$(patsubst tatoeba-%-train,%,$@)-$(patsubst tatoeba-%-dist,%,$@) \
-		SRCLANGS="${shell langgroup -n $(patsubst tatoeba-%-dist,%,$@)}" \
-		TRGLANGS="${shell langgroup -n $(patsubst tatoeba-%-dist,%,$@)}" \
+		TRGLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
+		SRCLANGS="$(filter ${OPUS_LANGS3},${shell langgroup -n $(patsubst tatoeba-%-train,%,$@)})" \
 		MODELTYPE=transformer \
 		${TATOEBA_PARAMS} \
 	best-dist
@ -624,22 +602,7 @@ tatoeba-multilingual-testsets:
 ## generic target for tatoeba challenge jobs
 %-tatoeba: ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${SRCEXT}.labels \
 	   ${TATOEBA_DATA}/Tatoeba-train.${LANGPAIRSTR}.clean.${TRGEXT}.labels
-	${MAKE} TRAINSET=Tatoeba-train \
-		DEVSET=Tatoeba-dev \
-		TESTSET=Tatoeba-test \
-		TESTSET_NAME=Tatoeba-test \
-		SMALLEST_TRAINSIZE=1000 \
-		USE_REST_DEVDATA=0 \
-		HELDOUTSIZE=0 \
-		DEVSIZE=5000 \
-		TESTSIZE=10000 \
-		DEVMINSIZE=200 \
-		WORKHOME=${TATOEBA_WORK} \
-		MODELSHOME=${PWD}/models-tatoeba \
-                MODELS_URL=https://object.pouta.csc.fi/${TATOEBA_MODEL_CONTAINER} \
-		MODEL_CONTAINER=${TATOEBA_MODEL_CONTAINER} \
-		ALT_MODEL_DIR=tatoeba \
-		SKIP_DATA_DETAILS=1 \
+	${MAKE} ${TATOEBA_PARAMS} \
 		LANGPAIRSTR=${LANGPAIRSTR} \
 		SRCLANGS="${shell cat ${word 1,$^} | sed 's/ *$$//;s/^ *//'}" \
 		TRGLANGS="${shell cat ${word 2,$^} | sed 's/ *$$//;s/^ *//'}" \