small fixes for tatoeba models

This commit is contained in:
Joerg Tiedemann 2022-02-27 22:42:13 +02:00
parent c421fbdb15
commit 5beb4e58aa
2 changed files with 28 additions and 2 deletions

View File

@ -53,9 +53,21 @@ elg-eval:
done
for l in ${ELG_EU_SELECTED_MULTILANG}; do \
${MAKE} MODELTYPE=transformer-big SRCLANGS="$$l" TRGLANGS=eng eval-bt-tatoeba; \
${MAKE} MODELTYPE=transformer-big SRCLANGS="$$l" TRGLANGS=eng tatoeba-multilingual-eval-bt; \
${MAKE} MODELTYPE=transformer-big SRCLANGS="$$l" TRGLANGS=eng eval-testsets-bt-tatoeba; \
${MAKE} MODELTYPE=transformer-big TRGLANGS="$$l" SRCLANGS=eng eval-bt-tatoeba; \
${MAKE} MODELTYPE=transformer-big TRGLANGS="$$l" SRCLANGS=eng tatoeba-multilingual-eval-bt; \
${MAKE} MODELTYPE=transformer-big TRGLANGS="$$l" SRCLANGS=eng eval-testsets-bt-tatoeba; \
done
## only separate languages in multilingual models (set of individual languages)
##
## Runs the tatoeba-multilingual-eval-bt goal in a sub-make twice for every
## entry of ELG_EU_SELECTED_MULTILANG: first with the entry as SRCLANGS and
## eng as TRGLANGS, then with the direction swapped. MODELTYPE is always
## transformer-big.
##
## The sub-make calls are chained with ';', so a failing evaluation does not
## stop the remaining entries (assuming the shell is not run with -e;
## TODO confirm that no .SHELLFLAGS override exists elsewhere).
##
## A previously disabled alternative goal was: multieval-bt-tatoeba
## (recorded here without a trailing backslash, because a backslash at the
## end of a comment line continues the comment onto the following line).
.PHONY: elg-multieval
elg-multieval:
	for l in ${ELG_EU_SELECTED_MULTILANG}; do \
	  ${MAKE} MODELTYPE=transformer-big SRCLANGS="$$l" TRGLANGS=eng tatoeba-multilingual-eval-bt; \
	  ${MAKE} MODELTYPE=transformer-big TRGLANGS="$$l" SRCLANGS=eng tatoeba-multilingual-eval-bt; \
	done
elg-eng2all:
@ -128,6 +140,16 @@ elg-eng2cel:
CLEAN_DEVDATA_TYPE=clean \
tatoeba-eng2cel-trainjob-bt
## Runs the tatoeba-por2eng-trainjob-bt goal in a sub-make with
## MODELTYPE=transformer-big.
##   MARIAN_EXTRA           is set to --no-restore-corpus (presumably forwarded
##                          to the Marian trainer; confirm in the training rules).
##   DATA_PREPARE_HPCPARAMS overrides CPUJOB_HPC_CORES, CPUJOB_HPC_MEM and
##                          CPUJOB_HPC_DISK; the single quotes keep the three
##                          assignments together as one value.
## Declared phony because the target names a command, not a file.
.PHONY: elg-por2eng
elg-por2eng:
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-por2eng-trainjob-bt
## Runs the tatoeba-lav2eng-trainjob-bt goal in a sub-make with
## MODELTYPE=transformer-big and MARIAN_EXTRA=--no-restore-corpus
## (presumably forwarded to the Marian trainer; confirm in the training rules).
## Declared phony because the target names a command, not a file.
.PHONY: elg-lav2eng
elg-lav2eng:
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		tatoeba-lav2eng-trainjob-bt
elg-ara2eng:
${MAKE} MODELTYPE=transformer-big \

View File

@ -273,14 +273,18 @@ MAX_TRGLANGS ?= 7000
## Language-group helpers, all meant to be used through $(call ...).
## They must stay recursively expanded ('='), because they read $(1) and $(2)
## at call time.

# Disabled earlier variant that filtered the group against OPUS_LANGS3:
# find-langgroup = $(filter ${OPUS_LANGS3},$(sort ${shell langgroup $(1) | xargs iso639 -m -n} ${1} ${2}))

## OLD (singleonly - only allows single languages)
##   $(1) = language or language-group ID, $(2) = optional extra languages.
##   Pipes the output of `langgroup $(1)` through `xargs iso639 -m -n`, adds
##   $(1) and $(2), de-duplicates with $(sort), and returns the words of
##   TATOEBA_LANGS that match any word in that set.
##   (langgroup and iso639 are external commands; presumably they list the
##   group members and normalise the codes. TODO confirm.)
find-langgroup-singleonly = $(filter $(sort ${shell langgroup $(1) | xargs iso639 -m -n} ${1} ${2}),${TATOEBA_LANGS})

## NEW: also splits on '+' to allow for multiple languages
##   Same as find-langgroup-singleonly, except that every '+' in $(1) is
##   replaced by a space first, so "aaa+bbb" is handled as two IDs.
##   This is the only definition of find-langgroup; a former duplicate with
##   the singleonly body was shadowed by this one and has been removed.
find-langgroup = $(filter $(sort ${shell langgroup $(subst +, ,$(1)) | xargs iso639 -m -n} $(subst +, ,${1}) ${2}),${TATOEBA_LANGS})

## Source / target side of a language-pair ID.
##   $(1) = pair ID; every '2' and every '-' in it is turned into a space,
##   then the first word (source) or the last word (target) is expanded with
##   find-langgroup. $(2) is passed on as the extra languages.
##   Note: every '2' is treated as a separator, including one inside an ID.
find-srclanggroup = $(call find-langgroup,$(firstword ${subst -, ,${subst 2, ,${1}}}),${2})
find-trglanggroup = $(call find-langgroup,$(lastword ${subst -, ,${subst 2, ,${1}}}),${2})

## Sorted, de-duplicated union of the source group, the target group and the
## extra languages in $(2) for the pair ID in $(1).
find-langgroup-pair = $(sort $(call find-srclanggroup,${1}) $(call find-trglanggroup,${1}) ${2})
## print languages in this set
tatoeba-%-langs:
-( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-langs,%,$@))); \