elg recipes

This commit is contained in:
Joerg Tiedemann 2022-03-05 23:52:22 +02:00
parent 5beb4e58aa
commit c95b160db7
7 changed files with 156 additions and 20 deletions

View File

@ -37,10 +37,17 @@ DATA_IS_SHUFFLED = 1
## Tatoeba specific data sets
## (duplicate stale copies of these four assignments removed)
TATOEBA_DATASET   := opusTC${TATOEBA_VERSION_NOHYPHEN}
TATOEBA_TRAINSET  := Tatoeba-train-${TATOEBA_VERSION}
TATOEBA_DEVSET    := Tatoeba-dev-${TATOEBA_VERSION}
TATOEBA_TESTSET   := Tatoeba-test-${TATOEBA_VERSION}

## the name is by default the same as the data set specifier
## (for some tasks the name can differ, i.e. tuning of multilingual models)
TATOEBA_DEVSET_NAME   := ${TATOEBA_DEVSET}
TATOEBA_TESTSET_NAME  := ${TATOEBA_TESTSET}
TATOEBA_TRAINSET_NAME := ${TATOEBA_TRAINSET}
## change data set names
## DATASET will also be the base name of the model file
@ -48,9 +55,9 @@ DATASET := ${TATOEBA_DATASET}
TRAINSET := ${TATOEBA_TRAINSET}
DEVSET   := ${TATOEBA_DEVSET}
TESTSET  := ${TATOEBA_TESTSET}
## *_NAME variables go through the TATOEBA_*_NAME indirection so that
## tasks with task-specific set names (e.g. tuning) pick up the override;
## the stale direct assignments to ${TATOEBA_*SET} were removed.
DEVSET_NAME   := ${TATOEBA_DEVSET_NAME}
TESTSET_NAME  := ${TATOEBA_TESTSET_NAME}
TRAINSET_NAME := ${TATOEBA_TRAINSET_NAME}
##
BACKTRANS_HOME = ${PWD}/back-translate

View File

@ -120,6 +120,13 @@ endif
## print the Tatoeba data sets for every language pair in SRCLANGS x TRGLANGS
## ('-' prefix: keep going even if an individual pair fails)
.PHONY: print-datasets
print-datasets:
	-@for s in ${SRCLANGS}; do \
	  for t in ${TRGLANGS}; do \
	    ${MAKE} SRC=$$s TRG=$$t print-datasets-current-langpair; \
	  done; \
	done
## print the data set names selected for the current language pair (SRC/TRG):
## the raw Tatoeba train set name and the effective TRAINSET, followed by
## the full data listing (continues below)
print-datasets-current-langpair:
	@echo ${TATOEBA_TRAINSET}
	@echo ${TRAINSET}
	@echo "all data:"

4
lib/env/puhti.mk vendored
View File

@ -8,8 +8,8 @@ DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=5
DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=4 CPUJOB_HPC_JOBS=2 CPUJOB_HPC_MEM=64g CPUJOB_HPC_DISK=500
## CSC billing project for batch jobs on puhti
## (previously used project kept as a commented alternative;
##  the stale duplicate active assignment was removed)
# CSCPROJECT = project_2002688
CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}
GPU = v100
HPC_QUEUE = small

View File

@ -59,15 +59,62 @@ elg-eval:
${MAKE} MODELTYPE=transformer-big TRGLANGS="$$l" SRCLANGS=eng tatoeba-multilingual-eval-bt; \
${MAKE} MODELTYPE=transformer-big TRGLANGS="$$l" SRCLANGS=eng eval-testsets-bt-tatoeba; \
done
## only separate languages in multilingual models (set of individual languages)
## evaluate each selected language paired with eng, in both directions;
## the unterminated loop is now closed and stale residue of the old
## elg-eval-zle recipe was removed from this rule
elg-multieval:
	for l in ${ELG_EU_SELECTED_MULTILANG}; do \
	  ${MAKE} MODELTYPE=transformer-big SRCLANGS="$$l" TRGLANGS=eng tatoeba-multilingual-eval-bt; \
	  ${MAKE} MODELTYPE=transformer-big TRGLANGS="$$l" SRCLANGS=eng tatoeba-multilingual-eval-bt; \
	done
## run the three evaluation flavours (eval, multieval, eval-testsets)
## for the Slavic-related group models, back-translation variants (-bt)
elg-eval-zle:
	for pair in zle2zle zlw2zle zle2fin zle2zlw; do \
	  ${MAKE} MODELTYPE=transformer-big tatoeba-$${pair}-eval-bt; \
	  ${MAKE} MODELTYPE=transformer-big tatoeba-$${pair}-multieval-bt; \
	  ${MAKE} MODELTYPE=transformer-big tatoeba-$${pair}-eval-testsets-bt; \
	done
## evaluate the pivot-based Ukrainian models
## (-pbt / -pft presumably pivot-based back-/forward-translation variants
##  — TODO confirm against the tatoeba recipe definitions)
elg-pivot-eval:
	for l in dan swe fin deu ron tur; do \
	  ${MAKE} tatoeba-$${l}2ukr-eval-pbt; \
	  ${MAKE} tatoeba-ukr2$${l}-eval-pft; \
	done
	${MAKE} SRCLANGS="ces slk" TRGLANGS=ukr eval-tatoeba
	${MAKE} SRCLANGS="ces slk" TRGLANGS=ukr tatoeba-multilingual-eval
	${MAKE} TRGLANGS="ces slk" SRCLANGS=ukr eval-tatoeba
	${MAKE} TRGLANGS="ces slk" SRCLANGS=ukr tatoeba-multilingual-eval
## per-language-pair Ukrainian training jobs, one convenience target each;
## X2ukr uses the -pbt job, ukr2X uses the -pft job
## (pbt/pft semantics inferred from the target-name suffixes — TODO confirm)
elg-dan2ukr:
	${MAKE} tatoeba-dan2ukr-trainjob-pbt
	${MAKE} tatoeba-ukr2dan-trainjob-pft
elg-swe2ukr:
	${MAKE} tatoeba-swe2ukr-trainjob-pbt
	${MAKE} tatoeba-ukr2swe-trainjob-pft
elg-fin2ukr:
	${MAKE} tatoeba-fin2ukr-trainjob-pbt
	${MAKE} tatoeba-ukr2fin-trainjob-pft
elg-deu2ukr:
	${MAKE} tatoeba-deu2ukr-trainjob-pbt
	${MAKE} tatoeba-ukr2deu-trainjob-pft
elg-slk2ukr:
	${MAKE} tatoeba-slk2ukr-trainjob-pbt
	${MAKE} tatoeba-ukr2slk-trainjob-pft
## multilingual ces+slk <-> ukr model (one job per direction)
elg-ces_slk2ukr:
	${MAKE} SRCLANGS=ukr TRGLANGS="ces slk" tatoeba-job-pft
	${MAKE} TRGLANGS=ukr SRCLANGS="ces slk" tatoeba-job-pbt
elg-ron2ukr:
	${MAKE} tatoeba-ron2ukr-trainjob-pbt
	${MAKE} tatoeba-ukr2ron-trainjob-pft
elg-tur2ukr:
	${MAKE} tatoeba-tur2ukr-trainjob-pbt
	${MAKE} tatoeba-ukr2tur-trainjob-pft
elg-eng2all:
@ -127,12 +174,58 @@ elg-all2eng-eval:
done
## tune the multilingual zle->eng model for the ukr->eng pair
## (TUNE_SRC/TUNE_TRG select the language pair for the langtune job)
elg-tune4ukr2eng:
	${MAKE} MODELTYPE=transformer-big TUNE_SRC=ukr TUNE_TRG=eng tatoeba-zle2eng-langtunejob
## train big-transformer models between East Slavic (zle) and West/South
## Slavic (zlw/zls), both directions, with back-translations (-bt);
## each job gets --no-restore-corpus and a larger scratch disk for data prep
elg-zle2zlx:
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-zle2zlw-trainjob-bt
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-zlw2zle-trainjob-bt
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-zle2zls-trainjob-bt
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-zls2zle-trainjob-bt
## train the East Slavic multilingual model (zle -> zle) with back-translations
elg-zle2zle:
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-zle2zle-trainjob-bt
## train North Germanic (gmq) <-> East Slavic (zle), both directions
## (the target name mentions only gmq2zle but zle2gmq is trained as well)
elg-gmq2zle:
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-gmq2zle-trainjob-bt
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-zle2gmq-trainjob-bt
## train East Slavic (zle) -> Finnish with back-translations
elg-zle2fin:
	${MAKE} MODELTYPE=transformer-big tatoeba-zle2fin-trainjob-bt
## train the pan-Slavic multilingual model (sla -> sla) with back-translations
elg-sla2sla:
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		tatoeba-sla2sla-trainjob-bt
elg-eng2cel:
${MAKE} MODELTYPE=transformer-big \
@ -191,6 +284,13 @@ elg-multi2eng:
TRGLANGS=eng SRCLANGS="por glg" \
tatoeba-job-bt
## train eng -> {cat, oci, spa}; despite the target name the model also
## covers Catalan and Occitan (multilingual target side)
elg-eng2spa:
	${MAKE} MODELTYPE=transformer-big \
		MARIAN_EXTRA=--no-restore-corpus \
		DATA_PREPARE_HPCPARAMS='CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=1000' \
		SRCLANGS=eng TRGLANGS="cat oci spa" \
		tatoeba-job-bt
elg-ces2eng:
${MAKE} MODELTYPE=transformer-big \
@ -226,3 +326,25 @@ elg-eng2fin:
MARIAN_EXTRA=--no-restore-corpus \
tatoeba-eng2fin-trainjob-bt
## list released Tatoeba-Challenge models involving Ukrainian that look good:
## score matching [3-9][0-9].[0-9] (i.e. 30.0-99.9) and a tab-delimited count
## field with at least 3 digits (>= 100 lines); 'tuned4' models are excluded
## NOTE(review): 'rev | uniq -f5 | rev' seems to de-duplicate on trailing
## fields of each row — confirm the intended de-duplication key
good-ukr-models:
	@grep '^[^ ]*-ukr' ~/research/Tatoeba-Challenge/models/released-model-results-all.txt | \
	grep -v 'tuned4' | rev | uniq -f5 | rev | grep '[3-9][0-9]\.[0-9]' | grep -P '\t[0-9]{3,}\t'
	@grep '^ukr-' ~/research/Tatoeba-Challenge/models/released-model-results-all.txt | \
	grep -v 'tuned4' | rev | uniq -f5 | rev | grep '[3-9][0-9]\.[0-9]' | grep -P '\t[0-9]{3,}\t'
## render the good-ukr-models listing as a markdown table:
## first sed turns field separators into ' | ' and adds the row borders,
## second sed turns release URLs into markdown links;
## use ${MAKE} (not a literal 'make') so flags and the jobserver propagate
ukr-model-table:
	${MAKE} -s good-ukr-models |\
	cut -f1-4 |\
	sed 's/ / | /g;s/^/| /;s/$$/ |/' |\
	sed 's#\(https://object.pouta.csc.fi/Tatoeba-MT-models/\)\(.*\).zip#[\2](\1\2.zip)#'
## like ukr-model-table but with a language-name column prepended
## (external 'iso639' tool maps language codes to names — not stdlib);
## temp files derive from $@ so parallel runs of different targets don't race;
## use ${MAKE} (not a literal 'make') so flags and the jobserver propagate
ukr-model-table2:
	${MAKE} -s good-ukr-models | cut -f1-4 > $@.tmp1
	cut -f1 $@.tmp1 | xargs iso639 -p | sed "s/^\"//;s/\"$$//;s#\" \"#\n#g" > $@.tmp2
	paste $@.tmp2 $@.tmp1 |\
	sed 's/ / | /g;s/^/| /;s/$$/ |/' |\
	sed 's#\(https://object.pouta.csc.fi/Tatoeba-MT-models/\)\(.*\).zip#[\2](\1\2.zip)#'
	rm -f $@.tmp*

View File

@ -244,8 +244,8 @@ OPUS_LANG_GRANDPARENTS := ${sort ${shell langgroup -p -n ${OPUS_LANG_PARENTS} 2>
OPUS_LANG_GROUPS := ${sort ${OPUS_LANG_PARENTS} ${OPUS_LANG_GRANDPARENTS}}
## main entry point: prepare the Tatoeba data, then run the full pipeline;
## 'tatoeba-all' is an alias (stale duplicate of the old single-target rule
## removed to avoid the overriding-recipe warning)
.PHONY: tatoeba tatoeba-all
tatoeba tatoeba-all:
	${MAKE} tatoeba-prepare
	${MAKE} all-tatoeba

View File

@ -102,7 +102,7 @@ tatoeba-%-langtune:
( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-langtune,%,$@))); \
t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-langtune,%,$@))); \
if [ -d ${WORKHOME}/$$s-$$t ]; then \
${MAKE} LANGPAIRSTR=$$s-$$t ${TATOEBA_LANGTUNE_PARAMS} tatoeba; \
${MAKE} LANGPAIRSTR=$$s-$$t ${TATOEBA_LANGTUNE_PARAMS} tatoeba-all; \
fi )
# SRCLANGS="${call find-srclanggroup,${patsubst tatoeba-%-langtune,%,$@},${PIVOT}}" \

View File

@ -143,7 +143,7 @@ endif
## pre-process data
## NOTE(review): the multi-target variant passes ${SRC} as the extra argument
## (it previously passed ${TRG}); the stale superseded assignment was removed —
## confirm the preprocessing script expects the source language here
ifeq (${MULTI_TARGET_MODEL},1)
PREPROCESS_ARGS = ${PIVOT} ${SRC} ${MODELDIR}/source.spm
else
PREPROCESS_ARGS = ${PIVOT} ${MODELDIR}/source.spm
endif