bugs fixed

This commit is contained in:
Joerg Tiedemann 2022-02-08 00:16:23 +02:00
parent 60624f7ba1
commit ee6eb8551f
5 changed files with 34 additions and 30 deletions

View File

@ -417,12 +417,12 @@ endif
## add training data for each language combination
## and put it together in local space
${LOCAL_TRAIN_SRC}: ${LOCAL_TRAINDATA_DEPENDENCIES}
mkdir -p ${dir $@}
echo "" > ${dir $@}README.md
echo "# ${notdir ${TRAIN_BASE}}" >> ${dir $@}README.md
echo "" >> ${dir $@}README.md
rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
-for s in ${SRCLANGS}; do \
@mkdir -p ${dir $@}
@echo "" > ${dir $@}README.md
@echo "# ${notdir ${TRAIN_BASE}}" >> ${dir $@}README.md
@echo "" >> ${dir $@}README.md
@rm -f ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG}
-@for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ ! `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
if [ "${SKIP_SAME_LANG}" == "1" ] && [ "$$s" == "$$t" ]; then \
@ -437,7 +437,8 @@ ${LOCAL_TRAIN_SRC}: ${LOCAL_TRAINDATA_DEPENDENCIES}
done \
done
ifeq (${USE_REST_DEVDATA},1)
if [ -e ${DEV_SRC}.notused.gz ]; then \
@if [ -e ${DEV_SRC}.notused.gz ]; then \
echo "..... add unused devdata to training data"; \
echo "* unused dev/test data is added to training data" >> ${dir $@}README.md; \
${GZIP} -cd < ${DEV_SRC}.notused.gz >> ${LOCAL_TRAIN_SRC}; \
${GZIP} -cd < ${DEV_TRG}.notused.gz >> ${LOCAL_TRAIN_TRG}; \

2
lib/env/mahti.mk vendored
View File

@ -4,7 +4,7 @@
#
DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=8g
DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g
DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=128 CPUJOB_HPC_JOBS=20 CPUJOB_HPC_MEM=128g

2
lib/env/puhti.mk vendored
View File

@ -4,7 +4,7 @@
#
DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=8g CPUJOB_HPC_DISK=500
DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g CPUJOB_HPC_DISK=500
DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=4 CPUJOB_HPC_JOBS=2 CPUJOB_HPC_MEM=64g CPUJOB_HPC_DISK=500

View File

@ -45,3 +45,6 @@ elg-eng2missing:
for l in est lav ron hbs sqi spa fra ita por zlw ara heb deu fin; do \
${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-trainjob-bt; \
done
## elg-eng2slv: delegate to a sub-make that builds the target
## tatoeba-eng2slv-trainjob-bt-separate-spm with MODELTYPE=transformer-big.
## The name is not a file, so it is declared phony; the recipe is a single
## command and must not end in a line continuation (a trailing `; \` would
## glue whatever line follows onto this command).
.PHONY: elg-eng2slv
elg-eng2slv:
	${MAKE} MODELTYPE=transformer-big tatoeba-eng2slv-trainjob-bt-separate-spm

View File

@ -132,17 +132,18 @@ all:
${MAKE} eval-testsets-tatoeba
## start unidirectional training job
## - make data first, then submit a job
.PHONY: tatoeba-job job slurmjob
job slurmjob tatoeba-job:
## tatoeba-job: removes any existing train-and-eval.submit file and then
## runs two sub-makes in order: tatoeba-prepare first, all-job-tatoeba second
.PHONY: tatoeba-job
tatoeba-job:
rm -f train-and-eval.submit
${MAKE} tatoeba-prepare
${MAKE} all-job-tatoeba
## start jobs in both translation directions
.PHONY: tatoeba-bidirectional-job bidirectional-job
bidirectional-job tatoeba-bidirectional-job:
.PHONY: tatoeba-bidirectional-job
tatoeba-bidirectional-job:
${MAKE} tatoeba-prepare
${MAKE} all-job-tatoeba
ifneq (${SRCLANGS},${TRGLANGS})
@ -164,8 +165,8 @@ endif
## prepare data (fetch data and extract language labels)
.PHONY: prepare tatoeba-prepare
prepare tatoeba-prepare: ${TATOEBA_LANGIDS_TRAINONLY}
.PHONY: tatoeba-prepare
tatoeba-prepare: ${TATOEBA_LANGIDS_TRAINONLY}
${MAKE} fetch-datasets
${MAKE} langlabel-files
@ -185,15 +186,17 @@ tatoeba-train: train
tatoeba-eval: eval
tatoeba-compare: compare
## a file that contains langids without test data
.PHONY: trainonly_langids
trainonly_langids: ${TATOEBA_LANGIDS_TRAINONLY}
## fetch the essential data and get labels for language variants
.PHONY: tatoeba-data tatoeba-labels
tatoeba-labels: ${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.src \
${WORKHOME}/${LANGPAIRSTR}/${DATASET}-langlabels.trg
## a file that contains langids without test data
.PHONY: trainonly_langids
trainonly_langids: ${TATOEBA_LANGIDS_TRAINONLY}
print-langgroups:
@echo ${OPUS_LANG_GROUPS}
@ -251,9 +254,6 @@ MAX_TRGLANGS ?= 7000
# find-langgroup = $(filter ${OPUS_LANGS3},\
# $(sort ${shell langgroup $(1) | xargs iso639 -m -n} ${1} ${2}))
# find-langgroup = $(filter ${OPUS_LANGS3},$(sort ${shell langgroup $(1)} ${1} ${2}))
# find-langgroup = $(sort ${shell langgroup $(1)} ${1} ${2})
## find-langgroup: $(1) is handed to the external `langgroup` command
##   (presumably a language or language-group ID - confirm against that tool),
##   $(2) is an optional list of additional IDs.
##   The output of `langgroup $(1) | xargs iso639 -m -n`, plus $(1) and $(2),
##   is sorted/de-duplicated and used as the pattern list of $(filter ...),
##   so the result is the words of ${TATOEBA_LANGS} that match one of them.
##   Defined with `=`, so the shell pipeline runs each time it is expanded.
find-langgroup = $(filter $(sort ${shell langgroup $(1) | xargs iso639 -m -n} ${1} ${2}),${TATOEBA_LANGS})
## find-srclanggroup: replaces every `2` and every `-` in $(1) by a space and
##   calls find-langgroup on the FIRST resulting word; $(2) is passed through.
find-srclanggroup = $(call find-langgroup,$(firstword ${subst -, ,${subst 2, ,${1}}}),${2})
## find-trglanggroup: same splitting of $(1), but calls find-langgroup on the
##   LAST resulting word; $(2) is passed through.
find-trglanggroup = $(call find-langgroup,$(lastword ${subst -, ,${subst 2, ,${1}}}),${2})
@ -272,15 +272,15 @@ tatoeba-%-langs:
## shortcut to start a target only if certain language group limits are met
## (maximum and minimum number of languages)
%-groupsize-limits:
if [ ${words ${SRCLANGS}} -ge ${MIN_SRCLANGS} ]; then \
if [ ${words ${TRGLANGS}} -ge ${MIN_TRGLANGS} ]; then \
if [ ${words ${SRCLANGS}} -le ${MAX_SRCLANGS} ]; then \
if [ ${words ${TRGLANGS}} -le ${MAX_TRGLANGS} ]; then \
${MAKE} ${@:-groupsize-limits=}; \
fi \
fi \
fi \
fi
@if [ ${words ${SRCLANGS}} -ge ${MIN_SRCLANGS} ]; then \
if [ ${words ${TRGLANGS}} -ge ${MIN_TRGLANGS} ]; then \
if [ ${words ${SRCLANGS}} -le ${MAX_SRCLANGS} ]; then \
if [ ${words ${TRGLANGS}} -le ${MAX_TRGLANGS} ]; then \
${MAKE} ${@:-groupsize-limits=}; \
fi \
fi \
fi \
fi
## create data sets (also works for language groups)
tatoeba-%-data: