local scratch in tatoeba recipes

This commit is contained in:
Joerg Tiedemann 2022-02-06 22:45:11 +02:00
parent 176977df32
commit 2d2663a332
10 changed files with 175 additions and 55 deletions

View File

@ -102,6 +102,8 @@ SHUFFLE_DATA = 1
## devtest data is shuffled by default
SHUFFLE_DEVDATA = 1
## shuffle multilingual training data to mix language examples
SHUFFLE_MULTILINGUAL_DATA = 1
##----------------------------------------------------------------------
## set FIT_DATA_SIZE to a specific value to fit the training data

View File

@ -23,8 +23,11 @@ DEVSIZE ?= 5000
TESTSIZE ?= 10000
DEVMINSIZE ?= 200
USE_REST_DEVDATA = 0
DATA_IS_SHUFFLED = 1
USE_REST_DEVDATA = 0
SHUFFLE_DATA = 0
SHUFFLE_DEVDATA = 1
SHUFFLE_MULTILINGUAL_DATA = 1
DATA_IS_SHUFFLED = 1
## this will be the base name of the model file
TATOEBA_DATASET := opusTC${TATOEBA_VERSION_NOHYPHEN}
@ -89,15 +92,18 @@ RELEASED_TATOEBA_DATA_FILE = tatoeba/released-bitexts-${TATOEBA_VERSION}.txt
## all released language pairs with test sets > 200 test pairs
## also extract all source languages that are available for a given target language
## and vice versa
TATOEBA_RELEASED_DATA = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_AVAILABLE_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_RELEASED_DATA := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-min200.txt | cut -f1)
TATOEBA_AVAILABLE_TRG := ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_DATA}}}}}
TATOEBA_AVAILABLE_SRC := ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_DATA}}}}}
## extract language pairs for a specific subset
TATOEBA_SUBSET = lower
TATOEBA_RELEASED_SUBSET = $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_AVAILABLE_SUBSET_TRG = ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_SRC = ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_SUBSET := lower
TATOEBA_RELEASED_SUBSET := $(shell wget -qq -O - ${TATOEBA_DATA_COUNT_BASE}-${TATOEBA_SUBSET}.txt | cut -f1)
TATOEBA_AVAILABLE_SUBSET_TRG := ${sort ${filter-out ${SRC},${subst -, ,${filter %-${SRC} ${SRC}-%,${TATOEBA_RELEASED_SUBSET}}}}}
TATOEBA_AVAILABLE_SUBSET_SRC := ${sort ${filter-out ${TRG},${subst -, ,${filter %-${TRG} ${TRG}-%,${TATOEBA_RELEASED_SUBSET}}}}}
## all available language pairs
## (download the file once and keep it here to get the language pairs in the release)
@ -106,6 +112,15 @@ TATOEBA_LANGPAIRS := ${shell if [ ! -e ${RELEASED_TATOEBA_DATA_FILE} ]; then \
fi; \
tail -n +2 ${RELEASED_TATOEBA_DATA_FILE} | cut -f1 }
## all available languages in tatoeba
TATOEBA_LANGS := $(sort $(subst -, ,${TATOEBA_LANGPAIRS}))
## SRCLANGS converted to macro languages used in tatoeba releases
## and all non-available languages filtered out
MACRO_SRCLANGS := $(filter ${sort ${shell iso639 -m -n ${SRCLANGS}}},${TATOEBA_LANGS})
MACRO_TRGLANGS := $(filter ${sort ${shell iso639 -m -n ${TRGLANGS}}},${TATOEBA_LANGS})
WIKILANGS ?= ${notdir ${wildcard backtranslate/wiki-iso639-3/*}}

View File

@ -446,6 +446,15 @@ ifeq (${USE_REST_DEVDATA},1)
${GZIP} -cd < ${DEV_TRG}.notused.gz >> ${LOCAL_TRAIN_TRG}; \
fi
endif
ifeq (${SHUFFLE_MULTILINGUAL_DATA},1)
ifneq ($(words ${SRCLANGS} ${TRGLANGS}),2)
@echo ".... shuffle multilingual data"
@paste ${LOCAL_TRAIN_SRC} ${LOCAL_TRAIN_TRG} | ${SHUFFLE} > ${LOCAL_TRAIN_SRC}.shuffled
@cut -f1 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_SRC}
@cut -f2 ${LOCAL_TRAIN_SRC}.shuffled > ${LOCAL_TRAIN_TRG}
@rm -f ${LOCAL_TRAIN_SRC}.shuffled
endif
endif
## everything is done in the target above

View File

@ -89,10 +89,6 @@ endif
SUBMIT_PREFIX ?= submit
ifdef LOCAL_SCRATCH
TMPDIR := ${LOCAL_SCRATCH}
endif
ifndef TMPDIR
TMPDIR := /tmp
endif

13
lib/env/mahti.mk vendored
View File

@ -1,11 +1,11 @@
# -*-makefile-*-
#
# environment on mathi@CSC
# environment on mahti@CSC
#
DATAJOB_HPCPARAMS = CPUJOB_HPC_CORES=128 CPUJOB_HPC_MEM=128g CPUJOB_HPC_JOBS=20
ALLJOB_HPCPARAMS = ${DATAJOB_HPCPARAMS}
DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=8g
DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=128 CPUJOB_HPC_JOBS=20 CPUJOB_HPC_MEM=128g
CSCPROJECT = project_2002688
@ -34,6 +34,13 @@ endif
## default local scratch if not set otherwise
LOCAL_SCRATCH ?= /scratch/${CSCPROJECT}
# set tmpdir
ifdef LOCAL_SCRATCH
TMPDIR := ${LOCAL_SCRATCH}
else
TMPDIR := /scratch/${CSCPROJECT}
endif
## select queue depending on the number of GPUs allocated
ifeq (${NR_GPUS},1)

19
lib/env/puhti.mk vendored
View File

@ -4,8 +4,9 @@
#
DATAJOB_HPCPARAMS = CPUJOB_HPC_CORES=4 CPUJOB_HPC_MEM=64g CPUJOB_HPC_JOBS=2 CPUJOB_HPC_DISK=500
ALLJOB_HPCPARAMS = ${DATAJOB_HPCPARAMS}
DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=8g CPUJOB_HPC_DISK=500
DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=4 CPUJOB_HPC_JOBS=2 CPUJOB_HPC_MEM=64g CPUJOB_HPC_DISK=500
CSCPROJECT = project_2002688
WORKHOME = ${shell realpath ${PWD}/work}
@ -27,9 +28,19 @@ ifneq (${wildcard /projappl/project_2001194/bin},)
endif
# set LOCAL_SCRATCH to nvme disk if it exists
ifdef SLURM_JOBID
ifneq ($(wildcard /run/nvme/job_${SLURM_JOBID}/tmp),)
LOCAL_SCRATCH = /run/nvme/job_${SLURM_JOBID}/tmp
endif
endif
LOCAL_SCRATCH ?= /scratch/${CSCPROJECT}
# set tmpdir
ifdef LOCAL_SCRATCH
TMPDIR := ${LOCAL_SCRATCH}
else
TMPDIR := /scratch/${CSCPROJECT}
endif
CPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 python-env

42
lib/projects/elg.mk Normal file
View File

@ -0,0 +1,42 @@
# -*-makefile-*-
## 23 official EU languages:
#
# English
# German
# Swedish
# Finnish
# Dutch
# Danish
# Spanish
# Czech
# French
# Polish
# Portuguese
# Latvian
# Romanian
# Estonian
# Bulgarian
# Greek, Modern (1453-)
# Slovak
# Italian
# Maltese
# Slovenian
# Croatian
# Lithuanian
# Irish
# Hungarian
## ISO-639-3 codes for the 23 official EU languages listed above
ELG_EU_LANGIDS = eng deu swe fin nld dan spa ces fra pol por lav ron est bul ell slk ita mlt slv hrv lit gle hun
## selected target languages / language groups for the eng2all training jobs
## (group codes such as gmq, zle, zls, zlw, cel cover several related languages)
ELG_EU_SELECTED = gmq nld spa fra pol por lav ron est bul ell ita mlt slv hbs lit cel hun glg eus zle zls zlw tur ara heb sqi deu fin
## each quoted group is trained as one multilingual-target model
ELG_EU_SELECTED_MULTILANG = "ces slk" "cat oci" "fry ltz nds afr"
## train English -> X models for all selected ELG targets:
## - each quoted set in ELG_EU_SELECTED_MULTILANG becomes one model with
##   multiple target languages (submitted via tatoeba-job-bt)
## - each entry in ELG_EU_SELECTED gets its own eng2<lang> training job
## declared .PHONY: this is a command, not a file target
.PHONY: elg-eng2all
elg-eng2all:
	for l in ${ELG_EU_SELECTED_MULTILANG}; do \
	  ${MAKE} MODELTYPE=transformer-big SRCLANGS=eng TRGLANGS="$$l" tatoeba-job-bt; \
	done
	for l in ${ELG_EU_SELECTED}; do \
	  ${MAKE} MODELTYPE=transformer-big tatoeba-eng2$${l}-trainjob-bt; \
	done

View File

@ -159,6 +159,16 @@ endif
# create slurm jobs
#------------------------------------------------------------------------
## copy different HPC params for jobs that need to wordalign data or not
ifeq ($(findstring align,${MODELTYPE}),)
DATAJOB_HPCPARAMS = ${DATA_ALIGN_HPCPARAMS}
ALLJOB_HPCPARAMS = ${DATA_ALIGN_HPCPARAMS} ${TRAINJOB_HPCPARAMS}
else
DATAJOB_HPCPARAMS = ${DATA_PREPARE_HPCPARAMS}
ALLJOB_HPCPARAMS = ${DATA_PREPARE_HPCPARAMS} ${TRAINJOB_HPCPARAMS}
endif
# all-job:
# - check whether data files exist
# - if not: create a CPU job that makes the data and starts a training job after that

View File

@ -26,24 +26,31 @@ ${RELEASED_TATOEBA_DATA_FILE}:
## fetch data for all language combinations
## TODO: should we check whether we are supposed to skip some language pairs?
## - only if they don't exist already
## - skip certain language pairs (if specified in SKIP_LANGPAIRS)
## - skip same language pairs (if SKIP_SAME_LANG is set to 1)
## (need to convert to macro-languages to match Tatoeba TC releases!)
.PHONY: fetch-tatoeba-datasets fetch-datasets
fetch-datasets fetch-tatoeba-datasets:
-for s in ${sort ${SRCLANGS}}; do \
for t in ${sort ${TRGLANGS}}; do \
-for s in ${MACRO_SRCLANGS}; do \
for t in ${MACRO_TRGLANGS}; do \
if [ `echo "$$s-$$t $$t-$$s" | egrep '${SKIP_LANGPAIRS}' | wc -l` -gt 0 ]; then \
echo "!!!!!!!!!!! skip language pair $$s-$$t !!!!!!!!!!!!!!!!"; \
else \
if [ "$$s" \< "$$t" ]; then \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
${TATOEBA_DATA}/${TATOEBA_TRAINSET}.$$s-$$t.clean.$$s.gz; \
if [ ! -e ${TATOEBA_DATA}/${TATOEBA_TRAINSET}.$$s-$$t.clean.$$s.gz ]; then \
${MAKE} SRCLANGS=$$s TRGLANGS=$$t \
${TATOEBA_DATA}/${TATOEBA_TRAINSET}.$$s-$$t.clean.$$s.gz; \
fi \
else \
if [ "${SKIP_SAME_LANG}" == "1" ] && [ "$$s" == "$$t" ]; then \
echo "!!!!!!!!!!! skip language pair $$s-$$t !!!!!!!!!!!!!!!!"; \
else \
${MAKE} SRCLANGS=$$t TRGLANGS=$$s \
${TATOEBA_DATA}/${TATOEBA_TRAINSET}.$$t-$$s.clean.$$t.gz; \
if [ ! -e ${TATOEBA_DATA}/${TATOEBA_TRAINSET}.$$t-$$s.clean.$$t.gz ]; then \
${MAKE} SRCLANGS=$$t TRGLANGS=$$s \
${TATOEBA_DATA}/${TATOEBA_TRAINSET}.$$t-$$s.clean.$$t.gz; \
fi \
fi \
fi \
fi \
@ -210,26 +217,38 @@ TATOEBADATA = data/release/${TATOEBA_VERSION}/${LANGPAIR}
## sub-language pairs from the collection
## TODO: this creates empty files for languages that don't have released data sets
## --> should we rather skip those somehow? (without breaking anything)
ifneq ($(filter ${LANGPAIR},${TATOEBA_LANGPAIRS}),${LANGPAIR})
%/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz:
${MAKE} $@.d/source.labels $@.d/target.labels
@if [ `cat $@.d/source.labels $@.d/target.labels | wc -w` -gt 1 ]; then \
@echo ".... no package released for ${LANGPAIR}!"
else
%/${TATOEBA_TRAINSET}.${LANGPAIR}.clean.${SRCEXT}.gz:
${MAKE} ${TMPWORKDIR}/$@.d/source.labels ${TMPWORKDIR}/$@.d/target.labels
@if [ `cat ${TMPWORKDIR}/$@.d/source.labels ${TMPWORKDIR}/$@.d/target.labels | wc -w` -gt 1 ]; then \
echo ".... found sublanguages in the data"; \
b="$@.d/${TATOEBADATA}"; \
for s in `cat $@.d/source.labels`; do \
for t in `cat $@.d/target.labels`; do \
b="${TMPWORKDIR}/$@.d/${TATOEBADATA}"; \
for s in `cat ${TMPWORKDIR}/$@.d/source.labels`; do \
for t in `cat ${TMPWORKDIR}/$@.d/target.labels`; do \
if [ "$$s" \< "$$t" ]; then \
echo ".... extract $$s-$$t data"; \
for d in dev test train; do \
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.src.gz) <(gzip -cd $$b/$$d.trg.gz) | \
grep -P "^$$s\t$$t\t" > $@.d/$$d; \
if [ -s $@.d/$$d ]; then \
cut -f1,2 $@.d/$$d | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz; \
cut -f3 $@.d/$$d | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.gz; \
cut -f4 $@.d/$$d | ${GZIP} -c > ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.gz; \
grep -P "^$$s\t$$t\t" > ${TMPWORKDIR}/$@.d/$$d; \
if [ -s ${TMPWORKDIR}/$@.d/$$d ]; then \
cut -f1,2 ${TMPWORKDIR}/$@.d/$$d | ${GZIP} -c \
> ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz; \
cut -f3 ${TMPWORKDIR}/$@.d/$$d | ${GZIP} -c \
> ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$s.gz; \
cut -f4 ${TMPWORKDIR}/$@.d/$$d | ${GZIP} -c \
> ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.$$s-$$t.clean.$$t.gz; \
fi \
done; \
if [ -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.id.gz ]; then \
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.domain.gz) | grep -P "^$$s\t$$t\t" | cut -f3 | \
paste <(gzip -cd $$b/$$d.id.gz) <(gzip -cd $$b/$$d.domain.gz) | \
grep -P "^$$s\t$$t\t" | cut -f3 | \
${GZIP} -c > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz; \
${ZCAT} ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domain.gz |\
sort -u > ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.$$s-$$t.clean.domains; \
@ -242,7 +261,7 @@ TATOEBADATA = data/release/${TATOEBA_VERSION}/${LANGPAIR}
fi
@if [ ! -e ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SRCEXT}.gz ]; then \
echo ".... move data files"; \
b="$@.d/${TATOEBADATA}"; \
b="${TMPWORKDIR}/$@.d/${TATOEBADATA}"; \
for d in dev test train; do \
mv $$b/$$d.src.gz ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.gz; \
mv $$b/$$d.trg.gz ${dir $@}Tatoeba-$$d-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.gz; \
@ -251,12 +270,15 @@ TATOEBADATA = data/release/${TATOEBA_VERSION}/${LANGPAIR}
${ZCAT} $$b/train.domain.gz | sort -u | tr "\n" ' ' | sed 's/ *$$//' \
> ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.domains; \
mv $$b/train.domain.gz ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.domain.gz; \
mv $@.d/source.labels ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.labels; \
mv $@.d/target.labels ${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.labels; \
mv ${TMPWORKDIR}/$@.d/source.labels \
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTSRCEXT}.labels; \
mv ${TMPWORKDIR}/$@.d/target.labels \
${dir $@}Tatoeba-train-${TATOEBA_VERSION}.${LANGPAIR}.clean.${SORTTRGEXT}.labels; \
fi
@echo ".... cleanup of temporary files"
@rm -fr $@.d
@rm -fr ${TMPWORKDIR}/$@.d
endif
## fetch data
## don't break if this fails!

View File

@ -129,6 +129,7 @@ include ${REPOHOME}lib/tasks/tatoeba/tune.mk
include ${REPOHOME}lib/tasks/tatoeba/misc.mk
include ${REPOHOME}lib/projects/distill.mk
include ${REPOHOME}lib/projects/elg.mk
@ -137,15 +138,13 @@ include ${REPOHOME}lib/projects/distill.mk
## default goal: run the complete Tatoeba pipeline step by step
## (prepare data, train, evaluate, compare, and run the extra test sets;
##  each step is a recursive $(MAKE) call so it can also be run on its own)
.PHONY: all
all:
${MAKE} prepare
${MAKE} data-tatoeba
${MAKE} train-tatoeba
${MAKE} eval-tatoeba
${MAKE} compare-tatoeba
${MAKE} eval-testsets-tatoeba
## debugging helper: print the list of released Tatoeba language pairs
## (wrapped in '--' markers so an empty value is visible)
## declared .PHONY: this is a command, not a file target
.PHONY: ttt
ttt:
	echo "--${TATOEBA_LANGPAIRS}--"
## start unidirectional training job
## - make data first, then submit a job
.PHONY: tatoeba-job job slurmjob
@ -177,12 +176,19 @@ endif
${@:-tatoeba=}
## prepare data (config, train.dev.test data, labels)
## prepare data (fetch data and extract language labels)
## prepare everything needed before training:
## fetch the released data sets, extract language label files,
## and write the local model configuration
## NOTE(review): this calls 'local-config' while prepare-and-data calls
## 'local-config-tatoeba' — confirm both targets exist and the difference
## is intentional
.PHONY: prepare tatoeba-prepare
prepare tatoeba-prepare: ${TATOEBA_LANGIDS_TRAINONLY}
${MAKE} fetch-datasets
${MAKE} langlabel-files
${MAKE} local-config
## prepare and make all data files (dev/test/train)
## like 'prepare' but additionally builds all data files (dev/test/train)
## via the data-tatoeba target
.PHONY: prepare-and-data tatoeba-prepare-and-data
prepare-and-data tatoeba-prepare-and-data: ${TATOEBA_LANGIDS_TRAINONLY}
${MAKE} fetch-datasets
${MAKE} langlabel-files
${MAKE} local-config-tatoeba
${MAKE} data-tatoeba
@ -256,9 +262,13 @@ MIN_TRGLANGS ?= 1
MAX_SRCLANGS ?= 7000
MAX_TRGLANGS ?= 7000
find-langgroup = $(filter ${OPUS_LANGS3},\
$(sort ${shell langgroup $(1) | xargs iso639 -m -n} ${1} ${2}))
# find-langgroup = $(filter ${OPUS_LANGS3},\
# $(sort ${shell langgroup $(1) | xargs iso639 -m -n} ${1} ${2}))
# find-langgroup = $(filter ${OPUS_LANGS3},$(sort ${shell langgroup $(1)} ${1} ${2}))
# find-langgroup = $(sort ${shell langgroup $(1)} ${1} ${2})
find-langgroup = $(filter $(sort ${shell langgroup $(1) | xargs iso639 -m -n} ${1} ${2}),${TATOEBA_LANGS})
find-srclanggroup = $(call find-langgroup,$(firstword ${subst -, ,${subst 2, ,${1}}}),${2})
find-trglanggroup = $(call find-langgroup,$(lastword ${subst -, ,${subst 2, ,${1}}}),${2})
@ -296,7 +306,7 @@ tatoeba-%-data:
SRCLANGS="$$S" TRGLANGS="$$T" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
tatoeba-prepare-groupsize-limits; )
tatoeba-prepare-and-data-groupsize-limits; )
## train a tatoeba model
@ -311,11 +321,7 @@ tatoeba-%-train:
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
tatoeba-prepare-groupsize-limits; \
${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
all-tatoeba-groupsize-limits; )
all-groupsize-limits; )
## start the training job