fixed multithreading issues with data recipe

This commit is contained in:
Joerg Tiedemann 2021-08-09 22:19:05 +03:00
parent fc8c2b33c0
commit 72e1bcb7ec
4 changed files with 300 additions and 282 deletions

View File

@ -326,8 +326,8 @@ train-and-eval-job:
#------------------------------------------------------------------------
# `data` is declared phony, so it names an action rather than a file.
# Two rule headers for `data` follow:
#   - the first lists the cleaned training files AND the dev files as
#     prerequisites;
#   - the second lists only the cleaned training files and then requests the
#     dev files, the test files and the model vocabularies through separate
#     ${MAKE} calls, one group per line.
# NOTE(review): this looks like the before/after pair of a single change whose
# markers were lost -- confirm which header is meant to remain.
# NOTE(review): the ${MAKE} lines are shown without a leading TAB; make only
# treats TAB-indented lines as recipe lines -- confirm against the real file.
.PHONY: data
data: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz \
${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
data: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz
${MAKE} ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
${MAKE} ${TEST_SRC}.${PRE_SRC} ${TEST_TRG}
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
ifeq ($(filter align,${subst -, ,${MODELTYPE}}),align)

View File

@ -315,7 +315,24 @@ PRE_TRG = ${SUBWORDS}${TRGBPESIZE:000=}k
## default name of the data set (and the model)
##-------------------------------------
# NOTE(review): DATASET is assigned with ?= twice below. ?= only assigns while
# the variable is still undefined, so once the first assignment has run the
# second one (DATASET ?= ${TRAINSET_NAME}) has no effect. Presumably these are
# the before/after lines of one change -- confirm which one should remain.
DATASET ?= opus
TRAINSET_NAME ?= opus
DATASET ?= ${TRAINSET_NAME}
## dev and test data come from one specific data set
## if we have a bilingual model
# "bilingual" here means: SRCLANGS holds exactly one word and TRGLANGS holds
# exactly one word. In that case the dev/test set names follow DEVSET/TESTSET.
ifeq (${words ${SRCLANGS}},1)
ifeq (${words ${TRGLANGS}},1)
DEVSET_NAME ?= ${DEVSET}
TESTSET_NAME ?= ${TESTSET}
endif
endif
## otherwise we give them a generic name
# These ?= defaults only take effect if the names are still undefined at this
# point, i.e. the bilingual branch above did not run and nothing else set them.
DEVSET_NAME ?= opus-dev
TESTSET_NAME ?= opus-test
## DATADIR = directory where the train/dev/test data are
## WORKDIR = directory used for training
@ -336,20 +353,6 @@ LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
# Training-data paths below ${TMPDIR}: the target side is keyed by
# ${LANGPAIRSTR}, the monolingual data by ${LANGSTR}; both are named after
# ${DATASET}.
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
## dev and test data come from one specific data set
## if we have a bilingual model
# NOTE(review): the DEVSET_NAME/TESTSET_NAME logic below also appears verbatim
# further up, right after the DATASET default. Presumably the block was moved
# and this is the old location -- confirm that only one copy is kept.
ifeq (${words ${SRCLANGS}},1)
ifeq (${words ${TRGLANGS}},1)
DEVSET_NAME ?= ${DEVSET}
TESTSET_NAME ?= ${TESTSET}
endif
endif
## otherwise we give them a generic name
DEVSET_NAME ?= opus-dev
TESTSET_NAME ?= opus-test
# Dev files are placed in ${WORKDIR}/val and named after DEVSET_NAME, so
# DEVSET_NAME has to be defined before these are expanded. ?= keeps any value
# that was already set.
DEV_SRC ?= ${WORKDIR}/val/${DEVSET_NAME}.src
DEV_TRG ?= ${WORKDIR}/val/${DEVSET_NAME}.trg

File diff suppressed because it is too large Load Diff

View File

@ -10,7 +10,7 @@
# Create three scratch files with mktemp and remember their paths.
# NOTE(review): going by the names they hold source text, target text and
# language labels -- confirm against the code that fills them.
tmpsrc=`mktemp`
tmptrg=`mktemp`
tmplang=`mktemp`
# Default column index; the test that follows switches it to 1 when the first
# argument is "kor" or "ko".
column=0
if [ "$1" == "kor" ] || [ "$1" == "ko" ]; then
column=1