skip word alignment if not necessary

This commit is contained in:
Joerg Tiedemann 2020-02-25 09:00:24 +02:00
parent d08fdd4040
commit 44182291dc
4 changed files with 11 additions and 4 deletions

View File

@@ -145,8 +145,10 @@ include Makefile.slurm
data: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz \
${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
${MAKE} ${TEST_SRC}.${PRE_SRC} ${TEST_TRG}
${MAKE} ${TRAIN_ALG}
${MAKE} ${MODEL_VOCAB}
ifeq (${MODELTYPE},transformer-align)
${MAKE} ${TRAIN_ALG}
endif
traindata: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz

View File

@@ -137,9 +137,9 @@ EXTRA_TRAINSET =
## TESTSET= DEVSET, TRAINSET = OPUS - WMT-News,DEVSET.TESTSET
TESTSET = ${DEVSET}
TRAINSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
TRAINSET = $(filter-out WMT-News MPC1 ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
TUNESET = OpenSubtitles
MONOSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})
MONOSET = $(filter-out WMT-News MPC1 ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})
## 1 = use remaining data from dev/test data for training
USE_REST_DEVDATA = 1

View File

@@ -85,7 +85,7 @@ ifeq (${wildcard ${BT_MODEL_START}},)
endif
endif
rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
${MAKE} DATASET=opus+bt \
${MAKE} DATASET=${DATASET}+bt \
CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
MARIAN_EARLY_STOPPING=15 \

View File

@@ -286,6 +286,11 @@ enru-yandex:
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=ru TRGLANGS=en EXTRA_TRAINSET=yandex \
WALLTIME=72 HPC_CORES=1 HPC_MEM=4g MARIAN_EARLY_STOPPING=15 train.submit-multigpu
enru-yandex-bt:
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex data-bt
${MAKE} DATASET=opus+yandex MODELTYPE=transformer SRCLANGS=en TRGLANGS=ru EXTRA_TRAINSET=yandex \
WALLTIME=72 HPC_CORES=1 HPC_MEM=8g MARIAN_WORKSPACE=12000 MARIAN_EARLY_STOPPING=15 train-bt.submit-multigpu
enit:
${MAKE} SRCLANGS=en TRGLANGS=it traindata-spm