# -*-makefile-*-
#
# mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git
# synced 2024-11-30 22:14:14 +03:00
## resume training on an existing model
##
## - if a best-perplexity checkpoint exists, restore it as the current
##   model file so marian continues from the best state seen so far
## - remove the .done marker so the train target actually re-runs
## - the short sleep keeps the restored model file's timestamp safely
##   older than anything touched afterwards

## declared phony so a stray file named "resume" can never shadow the target
.PHONY: resume

resume:
	if [ -e ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz ]; then \
	  cp ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz \
	     ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz; \
	fi
	sleep 1
	rm -f ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
	${MAKE} train
#------------------------------------------------------------------------
# vocabulary
#------------------------------------------------------------------------

## make vocabulary
## - no new vocabulary is created if the file already exists!
## - need to delete the file if you want to create a new one!
## build the shared marian vocabulary from the preprocessed training data
##
## NOTE: the ifeq/ifneq conditionals below are evaluated when the makefile
## is READ, not when the recipe runs
##
## - if ${MODEL_VOCAB} already exists: only warn and touch it
##   (the timestamp update prevents repeated rebuild attempts, but the
##    vocabulary is NOT regenerated even if the data changed)
## - else, if a latest-model vocabulary is known: reuse it by copying
## - else: create a fresh vocabulary with marian's vocab tool

${MODEL_VOCAB}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
		${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
ifeq ($(wildcard ${MODEL_VOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},)
	cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
else
	mkdir -p ${dir $@}
	## concatenate both sides of the training data and let marian-vocab
	## extract at most ${VOCABSIZE} entries
	${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
endif
else
	@echo "$@ already exists!"
	@echo "WARNING! No new vocabulary is created even though the data has changed!"
	@echo "WARNING! Delete the file if you want to start from scratch!"
	touch $@
endif
## get vocabulary from sentence piece model
## (take the first column of the *.vocab file that sentencepiece
##  writes next to the trained model file)
ifeq ($(USE_SPM_VOCAB),1)
${MODEL_SRCVOCAB}: ${SPMSRCMODEL}
	cut -f1 < $<.vocab > $@

${MODEL_TRGVOCAB}: ${SPMTRGMODEL}
	cut -f1 < $<.vocab > $@
endif
## print the copy command that would initialise ${MODEL_START} from the
## latest existing model (only echoed, NOT executed)
## - only shown if a latest model exists and no start model is in place yet
## - NOTE: the wildcard conditionals are evaluated at makefile read time
print-latest:
ifneq (${wildcard ${MODEL_LATEST}},)
ifeq (${wildcard ${MODEL_START}},)
	@echo "cp ${MODEL_LATEST} ${MODEL_START}"
endif
endif
#------------------------------------------------------------------------
# training MarianNMT models
#------------------------------------------------------------------------

## NEW: take away dependency on ${MODEL_VOCAB}
## (will be created by marian if it does not exist)

## possible model variants
## (one .done marker file per supported model type)
MARIAN_MODELS_DONE = ${WORKDIR}/${MODEL}.transformer.model${NR}.done \
		${WORKDIR}/${MODEL}.transformer-align.model${NR}.done

## training prerequisites: cleaned/preprocessed (and optionally
## size-limited) train data plus the preprocessed dev sets
## (the word-order here matters: the training recipe picks them out
##  with ${word N,$^})
MARIAN_TRAIN_PREREQS = ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
		${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
		${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}

## dependencies and extra parameters
## transformer-align additionally needs word alignments (${TRAIN_ALG})
## for guided-alignment training
ifeq (${MODELTYPE},transformer-align)
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG}
endif
## train transformer model
##
## builds the .done marker for a model variant by running marian-train on
## the preprocessed data; the marker is touched only after training ends
${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS}
	mkdir -p ${dir $@}
##--------------------------------------------------------------------
## in case we want to continue training from the latest existing model
## (check lib/config.mk to see how the latest model is found)
## NOTE: these conditionals are evaluated when the makefile is read,
##       not when the recipe runs
##--------------------------------------------------------------------
ifeq (${wildcard ${MODEL_START}},)
ifneq (${MODEL_LATEST},)
ifneq (${MODEL_LATEST_VOCAB},)
	cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
	cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
##--------------------------------------------------------------------
## make sure the vocabularies exist before marian starts
	${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
## the model file name is derived from the .done target ($@ with the
## .done suffix replaced by .npz); train/valid sets are picked out of
## the prerequisite list by position (see MARIAN_TRAIN_PREREQS)
	${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
		--model $(@:.done=.npz) \
		--type transformer \
		--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
		--max-length 500 \
		--vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		--mini-batch-fit \
		-w ${MARIAN_WORKSPACE} \
		--maxi-batch ${MARIAN_MAXI_BATCH} \
		--early-stopping ${MARIAN_EARLY_STOPPING} \
		--valid-freq ${MARIAN_VALID_FREQ} \
		--save-freq ${MARIAN_SAVE_FREQ} \
		--disp-freq ${MARIAN_DISP_FREQ} \
		--valid-sets ${word 3,$^} ${word 4,$^} \
		--valid-metrics perplexity \
		--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
		--beam-size 12 --normalize 1 --allow-unk \
		--log $(@:.model${NR}.done=.train${NR}.log) \
		--valid-log $(@:.model${NR}.done=.valid${NR}.log) \
		--enc-depth 6 --dec-depth 6 \
		--transformer-heads 8 \
		--transformer-postprocess-emb d \
		--transformer-postprocess dan \
		--transformer-dropout ${MARIAN_DROPOUT} \
		--label-smoothing 0.1 \
		--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
		--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
		--tied-embeddings-all \
		--overwrite --keep-best \
		--devices ${MARIAN_GPUS} \
		--sync-sgd --seed ${SEED} \
		--sqlite \
		--tempdir ${TMPDIR} \
		--exponential-smoothing
	touch $@