# -*-makefile-*-

#------------------------------------------------------------------------
# vocabulary files:
#  - for SentencePiece models: take vocabulary from the spm-model
#  - otherwise: create vocab from training data
#  - always re-use existing vocabulary files (never overwrite!)
#  - copy an existing vocab file if MODEL_LATEST_VOCAB exists
#    (this is for continuing training with other pre-trained models)
#------------------------------------------------------------------------


## extract vocabulary from sentence piece model
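## Background note (added for clarity): a SentencePiece ".vocab" file is
## tab-separated with one entry per line (the piece, then its score), so
## "cut -f1" below keeps only the token column. Purely illustrative sketch
## of such a file (tokens and scores are made up):
#
#   <unk>   0
#   <s>     0
#   </s>    0
#   ▁the    -3.27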
${WORKDIR}/${MODEL}.src.vocab: ${SUBWORD_SRC_MODEL}
	cut -f1 < $<.vocab > $@
ifeq (${USE_TARGET_LABELS},1)
	echo "${TARGET_LABELS}" | tr ' ' "\n" >> $@
endif

${WORKDIR}/${MODEL}.trg.vocab: ${SUBWORD_TRG_MODEL}
	cut -f1 < $<.vocab > $@

ifneq ($(findstring spm,${SUBWORDS}),)

## make vocabulary from the source and target language specific
## sentence piece models (concatenate and yamlify)

${WORKDIR}/${MODEL}.vocab.yml: ${WORKDIR}/${MODEL}.src.vocab ${WORKDIR}/${MODEL}.trg.vocab
ifeq ($(wildcard $@),)
ifneq ($(wildcard ${MODEL_LATEST_VOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},$@)
	cp ${MODEL_LATEST_VOCAB} $@
endif
else
	cat $^ | sort -u | ${REPOHOME}scripts/vocab2yaml.py > $@
endif
else
	@echo "$@ already exists! We will re-use it ..."
	touch $@
endif

else

## fallback: make vocabulary from the training data
## - no new vocabulary is created if the file already exists!
## - need to delete the file if you want to create a new one!

${WORKDIR}/${MODEL}.vocab.yml: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
		${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
ifeq ($(wildcard $@),)
ifneq ($(wildcard ${MODEL_LATEST_VOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},$@)
	cp ${MODEL_LATEST_VOCAB} $@
endif
else
	mkdir -p ${dir $@}
	${LOAD_ENV} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
endif
else
	@echo "$@ already exists!"
	@echo "WARNING! No new vocabulary is created even though the data has changed!"
	@echo "WARNING! Delete the file if you want to start from scratch!"
	touch $@
endif
endif
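## Note (added for clarity): either branch produces a Marian vocabulary file,
## i.e. a YAML mapping from token to integer id. Purely illustrative excerpt
## (tokens and ids are made up):
#
#   "</s>": 0
#   "<unk>": 1
#   "▁the": 2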


#------------------------------------------------------------------------
# training MarianNMT models
#  - different model types require different settings
#  - add word alignment to pre-requisites if necessary
#  - continue training from MODEL_LATEST (if it exists)
#  - initialise model with parameters from PRE_TRAINED_MODEL (if set)
#------------------------------------------------------------------------


## print the model that will be used to initialise training
## (the architectures need to be compatible!)

print-model-names:
	@echo "initial parameters from: ${PRE_TRAINED_MODEL}"
	@echo "       start with model: ${MODEL_LATEST}"
	@echo "         write model to: ${MODEL_START}"

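## usage example (added; hypothetical invocation, assuming the usual project
## variables are set elsewhere in the build):
#
#   make print-model-names MODELTYPE=transformer-base
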
## possible model variants

MARIAN_MODELS_DONE = ${patsubst %,${WORKDIR}/${MODEL}.%.model${NR}.done,${MODELTYPES}}

MARIAN_TRAIN_PREREQS = ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
		${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
		$(sort ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB})

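## Illustration (added; the MODELTYPES value is hypothetical): with
## MODELTYPES = "transformer transformer-align" the patsubst above expands to
#
#   ${WORKDIR}/${MODEL}.transformer.model${NR}.done
#   ${WORKDIR}/${MODEL}.transformer-align.model${NR}.done
#
## i.e. one ".done" marker target per model variant.
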
## define validation and early-stopping parameters
## as well as pre-requisites for training the model
## TODO: do we want to add valid-metrics "ce-mean-words" and "bleu-detok"?

ifndef SKIP_VALIDATION
MARIAN_TRAIN_PREREQS += ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
MARIAN_STOP_CRITERIA = --early-stopping ${MARIAN_EARLY_STOPPING} \
		--valid-freq ${MARIAN_VALID_FREQ} \
		--valid-sets ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG} \
		--valid-metrics perplexity \
		--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
		--valid-max-length 100 \
		--valid-log ${WORKDIR}/${MODEL}.${MODELTYPE}.valid${NR}.log \
		--beam-size 6 --normalize 1 --allow-unk
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
else
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz
endif

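## Note (added): SKIP_VALIDATION is only checked with ifndef, so defining it
## with any value on the make command line disables validation and early
## stopping, e.g. (hypothetical invocation, target name is a placeholder):
#
#   make SKIP_VALIDATION=1 <your-training-target>
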
## tie all embeddings if we have a common vocab
## for target and source language
## otherwise: only tie target embeddings
## TODO: if we use pre-defined tasks, then tied-embeddings-all is set to true
##       How can we unset it if it should not be used?

MARIAN_TIE_EMBEDDINGS = --tied-embeddings-all

ifeq ($(USE_SPM_VOCAB),1)
ifneq (${USE_JOINT_SUBWORD_MODEL},1)
MARIAN_TIE_EMBEDDINGS = --tied-embeddings
endif
endif

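## Clarification (added, based on Marian's documented options):
## --tied-embeddings-all shares one embedding matrix across source, target and
## output layer (and therefore needs a joint vocabulary), whereas
## --tied-embeddings only ties the target input and output embeddings.
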
# start weights with a pre-trained model

ifneq (${wildcard ${PRE_TRAINED_MODEL}},)
MARIAN_EXTRA += --pretrained-model ${PRE_TRAINED_MODEL}
endif

##------------------------------------------------
## transformer models (not using pre-defined tasks)
##
## dependencies and extra parameters
## for different models and guided alignment
##------------------------------------------------

## if substring '-align' is part of the MODELTYPE:
## add parameters and dependencies for guided alignment
ifneq ($(subst -align,,${MODELTYPE}),${MODELTYPE})
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG}
endif
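## Example (added): the substitution test above fires for any MODELTYPE that
## contains "-align", e.g. "transformer-align" or "transformer-tiny-align".
## In that case the word-alignment file ${TRAIN_ALG} becomes a training
## prerequisite and is passed to Marian via --guided-alignment.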

ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny)
MARIAN_ENC_DEPTH = 3
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 256
MARIAN_EXTRA += --transformer-decoder-autoreg rnn \
		--dec-cell ssru # --fp16
endif

## differences from the student model in bergamot (tiny11):
# --transformer-dim-ffn 1536 --enc-depth 6 --transformer-ffn-activation relu
# 32000 vocab in total (tied source and target)
# --mini-batch-fit -w 9000 --mini-batch 1000 --maxi-batch 1000 --devices $GPUS --sync-sgd --optimizer-delay 2 \
# --learn-rate 0.0003 --lr-report --lr-warmup 16000 --lr-decay-inv-sqrt 32000 \
# --cost-type ce-mean-words \
# --optimizer-params 0.9 0.98 1e-09 --clip-norm 0

ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny11)
MARIAN_ENC_DEPTH = 6
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 256
MARIAN_CLIP_NORM = 0
MARIAN_EXTRA += --transformer-decoder-autoreg rnn \
		--dec-cell ssru --optimizer-delay 2 \
		--transformer-dim-ffn 1536
#		--dim-vocabs ${SUBWORD_SRCVOCAB_SIZE} ${SUBWORD_TRGVOCAB_SIZE}
#		--fp16
endif


ifeq ($(subst -align,,${MODELTYPE}),transformer-small)
MARIAN_ENC_DEPTH = 6
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 512
MARIAN_EXTRA += --transformer-decoder-autoreg rnn --dec-cell ssru
#		--fp16
endif

##------------------------------------------------
## transformer-base
## transformer-big
##
## look at task aliases:
## https://github.com/marian-nmt/marian-dev/blob/master/src/common/aliases.cpp
##------------------------------------------------

ifeq ($(subst -align,,${MODELTYPE}),transformer-base)
MARIAN_TRAINING_PARAMETER = --task transformer-base # --fp16
endif

ifeq ($(subst -align,,${MODELTYPE}),transformer-big)
MARIAN_TRAINING_PARAMETER = --task transformer-big \
		--optimizer-delay 2 # --fp16
GPUJOB_HPC_MEM = 16g
endif

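## Rough guide (added; check aliases.cpp linked above for the authoritative
## values of your Marian version): --task transformer-base roughly corresponds
## to a 6+6-layer transformer with 512-dimensional embeddings, while
## --task transformer-big uses larger embedding/FFN dimensions, which is why
## the big variant also gets --optimizer-delay 2 and more GPU memory here.
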
##------------------------------------------------
## set training parameters
## (unless they are already set above)
##------------------------------------------------

MARIAN_TRAINING_PARAMETER ?= \
	--type transformer \
	--max-length ${MARIAN_MAX_LENGTH} \
	--maxi-batch ${MARIAN_MAXI_BATCH} \
	--mini-batch-fit \
	--max-length-factor 3 \
	--enc-depth ${MARIAN_ENC_DEPTH} \
	--dec-depth ${MARIAN_DEC_DEPTH} \
	--dim-emb ${MARIAN_DIM_EMB} \
	${MARIAN_TIE_EMBEDDINGS} \
	--transformer-heads ${MARIAN_ATT_HEADS} \
	--transformer-dropout ${MARIAN_DROPOUT} \
	--transformer-postprocess-emb d \
	--transformer-postprocess dan \
	--label-smoothing 0.1 \
	--learn-rate 0.0003 \
	--lr-warmup 16000 \
	--lr-decay-inv-sqrt 16000 \
	--lr-report \
	--optimizer-params 0.9 0.98 1e-09 \
	--clip-norm ${MARIAN_CLIP_NORM} \
	--sync-sgd \
	--exponential-smoothing

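## Note (added): because of the "?=" assignment, this default block only takes
## effect if MARIAN_TRAINING_PARAMETER has not been set already (e.g. by the
## --task presets above). It can also be overridden from the command line,
## e.g. (hypothetical invocation, target name is a placeholder):
#
#   make <your-training-target> MARIAN_TRAINING_PARAMETER="--task transformer-base"
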
## TODO: --fp16 seems to have changed from previous versions:
## --> cannot continue training with a newer version
# old: --precision float16 float32 float32 --cost-scaling 7 2000 2 0.05 10 1
# new: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f
#
## --> leave it out for the time being?
## --> or: only add it if we don't continue training with existing models?
##     (it seems that it can take the info from the internal config info)


##------------------------------------------------
## finally: recipe for training the model
##------------------------------------------------

${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS}
	mkdir -p ${dir $@}
##--------------------------------------------------------------------
## in case we want to continue training from the latest existing model
## (check lib/config.mk to see how the latest model is found)
##--------------------------------------------------------------------
ifeq (${wildcard ${MODEL_START}},)
ifneq (${wildcard ${MODEL_LATEST}},)
ifneq (${MODEL_LATEST},${MODEL_START})
	cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
##--------------------------------------------------------------------
## remove the yaml parameter file to avoid incompatibilities
## TODO: do we need this?
##--------------------------------------------------------------------
	rm -f ${@:.done=.yml}
##--------------------------------------------------------------------
## TODO: LOAD_ENV - do we need to do that each time we call marian?
##       Shouldn't that rather be the standard environment that we
##       load anyway before calling make? It is already set in the
##       SLURM scripts ...
##--------------------------------------------------------------------
	${LOAD_ENV} && ${MARIAN_TRAIN} \
		${MARIAN_TRAINING_PARAMETER} \
		${MARIAN_EXTRA} \
		${MARIAN_STOP_CRITERIA} \
		${MARIAN_DATA_STORAGE} \
		--workspace ${MARIAN_WORKSPACE} \
		--model $(@:.done=.npz) \
		--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
		--vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		--save-freq ${MARIAN_SAVE_FREQ} \
		--disp-freq ${MARIAN_DISP_FREQ} \
		--log $(@:.model${NR}.done=.train${NR}.log) \
		--devices ${MARIAN_GPUS} \
		--seed ${SEED} \
		--tempdir ${TMPDIR} \
		--shuffle ${MARIAN_SHUFFLE} \
		--sharding ${MARIAN_SHARDING} \
		--overwrite \
		--keep-best
	touch $@

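## Usage sketch (added; the actual path depends on WORKDIR, MODEL and NR as
## configured elsewhere): each model variant is trained by asking for its
## ".done" marker, e.g. something along the lines of
#
#   make ${WORKDIR}/${MODEL}.transformer-align.model${NR}.done
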


# extract lexical links
# --> required for shortlists
# NOTE: requires that extract_lex is installed
# NOTE: requires word alignment (TRAIN_ALG)

.PHONY: lex-s2t lex-t2s
lex-s2t: ${TRAIN_S2T}
lex-t2s: ${TRAIN_T2S}
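## Usage sketch (added): build both lexical translation tables via the phony
## targets defined above, e.g.
#
#   make lex-s2t lex-t2s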

${TRAIN_S2T}: ${TRAIN_ALG} ${TRAINDATA_SRC} ${TRAINDATA_TRG}
	mkdir -p ${LOCAL_TRAIN}.algtmp
	${GZCAT} $< > ${LOCAL_TRAIN}.algtmp/corpus.aln
	${GZCAT} ${word 2,$^} > ${LOCAL_TRAIN}.algtmp/corpus.src
	${GZCAT} ${word 3,$^} > ${LOCAL_TRAIN}.algtmp/corpus.trg
	${EXTRACT_LEX} \
		${LOCAL_TRAIN}.algtmp/corpus.trg \
		${LOCAL_TRAIN}.algtmp/corpus.src \
		${LOCAL_TRAIN}.algtmp/corpus.aln \
		${LOCAL_TRAIN}.algtmp/lex.s2t \
		${LOCAL_TRAIN}.algtmp/lex.t2s
	${GZIP} -c ${LOCAL_TRAIN}.algtmp/lex.s2t > ${TRAIN_S2T}
	${GZIP} -c ${LOCAL_TRAIN}.algtmp/lex.t2s > ${TRAIN_T2S}
	rm -f ${LOCAL_TRAIN}.algtmp/lex.s2t ${LOCAL_TRAIN}.algtmp/lex.t2s \
		${LOCAL_TRAIN}.algtmp/corpus.src ${LOCAL_TRAIN}.algtmp/corpus.trg \
		${LOCAL_TRAIN}.algtmp/corpus.aln
	rmdir ${LOCAL_TRAIN}.algtmp

## ${TRAIN_T2S} is written as a side effect of the ${TRAIN_S2T} recipe above
${TRAIN_T2S}: ${TRAIN_S2T}
	echo "done!"