# -*-makefile-*-

#------------------------------------------------------------------------
# vocabulary files:
#  - for SentencePiece models: take vocabulary from the spm-model
#  - otherwise: create vocab from training data
#  - always re-use existing vocabulary files (never overwrite!)
#  - copy an existing vocab file if MODEL_LATEST_VOCAB exists
#    (this is for continuing training with other pre-trained models)
#------------------------------------------------------------------------


## extract vocabulary from sentence piece model
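## Background note (added for clarity): a SentencePiece ".vocab" file is
## tab-separated with one entry per line (the piece, then its score), so
## "cut -f1" below keeps only the token column. Purely illustrative sketch
## of such a file (tokens and scores are made up):
#
#   <unk>   0
#   <s>     0
#   </s>    0
#   ▁the    -3.27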
${WORKDIR}/${MODEL}.src.vocab: ${SUBWORD_SRC_MODEL}
	cut -f1 < $<.vocab > $@
ifeq (${USE_TARGET_LABELS},1)
	echo "${TARGET_LABELS}" | tr ' ' "\n" >> $@
endif

${WORKDIR}/${MODEL}.trg.vocab: ${SUBWORD_TRG_MODEL}
	cut -f1 < $<.vocab > $@

ifneq ($(findstring spm,${SUBWORDS}),)

## make vocabulary from the source and target language specific
## sentence piece models (concatenate and yamlify)

${WORKDIR}/${MODEL}.vocab.yml: ${WORKDIR}/${MODEL}.src.vocab ${WORKDIR}/${MODEL}.trg.vocab
ifeq ($(wildcard $@),)
ifneq ($(wildcard ${MODEL_LATEST_VOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},$@)
	cp ${MODEL_LATEST_VOCAB} $@
endif
else
	cat $^ | sort -u | ${REPOHOME}scripts/vocab2yaml.py > $@
endif
else
	@echo "$@ already exists! We will re-use it ..."
	touch $@
endif

else

## fallback: make vocabulary from the training data
## - no new vocabulary is created if the file already exists!
## - need to delete the file if you want to create a new one!

${WORKDIR}/${MODEL}.vocab.yml: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
		${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
ifeq ($(wildcard $@),)
ifneq ($(wildcard ${MODEL_LATEST_VOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},$@)
	cp ${MODEL_LATEST_VOCAB} $@
endif
else
	mkdir -p ${dir $@}
	${LOAD_ENV} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
endif
else
	@echo "$@ already exists!"
	@echo "WARNING! No new vocabulary is created even though the data has changed!"
	@echo "WARNING! Delete the file if you want to start from scratch!"
	touch $@
endif
endif
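## Note (added for clarity): either branch produces a Marian vocabulary file,
## i.e. a YAML mapping from token to integer id. Purely illustrative excerpt
## (tokens and ids are made up):
#
#   "</s>": 0
#   "<unk>": 1
#   "▁the": 2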


#------------------------------------------------------------------------
# training MarianNMT models
#  - different model types require different settings
#  - add word alignment to pre-requisites if necessary
#  - continue training from MODEL_LATEST (if it exists)
#  - initialise model with parameters from PRE_TRAINED_MODEL (if set)
#------------------------------------------------------------------------


## print the model that will be used to initialise training
## (the architectures need to be compatible!)

print-model-names:
	@echo "initial parameters from: ${PRE_TRAINED_MODEL}"
	@echo "       start with model: ${MODEL_LATEST}"
	@echo "         write model to: ${MODEL_START}"

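## usage example (added; hypothetical invocation, assuming the usual project
## variables are set elsewhere in the build):
#
#   make print-model-names MODELTYPE=transformer-base
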
## possible model variants

MARIAN_MODELS_DONE = ${patsubst %,${WORKDIR}/${MODEL}.%.model${NR}.done,${MODELTYPES}}

MARIAN_TRAIN_PREREQS = ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
		${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
		$(sort ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB})

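## Illustration (added; the MODELTYPES value is hypothetical): with
## MODELTYPES = "transformer transformer-align" the patsubst above expands to
#
#   ${WORKDIR}/${MODEL}.transformer.model${NR}.done
#   ${WORKDIR}/${MODEL}.transformer-align.model${NR}.done
#
## i.e. one ".done" marker target per model variant.
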
## define validation and early-stopping parameters
## as well as pre-requisites for training the model
## TODO: do we want to add valid-metrics "ce-mean-words" and "bleu-detok"?

ifndef SKIP_VALIDATION
MARIAN_TRAIN_PREREQS += ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
MARIAN_STOP_CRITERIA = --early-stopping ${MARIAN_EARLY_STOPPING} \
		--valid-freq ${MARIAN_VALID_FREQ} \
		--valid-sets ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG} \
		--valid-metrics perplexity \
		--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
		--valid-max-length 100 \
		--valid-log ${WORKDIR}/${MODEL}.${MODELTYPE}.valid${NR}.log \
		--beam-size 6 --normalize 1 --allow-unk
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
else
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz
endif

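## Note (added): SKIP_VALIDATION is only checked with ifndef, so defining it
## with any value on the make command line disables validation and early
## stopping, e.g. (hypothetical invocation, target name is a placeholder):
#
#   make SKIP_VALIDATION=1 <your-training-target>
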
## tie all embeddings if we have a common vocab
## for target and source language
## otherwise: only tie target embeddings
## TODO: if we use pre-defined tasks, then tied-embeddings-all is set to true
##       How can we unset it if it should not be used?

MARIAN_TIE_EMBEDDINGS = --tied-embeddings-all

ifeq ($(USE_SPM_VOCAB),1)
ifneq (${USE_JOINT_SUBWORD_MODEL},1)
MARIAN_TIE_EMBEDDINGS = --tied-embeddings
endif
endif

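## Clarification (added, based on Marian's documented options):
## --tied-embeddings-all shares one embedding matrix across source, target and
## output layer (and therefore needs a joint vocabulary), whereas
## --tied-embeddings only ties the target input and output embeddings.
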
# start weights with a pre-trained model

ifneq (${wildcard ${PRE_TRAINED_MODEL}},)
MARIAN_EXTRA += --pretrained-model ${PRE_TRAINED_MODEL}
endif

##------------------------------------------------
## transformer models (not using pre-defined tasks)
##
## dependencies and extra parameters
## for different models and guided alignment
##------------------------------------------------

## if substring '-align' is part of the MODELTYPE:
## add parameters and dependencies for guided alignment
ifneq ($(subst -align,,${MODELTYPE}),${MODELTYPE})
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG}
endif
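## Example (added): the substitution test above fires for any MODELTYPE that
## contains "-align", e.g. "transformer-align" or "transformer-tiny-align".
## In that case the word-alignment file ${TRAIN_ALG} becomes a training
## prerequisite and is passed to Marian via --guided-alignment.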

ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny)
MARIAN_ENC_DEPTH = 3
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 256
MARIAN_EXTRA += --transformer-decoder-autoreg rnn \
		--dec-cell ssru # --fp16
endif

## differences from the student model in bergamot (tiny11):
# --transformer-dim-ffn 1536 --enc-depth 6 --transformer-ffn-activation relu
# 32000 vocab in total (tied source and target)
# --mini-batch-fit -w 9000 --mini-batch 1000 --maxi-batch 1000 --devices $GPUS --sync-sgd --optimizer-delay 2 \
# --learn-rate 0.0003 --lr-report --lr-warmup 16000 --lr-decay-inv-sqrt 32000 \
# --cost-type ce-mean-words \
# --optimizer-params 0.9 0.98 1e-09 --clip-norm 0

ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny11)
MARIAN_ENC_DEPTH = 6
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 256
MARIAN_CLIP_NORM = 0
MARIAN_EXTRA += --transformer-decoder-autoreg rnn \
		--dec-cell ssru --optimizer-delay 2 \
		--transformer-dim-ffn 1536
#		--dim-vocabs ${SUBWORD_SRCVOCAB_SIZE} ${SUBWORD_TRGVOCAB_SIZE}
#		--fp16
endif


ifeq ($(subst -align,,${MODELTYPE}),transformer-small)
MARIAN_ENC_DEPTH = 6
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 512
MARIAN_EXTRA += --transformer-decoder-autoreg rnn --dec-cell ssru
#		--fp16
endif

##------------------------------------------------
## transformer-base
## transformer-big
##
## look at task aliases:
## https://github.com/marian-nmt/marian-dev/blob/master/src/common/aliases.cpp
##------------------------------------------------

ifeq ($(subst -align,,${MODELTYPE}),transformer-base)
MARIAN_TRAINING_PARAMETER = --task transformer-base # --fp16
endif

ifeq ($(subst -align,,${MODELTYPE}),transformer-big)
MARIAN_TRAINING_PARAMETER = --task transformer-big \
		--optimizer-delay 2 # --fp16
GPUJOB_HPC_MEM = 16g
endif

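## Rough guide (added; check aliases.cpp linked above for the authoritative
## values of your Marian version): --task transformer-base roughly corresponds
## to a 6+6-layer transformer with 512-dimensional embeddings, while
## --task transformer-big uses larger embedding/FFN dimensions, which is why
## the big variant also gets --optimizer-delay 2 and more GPU memory here.
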
##------------------------------------------------
## set training parameters
## (unless they are already set above)
##------------------------------------------------

MARIAN_TRAINING_PARAMETER ?= \
	--type transformer \
	--max-length ${MARIAN_MAX_LENGTH} \
	--maxi-batch ${MARIAN_MAXI_BATCH} \
	--mini-batch-fit \
	--max-length-factor 3 \
	--enc-depth ${MARIAN_ENC_DEPTH} \
	--dec-depth ${MARIAN_DEC_DEPTH} \
	--dim-emb ${MARIAN_DIM_EMB} \
	${MARIAN_TIE_EMBEDDINGS} \
	--transformer-heads ${MARIAN_ATT_HEADS} \
	--transformer-dropout ${MARIAN_DROPOUT} \
	--transformer-postprocess-emb d \
	--transformer-postprocess dan \
	--label-smoothing 0.1 \
	--learn-rate 0.0003 \
	--lr-warmup 16000 \
	--lr-decay-inv-sqrt 16000 \
	--lr-report \
	--optimizer-params 0.9 0.98 1e-09 \
	--clip-norm ${MARIAN_CLIP_NORM} \
	--sync-sgd \
	--exponential-smoothing

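## Note (added): because of the "?=" assignment, this default block only takes
## effect if MARIAN_TRAINING_PARAMETER has not been set already (e.g. by the
## --task presets above). It can also be overridden from the command line,
## e.g. (hypothetical invocation, target name is a placeholder):
#
#   make <your-training-target> MARIAN_TRAINING_PARAMETER="--task transformer-base"
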
## TODO: --fp16 seems to have changed from previous versions:
## --> cannot continue training with a newer version
# old: --precision float16 float32 float32 --cost-scaling 7 2000 2 0.05 10 1
# new: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f
#
## --> leave it out for the time being?
## --> or: only add it if we don't continue training with existing models?
##     (it seems that it can take the info from the internal config info)


##------------------------------------------------
## finally: recipe for training the model
##------------------------------------------------

${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS}
	mkdir -p ${dir $@}
##--------------------------------------------------------------------
## in case we want to continue training from the latest existing model
## (check lib/config.mk to see how the latest model is found)
##--------------------------------------------------------------------
ifeq (${wildcard ${MODEL_START}},)
ifneq (${wildcard ${MODEL_LATEST}},)
ifneq (${MODEL_LATEST},${MODEL_START})
	cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
##--------------------------------------------------------------------
## remove the yaml parameter file to avoid incompatibilities
## TODO: do we need this?
##--------------------------------------------------------------------
	rm -f ${@:.done=.yml}
##--------------------------------------------------------------------
## TODO: LOAD_ENV - do we need to do that each time we call marian?
##       Shouldn't that rather be the standard environment that we
##       load anyway before calling make? It is already set in the
##       SLURM scripts ...
##--------------------------------------------------------------------
	${LOAD_ENV} && ${MARIAN_TRAIN} \
		${MARIAN_TRAINING_PARAMETER} \
		${MARIAN_EXTRA} \
		${MARIAN_STOP_CRITERIA} \
		${MARIAN_DATA_STORAGE} \
		--workspace ${MARIAN_WORKSPACE} \
		--model $(@:.done=.npz) \
		--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
		--vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		--save-freq ${MARIAN_SAVE_FREQ} \
		--disp-freq ${MARIAN_DISP_FREQ} \
		--log $(@:.model${NR}.done=.train${NR}.log) \
		--devices ${MARIAN_GPUS} \
		--seed ${SEED} \
		--tempdir ${TMPDIR} \
		--shuffle ${MARIAN_SHUFFLE} \
		--sharding ${MARIAN_SHARDING} \
		--overwrite \
		--keep-best
	touch $@

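## Usage sketch (added; the actual path depends on WORKDIR, MODEL and NR as
## configured elsewhere): each model variant is trained by asking for its
## ".done" marker, e.g. something along the lines of
#
#   make ${WORKDIR}/${MODEL}.transformer-align.model${NR}.done
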


# extract lexical links
# --> required for shortlists
# NOTE: requires that extract_lex is installed
# NOTE: requires word alignment (TRAIN_ALG)

.PHONY: lex-s2t lex-t2s
lex-s2t: ${TRAIN_S2T}
lex-t2s: ${TRAIN_T2S}
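## Usage sketch (added): build both lexical translation tables via the phony
## targets defined above, e.g.
#
#   make lex-s2t lex-t2s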

${TRAIN_S2T}: ${TRAIN_ALG} ${TRAINDATA_SRC} ${TRAINDATA_TRG}
	mkdir -p ${LOCAL_TRAIN}.algtmp
	${GZCAT} $< > ${LOCAL_TRAIN}.algtmp/corpus.aln
	${GZCAT} ${word 2,$^} > ${LOCAL_TRAIN}.algtmp/corpus.src
	${GZCAT} ${word 3,$^} > ${LOCAL_TRAIN}.algtmp/corpus.trg
	${EXTRACT_LEX} \
		${LOCAL_TRAIN}.algtmp/corpus.trg \
		${LOCAL_TRAIN}.algtmp/corpus.src \
		${LOCAL_TRAIN}.algtmp/corpus.aln \
		${LOCAL_TRAIN}.algtmp/lex.s2t \
		${LOCAL_TRAIN}.algtmp/lex.t2s
	${GZIP} -c ${LOCAL_TRAIN}.algtmp/lex.s2t > ${TRAIN_S2T}
	${GZIP} -c ${LOCAL_TRAIN}.algtmp/lex.t2s > ${TRAIN_T2S}
	rm -f ${LOCAL_TRAIN}.algtmp/lex.s2t ${LOCAL_TRAIN}.algtmp/lex.t2s \
		${LOCAL_TRAIN}.algtmp/corpus.src ${LOCAL_TRAIN}.algtmp/corpus.trg \
		${LOCAL_TRAIN}.algtmp/corpus.aln
	rmdir ${LOCAL_TRAIN}.algtmp

## ${TRAIN_T2S} is written as a side effect of the ${TRAIN_S2T} recipe above
${TRAIN_T2S}: ${TRAIN_S2T}
	echo "done!"