# -*-makefile-*-
#
# recipes for specific tasks
#

include ${REPOHOME}lib/data.mk
include ${REPOHOME}lib/train.mk
include ${REPOHOME}lib/test.mk
include ${REPOHOME}lib/quantize.mk

include ${REPOHOME}lib/slurm.mk
include ${REPOHOME}lib/generic.mk
include ${REPOHOME}lib/misc.mk

include ${REPOHOME}lib/allas.mk
include ${REPOHOME}lib/dist.mk


#------------------------------------------------------------------------
# create a model-specific config file
#------------------------------------------------------------------------

.PHONY: config local-config
config local-config: ${WORKDIR}/${MODELCONFIG}
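
## example invocation (hypothetical language pair; SRCLANGS/TRGLANGS are the
## usual language-selection variables from the included lib files):
##
##   make SRCLANGS=fi TRGLANGS=en config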


#------------------------------------------------------------------------
# make various data sets (and word alignment)
#------------------------------------------------------------------------

.PHONY: data
data:
	@${MAKE} rawdata
	@${MAKE} local-config
	@${MAKE} traindata
	@${MAKE} devdata
	@${MAKE} testdata
	@${MAKE} vocab
ifeq ($(filter align,${subst -, ,${MODELTYPE}}),align)
	@${MAKE} wordalign
endif

traindata: ${TRAINDATA_SRC} ${TRAINDATA_TRG}
testdata: ${TESTDATA_SRC} ${TESTDATA_TRG}
devdata: ${DEVDATA_SRC} ${DEVDATA_TRG}
devdata-raw: ${DEV_SRC} ${DEV_TRG}

wordalign: ${TRAIN_ALG}
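
## example: with a hypothetical MODELTYPE=transformer-align the filter above
## matches "align", so the data target also builds word alignments:
##
##   make MODELTYPE=transformer-align data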


## just report whether all necessary data sets exist
## --> useful for the data-and-train-job recipe, which
##     decides whether to start a CPU job for creating
##     data first before starting a GPU job for training
data-done:
	if [ -e ${TESTDATA_SRC} ]; then \
	  if [ -e ${TESTDATA_TRG} ]; then \
	    if [ -e ${DEVDATA_SRC} ]; then \
	      if [ -e ${DEVDATA_TRG} ]; then \
	        if [ -e ${TRAINDATA_SRC} ]; then \
	          if [ -e ${TRAINDATA_TRG} ]; then \
	            if [ "$(filter align,${subst -, ,${MODELTYPE}})" = "align" ]; then \
	              if [ -e ${TRAIN_ALG} ]; then \
	                echo "all data sets exist"; \
	              fi \
	            else \
	              echo "all data sets exist"; \
	            fi \
	          fi \
	        fi \
	      fi \
	    fi \
	  fi \
	fi
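
## example consumer (see the all-job recipe below): data preparation can be
## skipped when the check above prints the success message, e.g.
##
##   if [ "`${MAKE} -s data-done | grep 'data sets'`" = "all data sets exist" ]; then ... fi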

data-needed:
	@echo ${TRAINDATA_SRC} ${TRAINDATA_TRG}
	@echo ${DEVDATA_SRC} ${DEVDATA_TRG}
	@echo ${TESTDATA_SRC} ${TESTDATA_TRG}
	@echo ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
ifeq ($(filter align,${subst -, ,${MODELTYPE}}),align)
	@echo ${TRAIN_ALG}
endif


#------------------------------------------------------------------------
# train, translate and evaluate
#------------------------------------------------------------------------

## define how many repetitions of slurm jobs we
## can submit in case a job times out or breaks
## SLURM_REPEAT     = current iteration
## SLURM_MAX_REPEAT = maximum number of iterations we allow

SLURM_REPEAT ?= 0
SLURM_MAX_REPEAT ?= 10
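
## example (hypothetical limit): allow up to 20 automatic resubmissions of a
## long training run by overriding the default on the command line:
##
##   make SLURM_MAX_REPEAT=20 train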


# train the model - if this is a slurm job (i.e. SLURM_JOBID is set):
# - submit another one that continues training in case the current one breaks
# - only continue a certain number of times to avoid infinite loops
train:
ifdef SLURM_JOBID
	if [ ${SLURM_REPEAT} -lt ${SLURM_MAX_REPEAT} ]; then \
	  echo "submit job that continues to train in case the current one breaks or times out"; \
	  echo "current iteration: ${SLURM_REPEAT}"; \
	  ${MAKE} SLURM_REPEAT=$$(( ${SLURM_REPEAT} + 1 )) \
		SBATCH_ARGS="-d afternotok:${SLURM_JOBID}" $@.submit${GPUJOB_SUBMIT}; \
	else \
	  echo "reached maximum number of repeated slurm jobs: ${SLURM_REPEAT}"; \
	fi
endif
	${MAKE} ${MODEL_DONE}
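
## note: "-d afternotok:<jobid>" is a slurm dependency that lets the follow-up
## job start only if the current job fails or times out, so a successful run
## does not trigger a pointless resubmission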

vocab: ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
translate: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
eval: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.eval
compare: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare

## ensemble of models (assumes they can be found in subdirs of the WORKDIR)
translate-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}
eval-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}.eval


## combined tasks:
## train and evaluate
train-and-eval:
ifdef SLURM_JOBID
	if [ ${SLURM_REPEAT} -lt ${SLURM_MAX_REPEAT} ]; then \
	  echo "submit job that continues to train in case the current one breaks or times out"; \
	  echo "current iteration: ${SLURM_REPEAT}"; \
	  ${MAKE} SBATCH_ARGS="-d afternotok:${SLURM_JOBID}" \
		SLURM_REPEAT=$$(( ${SLURM_REPEAT} + 1 )) $@.submit${GPUJOB_SUBMIT}; \
	else \
	  echo "reached maximum number of repeated slurm jobs: ${SLURM_REPEAT}"; \
	fi
endif
	${MAKE} ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
	${MAKE} ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
	${MAKE} eval-testsets

## train model and start back-translation jobs once the model is ready
## (requires creating a dist package)
## TODO: does this still work?
train-and-start-bt-jobs:
ifdef SLURM_JOBID
	if [ ${SLURM_REPEAT} -lt ${SLURM_MAX_REPEAT} ]; then \
	  echo "submit job that continues to train in case the current one breaks or times out"; \
	  echo "current iteration: ${SLURM_REPEAT}"; \
	  ${MAKE} SBATCH_ARGS="-d afternotok:${SLURM_JOBID}" \
		SLURM_REPEAT=$$(( ${SLURM_REPEAT} + 1 )) $@.submit${GPUJOB_SUBMIT}; \
	else \
	  echo "reached maximum number of repeated slurm jobs: ${SLURM_REPEAT}"; \
	fi
endif
	${MAKE} ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
	${MAKE} ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
	${MAKE} local-dist
	${MAKE} -C backtranslate MODELHOME=${MODELDIR} translate-all-wikis-jobs


#------------------------------------------------------------------------
# create slurm jobs
#------------------------------------------------------------------------

## pick different HPC params depending on whether the job needs to word-align data
ifeq ($(findstring align,${MODELTYPE}),align)
  DATAJOB_HPCPARAMS = ${DATA_ALIGN_HPCPARAMS}
  ALLJOB_HPCPARAMS = ${DATA_ALIGN_HPCPARAMS} ${TRAINJOB_HPCPARAMS}
else
  DATAJOB_HPCPARAMS = ${DATA_PREPARE_HPCPARAMS}
  ALLJOB_HPCPARAMS = ${DATA_PREPARE_HPCPARAMS} ${TRAINJOB_HPCPARAMS}
endif

# all-job:
# - check whether data files exist
# - if not: create a CPU job that makes the data and starts a training job after that
# - if yes: create the GPU training job (after checking that data sets are alright)

.PHONY: all-job
all-job:
	@if [ "`${MAKE} -s data-done 2>/dev/null | grep 'data sets'`" = "all data sets exist" ]; then \
	  echo "........ all data files exist already!"; \
	  echo "........ submit a job for training the model!"; \
	  ${MAKE} ${TRAINJOB_HPCPARAMS} train-and-eval.submit${GPUJOB_SUBMIT}; \
	else \
	  echo "........ submit a CPU job for making data files first!"; \
	  echo "........ submit training job later!"; \
	  ${MAKE} ${ALLJOB_HPCPARAMS} data-and-train-job.submitcpu; \
	fi


# data-and-train job:
# - prepare data sets
# - create/submit the training job
#   if this is inside a slurm job:
#   --> immediately submit the training job
#       with a dependency on the current one
#   --> avoids waiting until the training job can be queued

.PHONY: data-and-train-job
data-and-train-job:
ifdef SLURM_JOBID
	echo "submit training job after data creation job (${SLURM_JOBID})"
	${MAKE} ${TRAINJOB_HPCPARAMS} SBATCH_ARGS="-d afterok:${SLURM_JOBID}" train-and-eval.submit${GPUJOB_SUBMIT}
	${MAKE} data
else
	${MAKE} data
	${MAKE} ${TRAINJOB_HPCPARAMS} train-and-eval.submit${GPUJOB_SUBMIT}
endif


# train-job:
# - create/submit a job for training only (no evaluation!)
.PHONY: train-job
train-job:
	${MAKE} ${TRAINJOB_HPCPARAMS} train.submit${GPUJOB_SUBMIT}

# train-and-eval-job:
# - create/submit a job for training (+ evaluation)
.PHONY: train-and-eval-job
train-and-eval-job:
	${MAKE} ${TRAINJOB_HPCPARAMS} train-and-eval.submit${GPUJOB_SUBMIT}


#---------------------------------------------------------------------
# run everything including backtranslation of wiki-data
#
## TODO: need to refresh backtranslate/index.html from time to time!
## ---> necessary for fetching the latest wikidump with the correct link
#---------------------------------------------------------------------

.PHONY: all-and-backtranslate
all-and-backtranslate: ${WORKDIR}/${MODELCONFIG}
	${MAKE} data
	${MAKE} train
	${MAKE} eval
	${MAKE} compare
	${MAKE} local-dist
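# MAX_SENTENCES below caps backtranslation at the size of the training data,
# up to at most 1M sentences (head -1000000 | wc -l = min(1M, #lines))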
	-for t in ${TRGLANGS}; do \
	  for s in ${SRCLANGS}; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} -C backtranslate \
		SRC=$$s TRG=$$t \
		MODELHOME=${MODELDIR} \
		MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
		all; \
	    fi \
	  done \
	done

.PHONY: all-and-backtranslate-allwikis
all-and-backtranslate-allwikis: ${WORKDIR}/${MODELCONFIG}
	${MAKE} data
	${MAKE} train
	${MAKE} eval
	${MAKE} compare
	${MAKE} local-dist
	-for t in ${TRGLANGS}; do \
	  for s in ${SRCLANGS}; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} -C backtranslate SRC=$$s TRG=$$t all-wikitext; \
	      ${MAKE} -C backtranslate \
		SRC=$$s TRG=$$t \
		MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
		MODELHOME=${MODELDIR} \
		translate-all-wikis; \
	    fi \
	  done \
	done

.PHONY: all-and-backtranslate-allwikiparts
all-and-backtranslate-allwikiparts: ${WORKDIR}/${MODELCONFIG}
	${MAKE} data
	${MAKE} train
	${MAKE} eval
	${MAKE} compare
	${MAKE} local-dist
	-for t in ${TRGLANGS}; do \
	  for s in ${SRCLANGS}; do \
	    if [ "$$s" != "$$t" ]; then \
	      ${MAKE} -C backtranslate SRC=$$s TRG=$$t all-wikitext; \
	      ${MAKE} -C backtranslate \
		SRC=$$s TRG=$$t \
		MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
		MODELHOME=${MODELDIR} \
		translate-all-wikiparts; \
	    fi \
	  done \
	done

## train a model with backtranslations of wikipedia data
## (1) train a model in the opposite direction and backtranslate wikipedia data
## (2) train a model with backtranslated data
.PHONY: all-with-bt
all-with-bt:
	${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate
	${MAKE} all-bt
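
## example (hypothetical pair): for an en-fi model this first trains fi-en,
## uses it to backtranslate Finnish wiki text into English, and then trains
## en-fi including the backtranslated pairs:
##
##   make SRCLANGS=en TRGLANGS=fi all-with-bt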

## train a model with backtranslations of ALL wikimedia wiki data
.PHONY: all-with-bt-all
all-with-bt-all:
	${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate-allwikis
	${MAKE} all-bt

## and now with all parts of all wikis
.PHONY: all-with-bt-allparts
all-with-bt-allparts:
	${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate-allwikiparts
	${MAKE} all-bt


## job1: submit jobs to create data, train models, backtranslate all, and train again

job1: ${WORKDIR}/${MODELCONFIG}
	${MAKE} HPC_MEM=12g HPC_CORES=4 job1-step1.submitcpu

job1-step1:
	${MAKE} data
	${MAKE} reverse-data
	${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" data
	-for t in ${TRGLANGS}; do \
	  ${MAKE} -C backtranslate SRC=${SRC} TRG=$$t all-wikitext; \
	done
	${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
		HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} job1-step2.submit${GPUJOB_SUBMIT}

job1-step2:
	${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
		MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
		all-and-backtranslate-allwikis
	${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
		HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} job1-step3.submit${GPUJOB_SUBMIT}

job1-step3:
	${MAKE} all-bt


print-info:
	@echo "model file:              ${MODEL_START}"
	@echo "source vocab:            ${MODEL_SRCVOCAB}"
	@echo "target vocab:            ${MODEL_TRGVOCAB}"
	@echo "final model file:        ${MODEL_FINAL}"
	@echo "latest compatible model: ${MODEL_LATEST}"
	ls -t ${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz \
	      ${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz \
	      ${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.best-perplexity.npz \
	      ${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.best-perplexity.npz