# -*-makefile-*-
#
# train and test Opus-MT models using MarianNMT
#
#--------------------------------------------------------------------
#
# make all
#
# make data ............... create training data
# make train .............. train NMT model
# make translate .......... translate test set
# make eval ............... evaluate
#
# make all-job ............ create config, data and submit training job
# make train-job .......... submit training job
#
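# For example, to run the individual steps for one language pair
# (the language IDs here are only illustrative):
#
# make SRCLANGS=en TRGLANGS=de data
# make SRCLANGS=en TRGLANGS=de train
# make SRCLANGS=en TRGLANGS=de translate
# make SRCLANGS=en TRGLANGS=de eval
#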
#--------------------------------------------------------------------
# general parameters / variables (see lib/config.mk)
#
# * most essential parameters (language IDs used in OPUS):
#
# SRCLANGS ................ set of source languages
# TRGLANGS ................ set of target languages
#
# * other important parameters (defaults are usually fine)
#
# MODELTYPE ............... transformer|transformer-align
# TRAINSET ................ set of corpora used for training (default = all of OPUS)
# TESTSET ................. test set corpus (default = subset of Tatoeba with some fallbacks)
# DEVSET .................. validation corpus (default = another subset of TESTSET)
#
# DEVSIZE ................. nr of sentences in validation data
# TESTSIZE ................ nr of sentences in test data
#
# TESTSMALLSIZE ........... reduced size for low-resource settings
# DEVSMALLSIZE ............ reduced size for low-resource settings
# DEVMINSIZE .............. minimum size for validation data
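#
# All of the above can be overridden on the command line, for
# example (the values are only illustrative):
#
# make SRCLANGS="da sv" TRGLANGS=fi MODELTYPE=transformer-align DEVSIZE=2500 data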
#--------------------------------------------------------------------
# lib/generic.mk
#
# There are implicit targets that cover certain frequent tasks.
# They typically modify certain settings and then call another
# target with those modifications. They are invoked by adding a
# suffix to the actual target that needs to be run. For example,
# adding -RL triggers right-to-left models:
#
# make train-RL
# make eval-RL
#
# Another example is to run a target over a number of models,
# for instance, to translate and evaluate with each of them:
#
# make eval-allmodels.submit
# make eval-allbilingual.submit # only bilingual models
# make eval-allmultilingual.submit # only multilingual models
#--------------------------------------------------------------------
# lib/slurm.mk
#
# Defines generic targets for submitting jobs. They work in the
# same way as the generic targets in lib/generic.mk but submit a
# job using SLURM sbatch. This only works if the SLURM parameters
# are correctly set. Check lib/env.mk, lib/config.mk and lib/slurm.mk.
#
# %.submit ........ job on GPU nodes (for train and translate)
# %.submitcpu ..... job on CPU nodes (for translate and eval)
#
# They can be combined with any other target, even with the generic
# extensions described above. For example, to submit a job that
# trains an en-ru right-to-left model with a 24-hour time limit, run:
#
# make WALLTIME=24 SRCLANGS=en TRGLANGS=ru train-RL.submit
#
# Other extensions can be added to modify the SLURM job, for example
# to submit the same job to run on multiple GPUs on one node:
#
# make WALLTIME=24 SRCLANGS=en TRGLANGS=ru train-RL.submit-multigpu
#
# Some targets submit jobs via SLURM themselves, for example the
# train-job target. This can be combined with starting a CPU job
# that creates the data sets and then submits the training job to
# GPU nodes once the training data are ready. For example, to
# submit a CPU job that runs the train-job target with 4 threads
# (using make -j 4) and allocates 4 CPU cores, run:
#
# make HPC_CORES=4 SRCLANGS=en TRGLANGS=ru train-job-RL.submitcpu
#
#--------------------------------------------------------------------
# lib/dist.mk
#
# Targets to create and upload packages of trained models
#
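#
# For example (target names may differ; see lib/dist.mk for the
# complete list):
#
# make SRCLANGS=en TRGLANGS=de dist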
#--------------------------------------------------------------------
# There are various special targets for specific and generic tasks.
# Look into the makefiles lib/generic.mk and lib/models/*.mk.
# Many of those targets can be further adjusted by setting certain
# variables. Some examples are given below, but all of this is
# subject to change.
#
#
# * submit job to train a model in one specific translation direction
# (make data on CPU and then start a job on a GPU node with 4 GPUs)
# make SRCLANGS=en TRGLANGS=de unidirectional.submitcpu
#
# * submit jobs to train a model in both translation directions
# (make data on CPU, reverse the data and start 2 jobs on GPU nodes with 4 GPUs each)
# make SRCLANGS=en TRGLANGS=de bilingual.submitcpu
#
# * same as bilingual but guesses some HPC settings based on the data size
# make SRCLANGS=en TRGLANGS=de bilingual-dynamic.submitcpu
#
# * submit jobs to train models between all OPUS languages and the PIVOT language in both directions using bilingual-dynamic
# make PIVOT=en allopus2pivot # run loop on command line
# make PIVOT=en allopus2pivot.submitcpu # submit the same as a CPU-based job
# make all2en.submitcpu # short form of the same
#
# * submit jobs for all combinations of OPUS languages (this is huge!)
# (only if there is no train.submit in the workdir of the language pair)
# make PIVOT=en allopus.submitcpu
#
# * submit a job to train a multilingual model with the same languages on both sides
# make LANGS="en de fr" multilingual.submitcpu
#
#--------------------------------------------------------------------
# Ensembles: One can train a number of models and ensemble them.
# NOTE: make sure that the data files and vocabularies exist before
# training the models. Otherwise, there could be a race condition
# when those jobs start simultaneously!
#
#
# make data
# make vocab
# make NR=1 train.submit
# make NR=2 train.submit
# make NR=3 train.submit
#
# make NR=1 eval.submit
# make NR=2 eval.submit
# make NR=3 eval.submit
#
# make eval-ensemble.submit
#
#--------------------------------------------------------------------
include lib/env.mk
include lib/config.mk
include lib/tasks.mk
include lib/projects.mk
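
# run the complete pipeline for one language pair, for example
# (the language pair is only illustrative):
#
# make SRCLANGS=en TRGLANGS=de all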
.PHONY: all
all:
	${MAKE} rawdata
	${MAKE} local-config
	${MAKE} data
	${MAKE} train
	${MAKE} eval
	${MAKE} compare
	${MAKE} eval-testsets