new makefile structure

This commit is contained in:
Joerg Tiedemann 2020-05-03 21:46:30 +03:00
parent 6b8e69269a
commit 5404f515aa
4 changed files with 134 additions and 114 deletions

Makefile

@@ -1,24 +1,101 @@
# -*-makefile-*-
#
# train Opus-MT models using MarianNMT
# train and test Opus-MT models using MarianNMT
#
#--------------------------------------------------------------------
#
# (1) train NMT model
#
# make train .............. train NMT model for current language pair
#
# (2) translate and evaluate
# make all
#
# make data ............... create training data
# make train .............. train NMT model
# make translate .......... translate test set
# make eval ............... evaluate
#
# make train-job .......... create data and submit training job
#
#--------------------------------------------------------------------
# lib/tasks.mk ........ various common and specific tasks/experiments
# lib/generic.mk ...... generic targets (suffixes to be added to other targets)
#
# general parameters / variables (see lib/config.mk)
#
# * most essential parameters (language IDs used in OPUS):
#
# SRCLANGS ................ set of source languages
# TRGLANGS ................ set of target languages
#
# * other important parameters (defaults are usually fine)
#
# MODELTYPE ............... transformer|transformer-align
# TRAINSET ................ set of corpora used for training (default = all of OPUS)
# TESTSET ................. test set corpus (default = subset of Tatoeba with some fallbacks)
# DEVSET .................. validation corpus (default = another subset of TESTSET)
#
# DEVSIZE ................. nr of sentences in validation data
# TESTSIZE ................ nr of sentences in test data
# HELDOUTSIZE ............. nr of sentences in heldout data from each train corpus
#
# TESTSMALLSIZE ........... reduced size for low-resource settings
# DEVSMALLSIZE ............ reduced size for low-resource settings
# DEVMINSIZE .............. minimum size for validation data
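#
# For example, to override the essential parameters on the command line
# (an illustrative invocation; the language combination is made up):
#
#   make SRCLANGS="da no sv" TRGLANGS=en MODELTYPE=transformer-align train
#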
#--------------------------------------------------------------------
# lib/generic.mk
#
# There are implicit targets that cover certain frequent tasks.
# They typically modify certain settings and then run another target
# with those modifications. They are used by adding a suffix to
# the target that actually needs to be run. For example,
# adding -RL triggers right-to-left models:
#
# make train-RL
# make eval-RL
#
# Another use case is running a task over a number of models,
# for example, translating and evaluating with all of them:
#
# make eval-allmodels.submit
# make eval-allbilingual.submit # only bilingual models
# make eval-allmultilingual.submit # only multilingual models
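#
# Roughly, such a suffix rule can be implemented as a pattern rule that
# strips its own suffix and recursively calls make with adjusted settings.
# A minimal sketch (the variable names here are placeholders, not the
# actual implementation in lib/generic.mk):
#
#   %-RL:
#         ${MAKE} MODEL_VARIANT=RL MARIAN_EXTRA="--right-left" ${@:-RL=}
#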
#--------------------------------------------------------------------
# lib/slurm.mk
#
# Defines generic targets for submitting jobs. They work in the
# same way as the generic targets in lib/generic.mk but submit a
# job using SLURM sbatch. This only works if the SLURM parameters
# are correctly set. Check lib/env.mk, lib/config.mk and lib/slurm.mk
#
# %.submit ........ job on GPU nodes (for train and translate)
# %.submitcpu ..... job on CPU nodes (for translate and eval)
#
# They can be combined with any other target, even with the generic
# extensions described above. For example, to submit a job that trains
# an en-ru right-to-left model for 24 hours you can run:
#
# make WALLTIME=24 SRCLANGS=en TRGLANGS=ru train-RL.submit
#
# Other extensions can be added to modify the SLURM job, for example
# to submit the same job to run on multiple GPUs on one node:
#
# make WALLTIME=24 SRCLANGS=en TRGLANGS=ru train-RL.submit-multigpu
#
# There are also targets that themselves submit jobs via SLURM, for example
# the train-job target. This can be combined with starting a CPU
# job to create the data sets, which will then submit the train
# job on GPUs once the training data are ready. For example, to
# submit a job with 4 threads (using make -j 4) that will run
# the train-job target on a CPU node allocating 4 CPU cores you
# can do:
#
# make HPC_CORES=4 SRCLANGS=en TRGLANGS=ru train-job-RL.submitcpu
#
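# Roughly, a %.submit rule wraps the recursive make call into a batch
# script and hands it to sbatch. A simplified sketch (not the exact
# recipe in lib/slurm.mk):
#
#   %.submit:
#         echo '#!/bin/bash' > $@.sh
#         echo '#SBATCH -t ${HPC_TIME} --mem ${HPC_MEM} -c ${HPC_CORES}' >> $@.sh
#         echo '${MAKE} ${MAKEOVERRIDES} ${@:.submit=}' >> $@.sh
#         sbatch $@.sh
#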
#--------------------------------------------------------------------
# lib/dist.mk
#
# Targets to create and upload packages of trained models
#
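# e.g. something like the following (see lib/dist.mk for the actual
# target names and upload destinations):
#
#   make dist
#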
#--------------------------------------------------------------------
# There are various special targets for specific and generic tasks.
# Look into lib/generic.mk and the makefiles in lib/models/*.mk.
# Many of those targets can be further adjusted by setting certain variables.
# Some examples are given below, but all of this is subject to change ...
#
# Examples from lib/tasks.mk:
#
# * submit job to train a model in one specific translation direction
# (make data on CPU and then start a job on a GPU node with 4 GPUs)
@@ -44,41 +121,14 @@
# make LANGS="en de fr" multilingual.submitcpu
#
#--------------------------------------------------------------------
# Some examples using generic extensions
#
# * submit job to train en-ru with backtranslation data from backtranslate/
# make HPC_CORES=4 WALLTIME=24 SRCLANGS=en TRGLANGS=ru unidirectional-add-backtranslations.submitcpu
#
# * submit job that evaluates all currently trained models:
# make eval-allmodels.submit
# make eval-allbilingual.submit # only bilingual models
# make eval-allmultilingual.submit # only multilingual models
#
#--------------------------------------------------------------------
#
# general parameters / variables (see Makefile.config)
# SRCLANGS ............ set source language(s) (en)
# TRGLANGS ............ set target language(s) (de)
#
# Ensembles: One can train a number of models and ensemble them.
# NOTE: make sure that the data files and vocabularies exist before
# training the models. Otherwise, there could be a race condition
# when those jobs start simultaneously!
#
# submit jobs by adding a suffix to the make target to be run
# .submit ........ job on GPU nodes (for train and translate)
# .submitcpu ..... job on CPU nodes (for translate and eval)
#
# for example:
# make train.submit
#
# run a multigpu job, for example
# make train-multigpu.submit
# make train-twogpu.submit
# make train-gpu01.submit
# make train-gpu23.submit
#
#
# typical procedure: train and evaluate en-de with an ensemble of 3 models
#
# make data.submitcpu
# make vocab.submit
# make data
# make vocab
# make NR=1 train.submit
# make NR=2 train.submit
# make NR=3 train.submit
@@ -86,50 +136,20 @@
# make NR=1 eval.submit
# make NR=2 eval.submit
# make NR=3 eval.submit
#
# make eval-ensemble.submit
#
#
# include right-to-left models:
#
# make NR=1 train-RL.submit
# make NR=2 train-RL.submit
# make NR=3 train-RL.submit
#
#
#--------------------------------------------------------------------
# train several versions of the same model (for ensembling)
#
# make NR=1 ....
# make NR=2 ....
# make NR=3 ....
#
# DANGER: there is a problem with vocabulary files if you start the
# jobs simultaneously --> race condition between the processes creating them
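#
# One hypothetical guard against that race (not part of this Makefile):
# declare the shared vocabulary as an order-only prerequisite so that it
# is built exactly once before any of the parallel jobs needs it:
#
#   train: | ${WORKDIR}/${MODEL}.vocab.yml
#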
#
#--------------------------------------------------------------------
# resume training
#
# make resume
#
#--------------------------------------------------------------------
# translate with ensembles of models
#
# make translate-ensemble
# make eval-ensemble
#
# this only makes sense if there are several models
# (created with different NR)
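#
# Under the hood, ensembling in Marian simply means passing several
# model files to the decoder (an illustrative call; file names are
# made up):
#
#   marian-decoder -m model.NR1.npz model.NR2.npz model.NR3.npz \
#                  -v vocab.src.yml vocab.trg.yml \
#                  < testset.src > testset.out
#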
#--------------------------------------------------------------------
# check and adjust lib/env.mk and lib/config.mk
# add specific tasks in lib/tasks.mk
include lib/env.mk
include lib/config.mk
## load model-specific configuration parameters
# load model-specific configuration parameters
# if they exist in the work directory
ifneq ($(wildcard ${WORKDIR}/config.mk),)
include ${WORKDIR}/config.mk
endif
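
# such a model-specific config.mk could pin, for example
# (hypothetical contents):
#
#   SRCLANGS = en
#   TRGLANGS = ru
#   MODELTYPE = transformer-align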
@@ -170,8 +190,20 @@ include lib/models/simplify.mk
# include Makefile.slurm
.PHONY: all
all: ${WORKDIR}/config.mk
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
.PHONY: train-job
train-job: ${WORKDIR}/config.mk
${MAKE} data
${MAKE} HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} train.submit${GPUJOB_SUBMIT}
#------------------------------------------------------------------------
# make various data sets
# make various data sets (and word alignment)
#------------------------------------------------------------------------


@@ -8,13 +8,20 @@ This package includes scripts for training NMT models using MarianNMT and OPUS d
Essential files for making new models:
* `Makefile`: top-level makefile
* `Makefile.env`: system-specific environment (now based on CSC machines)
* `Makefile.config`: essential model configuration
* `Makefile.data`: data pre-processing tasks
* `Makefile.doclevel`: experimental document-level models
* `Makefile.tasks`: tasks for training specific models and other things (this frequently changes)
* `Makefile.dist`: make packages for distributing models (CSC ObjectStorage based)
* `Makefile.slurm`: submit jobs with SLURM
* `lib/env.mk`: system-specific environment (now based on CSC machines)
* `lib/config.mk`: essential model configuration
* `lib/data.mk`: data pre-processing tasks
* `lib/generic.mk`: generic implicit rules that can extend other tasks
* `lib/dist.mk`: make packages for distributing models (CSC ObjectStorage based)
* `lib/slurm.mk`: submit jobs with SLURM
There are also make targets for specific models and tasks. Look into `lib/models/` to see what has been defined already.
Note that this frequently changes! There are, for example:
* `lib/models/multilingual.mk`: various multilingual models
* `lib/models/celtic.mk`: data and models for Celtic languages
* `lib/models/doclevel.mk`: experimental document-level models
Run something like the following if you want to train a model, for example for translating English to French (an illustrative invocation using default settings):
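
    make SRCLANGS=en TRGLANGS=fr train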


@@ -10,13 +10,8 @@
SRCLANGS = sv
TRGLANGS = fi
ifndef SRC
SRC := ${firstword ${SRCLANGS}}
endif
ifndef TRG
TRG := ${lastword ${TRGLANGS}}
endif
SRC ?= ${firstword ${SRCLANGS}}
TRG ?= ${lastword ${TRGLANGS}}
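
# note: "?=" assigns only if the variable is not yet defined, just like
# the removed ifndef blocks, but it creates a recursively expanded
# variable (the old code used ":=" here); with the defaults above this
# yields, e.g., SRC = sv and TRG = fi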
# sorted languages and langpair used to match resources in OPUS
@@ -29,15 +24,9 @@ LANGPAIRSTR = ${LANGSRCSTR}-${LANGTRGSTR}
## for monolingual things
ifndef LANGS
LANGS := ${SRCLANGS}
endif
ifndef LANGID
LANGID := ${firstword ${LANGS}}
endif
ifndef LANGSTR
LANGSTR = ${subst ${SPACE},+,$(LANGS)}
endif
LANGS ?= ${SRCLANGS}
LANGID ?= ${firstword ${LANGS}}
LANGSTR ?= ${subst ${SPACE},+,$(LANGS)}
## for same language pairs: add numeric extension
@@ -103,6 +92,7 @@ HELDOUTSIZE = ${DEVSIZE}
## - check that data exist
## - check that there are at least 2 x DEVMINSIZE examples
## TODO: this does not work well for multilingual models!
## TODO: find a better solution than looking into *.info files (use OPUS API?)
ifneq ($(wildcard ${OPUSHOME}/Tatoeba/latest/moses/${LANGPAIR}.txt.zip),)
ifeq ($(shell if (( `head -1 ${OPUSHOME}/Tatoeba/latest/info/${LANGPAIR}.txt.info` \
@@ -175,9 +165,7 @@ BPESIZE = 32000
SRCBPESIZE = ${BPESIZE}
TRGBPESIZE = ${BPESIZE}
ifndef VOCABSIZE
VOCABSIZE = $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
endif
VOCABSIZE ?= $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
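## note: the doubled "$$" escapes make expansion, so VOCABSIZE expands to
## a shell arithmetic expression that is evaluated inside recipes;
## with the defaults above: $((32000 + 32000 + 1000)) = 65000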
## for document-level models
CONTEXT_SIZE = 100


@@ -33,18 +33,11 @@ WALLTIME = 72
## set variables with HPC prefix
ifndef HPC_TIME
HPC_TIME = ${WALLTIME}:00
endif
ifndef HPC_CORES
HPC_CORES = ${THREADS}
endif
ifndef HPC_MEM
HPC_MEM = ${MEM}
endif
HPC_TIME ?= ${WALLTIME}:00
HPC_CORES ?= ${THREADS}
HPC_MEM ?= ${MEM}
GPUJOB_HPC_MEM ?= 4g
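
## e.g. "make WALLTIME=24 THREADS=8 train.submit" yields HPC_TIME=24:00
## and HPC_CORES=8, while explicitly set HPC_* values take precedence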