# -*-makefile-*-
#
# model configurations
#
# SRCLANGS = da no sv
# TRGLANGS = fi
SRCLANGS = sv
TRGLANGS = fi
ifndef SRC
SRC := ${firstword ${SRCLANGS}}
endif
ifndef TRG
TRG := ${lastword ${TRGLANGS}}
endif
# sorted languages and langpair used to match resources in OPUS
SORTLANGS = $(sort ${SRC} ${TRG})
SPACE = $(empty) $(empty)
LANGPAIR = ${firstword ${SORTLANGS}}-${lastword ${SORTLANGS}}
LANGSTR = ${subst ${SPACE},+,$(SRCLANGS)}-${subst ${SPACE},+,$(TRGLANGS)}
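## Example (with the defaults above): SRC = sv and TRG = fi give
## SORTLANGS = "fi sv", LANGPAIR = "fi-sv" and LANGSTR = "sv-fi";
## a multilingual setup like SRCLANGS = "da no sv" gives LANGSTR = "da+no+sv-fi"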
## for identical source and target languages: add a numeric extension
ifeq (${SRC},$(TRG))
SRCEXT = ${SRC}1
TRGEXT = ${SRC}2
else
SRCEXT = ${SRC}
TRGEXT = ${TRG}
endif
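## Example: SRC = TRG = fi yields SRCEXT = fi1 and TRGEXT = fi2,
## keeping the two sides of a monolingual pair apart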
## all of OPUS (NEW: don't require MOSES format)
# OPUSCORPORA = ${patsubst %/latest/moses/${LANGPAIR}.txt.zip,%,\
# ${patsubst ${OPUSHOME}/%,%,\
# ${shell ls ${OPUSHOME}/*/latest/moses/${LANGPAIR}.txt.zip}}}
OPUSCORPORA = ${patsubst %/latest/xml/${LANGPAIR}.xml.gz,%,\
${patsubst ${OPUSHOME}/%,%,\
${shell ls ${OPUSHOME}/*/latest/xml/${LANGPAIR}.xml.gz}}}
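## Example: a release like ${OPUSHOME}/JW300/latest/xml/fi-sv.xml.gz
## contributes "JW300" to OPUSCORPORA (the two patsubst calls strip
## the ${OPUSHOME}/ prefix and the /latest/xml/... suffix)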
ALL_LANG_PAIRS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
ALL_MULTILINGUAL_MODELS = ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -- '\+'}
# ALL_BILINGUAL_MODELS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old | grep -v -- '\+'}
# ALL_MULTILINGUAL_MODELS = ${shell ls ${WORKHOME} | grep -- '-' | grep -v old | grep -- '\+'}
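## Example: a work directory named "da+no+sv-fi" is picked up as
## multilingual (it contains '+'), while "sv-fi" counts as bilingual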
## size of dev data and test data
DEVSIZE = 5000
TESTSIZE = 5000
## NEW: significantly reduce devminsize
## (= absolute minimum we need as devdata)
## NEW: define an alternative small size for DEV and TEST
## OLD DEVMINSIZE:
# DEVMINSIZE = 1000
DEVSMALLSIZE = 1000
TESTSMALLSIZE = 1000
DEVMINSIZE = 250
## size of heldout data for each sub-corpus
## (only taken if the corpus contains at least twice that many examples)
HELDOUTSIZE = ${DEVSIZE}
##----------------------------------------------------------------------------
## train/dev/test data
##----------------------------------------------------------------------------
## dev/test data: Tatoeba by default; backoff to GlobalVoices, then JW300, finally bible-uedin
## - check that data exist
## - check that there are at least 2 x DEVMINSIZE examples
## TODO: this does not work well for multilingual models!
ifneq ($(wildcard ${OPUSHOME}/Tatoeba/latest/moses/${LANGPAIR}.txt.zip),)
ifeq ($(shell if (( `head -1 ${OPUSHOME}/Tatoeba/latest/info/${LANGPAIR}.txt.info` \
> $$((${DEVMINSIZE} + ${DEVMINSIZE})) )); then echo "ok"; fi),ok)
DEVSET = Tatoeba
endif
endif
## backoff to GlobalVoices
ifndef DEVSET
ifneq ($(wildcard ${OPUSHOME}/GlobalVoices/latest/moses/${LANGPAIR}.txt.zip),)
ifeq ($(shell if (( `head -1 ${OPUSHOME}/GlobalVoices/latest/info/${LANGPAIR}.txt.info` \
> $$((${DEVMINSIZE} + ${DEVMINSIZE})) )); then echo "ok"; fi),ok)
DEVSET = GlobalVoices
endif
endif
endif
## backoff to JW300
ifndef DEVSET
ifneq ($(wildcard ${OPUSHOME}/JW300/latest/xml/${LANGPAIR}.xml.gz),)
ifeq ($(shell if (( `sed -n 2p ${OPUSHOME}/JW300/latest/info/${LANGPAIR}.info` \
> $$((${DEVMINSIZE} + ${DEVMINSIZE})) )); then echo "ok"; fi),ok)
DEVSET = JW300
endif
endif
endif
## otherwise: bible-uedin
ifndef DEVSET
DEVSET = bible-uedin
endif
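## Note: the checks above read a corpus size (presumably the number of
## sentence pairs) from the OPUS info files - line 1 of the moses
## .txt.info, line 2 of the xml .info - and require more than
## 2 x DEVMINSIZE (= 500 with the defaults) before accepting a corpus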
## in case we want to use some additional data sets
EXTRA_TRAINSET =
## TESTSET = DEVSET; TRAINSET = all OPUS corpora minus WMT-News, DEVSET and TESTSET
TESTSET = ${DEVSET}
TRAINSET = $(filter-out WMT-News ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
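## Example: with DEVSET = Tatoeba, TRAINSET is all of OPUSCORPORA plus
## EXTRA_TRAINSET, minus WMT-News and Tatoeba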
TUNESET = OpenSubtitles
## 1 = use data left over from the dev/test sets for training
USE_REST_DEVDATA = 1
##----------------------------------------------------------------------------
## pre-processing and vocabulary
##----------------------------------------------------------------------------
BPESIZE = 32000
SRCBPESIZE = ${BPESIZE}
TRGBPESIZE = ${BPESIZE}
ifndef VOCABSIZE
VOCABSIZE = $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
endif
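## Example: the $$ escapes the arithmetic so it is evaluated by the shell
## inside a recipe; with the default BPE sizes it comes out as
## 32000 + 32000 + 1000 = 65000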
## for document-level models
CONTEXT_SIZE = 100
## pre-processing type
PRE = norm
PRE_SRC = spm${SRCBPESIZE:000=}k
PRE_TRG = spm${TRGBPESIZE:000=}k
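## Example: the substitution reference ${SRCBPESIZE:000=}k replaces a
## trailing "000", so BPESIZE = 32000 gives PRE_SRC = PRE_TRG = spm32k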
##-------------------------------------
## default name of the data set (and the model)
##-------------------------------------
ifndef DATASET
DATASET = opus
endif
##-------------------------------------
## OLD OLD OLD
## name of the data set (and the model)
## - single corpus = use that name
## - multiple corpora = opus
## add also vocab size to the name
##-------------------------------------
# ifndef DATASET
# ifeq (${words ${TRAINSET}},1)
# DATASET = ${TRAINSET}
# else
# DATASET = opus
# endif
# endif
## DATADIR = directory where the train/dev/test data are
## WORKDIR = directory used for training
DATADIR = ${WORKHOME}/data
WORKDIR = ${WORKHOME}/${LANGSTR}
## data sets
TRAIN_BASE = ${WORKDIR}/train/${DATASET}
TRAIN_SRC = ${TRAIN_BASE}.src
TRAIN_TRG = ${TRAIN_BASE}.trg
TRAIN_ALG = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.src-trg.alg.gz
## training data in local space
LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGSTR}/train/${DATASET}.trg
TUNE_SRC = ${WORKDIR}/tune/${TUNESET}.src
TUNE_TRG = ${WORKDIR}/tune/${TUNESET}.trg
DEV_SRC = ${WORKDIR}/val/${DEVSET}.src
DEV_TRG = ${WORKDIR}/val/${DEVSET}.trg
TEST_SRC = ${WORKDIR}/test/${TESTSET}.src
TEST_TRG = ${WORKDIR}/test/${TESTSET}.trg
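## Example (defaults, with a hypothetical WORKHOME = work): training data
## lands in work/sv-fi/train/opus.src and work/sv-fi/train/opus.trg,
## dev data in work/sv-fi/val/Tatoeba.src etc. (if Tatoeba is the DEVSET)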
## heldout data directory (keep one set per data set)
HELDOUT_DIR = ${WORKDIR}/heldout
MODEL_SUBDIR =
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
MODELTYPE = transformer-align
NR = 1
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_VOCABTYPE = yml
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
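## Example: with the defaults (DATASET = opus, empty TRAINSIZE, NR = 1)
## MODEL_FINAL resolves to
## ${WORKDIR}/opus.spm32k-spm32k.transformer-align.model1.npz.best-perplexity.npz
## and MODEL_VOCAB to ${WORKDIR}/opus.spm32k-spm32k.vocab.yml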
## test set translation and scores
TEST_TRANSLATION = ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
TEST_EVALUATION = ${TEST_TRANSLATION}.eval
TEST_COMPARISON = ${TEST_TRANSLATION}.compare
## parameters for running Marian NMT
MARIAN_GPUS = 0
MARIAN_EXTRA =
MARIAN_VALID_FREQ = 10000
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_EARLY_STOPPING = 10
MARIAN_VALID_MINI_BATCH = 16
MARIAN_MAXI_BATCH = 500
MARIAN_DROPOUT = 0.1
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
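## Note: like any make variable these can be overridden on the command
## line, e.g. (a sketch; the actual target names live in the main Makefile):
##   make MARIAN_GPUS="0 1" MARIAN_VALID_FREQ=5000 <target>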
## TODO: currently Marian NMT crashes with workspace > 26000
ifeq (${GPU},p100)
MARIAN_WORKSPACE = 13000
else ifeq (${GPU},v100)
# MARIAN_WORKSPACE = 30000
# MARIAN_WORKSPACE = 26000
# MARIAN_WORKSPACE = 24000
# MARIAN_WORKSPACE = 18000
MARIAN_WORKSPACE = 16000
else
MARIAN_WORKSPACE = 10000
endif
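## Example: selecting the GPU type on the command line picks the matching
## workspace, e.g. "make GPU=p100 ..." trains with MARIAN_WORKSPACE = 13000

## fall back to CPU Marian if nvidia-smi reports a failure (no usable GPU)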
ifeq (${shell nvidia-smi | grep failed | wc -l},1)
MARIAN = ${MARIANCPU}
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_CPU}
MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
endif
ifneq ("$(wildcard ${TRAIN_WEIGHTS})","")
MARIAN_TRAIN_WEIGHTS = --data-weighting ${TRAIN_WEIGHTS}
endif
### training a model with Marian NMT
##
## NR makes it possible to train several models for proper ensembling
## (with a shared vocab)
##
## DANGER: if several models are started at the same time,
## there is a race condition when creating the vocab!
ifdef NR
SEED=${NR}${NR}${NR}${NR}
else
SEED=1234
endif
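## Example: NR = 2 gives SEED = 2222, so each ensemble member trains with
## a different random seed while sharing the same vocab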