# -*-makefile-*-
#
# model and environment configurations
#

# load model-specific configuration parameters
# if they exist in the work directory

##---------------------------------------------------------------
## default name of the data set (and the model)
##---------------------------------------------------------------

TRAINSET_NAME ?= opus
DATASET       ?= ${TRAINSET_NAME}

## various ways of setting the model languages
##
## (1) explicitly set source and target languages, for example:
##     SRCLANGS="da no sv" TRGLANGS="fi da"
##
## (2) specify language pairs, for example:
##     LANGPAIRS="de-en fi-sv da-es"
##     this will set SRCLANGS="da de fi" TRGLANGS="en es sv"
##     (the $(sort) calls below order the languages and remove duplicates)
##

## if LANGPAIRS is set and the model is not supposed to be SYMMETRIC,
## then set SRCLANGS and TRGLANGS to the languages found in LANGPAIRS

ifdef LANGPAIRS
  SRCLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f1 -d '-'}}
  TRGLANGS ?= ${sort ${shell echo "${LANGPAIRS}" | tr ' ' "\n" | cut -f2 -d '-'}}
endif

## LANGPAIRSTR is used as a sub-dir in WORKHOME

SPACE       := $(empty) $(empty)
LANGSRCSTR  ?= ${subst ${SPACE},+,$(SRCLANGS)}
LANGTRGSTR  ?= ${subst ${SPACE},+,$(TRGLANGS)}
LANGPAIRSTR ?= ${LANGSRCSTR}-${LANGTRGSTR}
WORKDIR      = ${WORKHOME}/${LANGPAIRSTR}
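
## Example (illustration only, not part of the original recipes): with
##   SRCLANGS="da no sv" TRGLANGS="fi"
## the settings above expand to
##   LANGSRCSTR  = da+no+sv
##   LANGTRGSTR  = fi
##   LANGPAIRSTR = da+no+sv-fi
## and all work files for this model end up under ${WORKHOME}/da+no+sv-fi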

## default model type

MODELTYPE = transformer-align

MODELCONFIG = ${DATASET}${MODEL_VARIANT}.${MODELTYPE}.mk
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
  include ${WORKDIR}/${MODELCONFIG}
endif
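
## Example (illustration only, assuming MODEL_VARIANT is left empty): with the
## defaults above, the model-specific config file is
##   ${WORKDIR}/opus.transformer-align.mk
## If that file exists (e.g. created by the ${WORKDIR}/${MODELCONFIG} rule
## further down), it is included here and its settings take precedence over
## the ?=-style defaults later in this file.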

## some pre-defined language sets

include ${REPOHOME}lib/langsets.mk


## supported model types
## configuration for each type is in lib/train.mk

MODELTYPES = transformer \
             transformer-align \
             transformer-base \
             transformer-base-align \
             transformer-big \
             transformer-big-align \
             transformer-small \
             transformer-small-align \
             transformer-tiny \
             transformer-tiny-align \
             transformer-tiny11 \
             transformer-tiny11-align
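
## Example (illustration only, assuming the usual train goal defined elsewhere
## in this repository): pick one of the model types above on the command line:
##
##   make MODELTYPE=transformer-big SRCLANGS=de TRGLANGS=en train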

## clean-corpus script parameters
## (for filtering subword-segmented bitexts)
##
## (TODO: should MIN_NR_TOKENS be 1?)

# MIN_NR_TOKENS  = 0
# MAX_NR_TOKENS  = 250
MIN_NR_TOKENS    = 1
MAX_NR_TOKENS    = 500
NR_TOKEN_RATIO   = 2
MAX_TOKEN_LENGTH = 100

## default values in the original script:
##
# MAX_TOKEN_LENGTH = 1000
# NR_TOKEN_RATIO   = 9
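
## Example (illustration only): with the settings above the cleaning step is
## meant to keep only sentence pairs where both sides have between 1 and 500
## tokens, no token is longer than 100 characters, and the token-count ratio
## between the two sides does not exceed 2; e.g. a 10-token source paired
## with a 25-token target (ratio 2.5) would be dropped.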


## name of the model-specific configuration file
## NEW: make it more model-specific
#
# MODELCONFIG ?= config.mk


## set SRC and TRG unless they are specified already

ifneq (${words ${SRCLANGS}},1)
  SRC ?= multi
else
  SRC ?= ${SRCLANGS}
endif

ifneq (${words ${TRGLANGS}},1)
  TRG ?= multi
else
  TRG ?= ${TRGLANGS}
endif


##----------------------------------------------------------------------
## SKIP_LANGPAIRS can be used to skip certain language pairs
## in data preparation for multilingual models
## ---> this is useful for skipping BIG language pairs
##      that would otherwise dominate all the data
##      must be a pattern that can be matched by egrep,
##      e.g. en-de|en-fr
##
## SKIP_SAME_LANG - set to 1 to skip data with the same language
##                  on both sides
##----------------------------------------------------------------------

SKIP_LANGPAIRS ?= "nothing"
SKIP_SAME_LANG ?= 0
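
## Example (illustration only, assuming the usual data-preparation goal
## defined elsewhere in this repository): skip English-German and
## English-French bitexts and drop segments with identical language on both
## sides when building a multilingual model:
##
##   make SKIP_LANGPAIRS="en-de|en-fr" SKIP_SAME_LANG=1 \
##        SRCLANGS="en de fr" TRGLANGS="en de fr" data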


##----------------------------------------------------------------------
## set SHUFFLE_DATA if you want to shuffle the data of each
## language pair before it is added to the training data
## --> especially useful in connection with FIT_DATA_SIZE
## set DATA_IS_SHUFFLED=1 if the training data is already shuffled
## --> useful to avoid shuffling when training the sentence piece model
## NEW (2021-12-16): SHUFFLE_DATA is now set by default
## --> can now also avoid sqlite and data shuffling inside MarianNMT
## --> is that a problem (would MarianNMT use different random shuffles / epoch?)
##----------------------------------------------------------------------

SHUFFLE_DATA ?= 1
# DATA_IS_SHUFFLED ?= 1

## devtest data is shuffled by default
SHUFFLE_DEVDATA ?= 1

## shuffle multilingual training data to mix language examples
SHUFFLE_MULTILINGUAL_DATA ?= 1


##----------------------------------------------------------------------
## set FIT_DATA_SIZE to a specific value to fit the training data
## to a certain number of lines for each language pair in the collection
## --> especially useful for multilingual models to balance the
##     size of each language pair
##     the script does both over- and undersampling
##----------------------------------------------------------------------

# FIT_DATA_SIZE ?= 100000

## similar for the dev data: set FIT_DEVDATA_SIZE to
## balance the size of the devdata for each language pair
##
# FIT_DEVDATA_SIZE =

## define a default dev size fit for multilingual models
## TODO: is 1000 too small? or too big?
## TODO: should this depend on the number of languages involved?

ifneq (${words ${TRGLANGS}},1)
  FIT_DEVDATA_SIZE ?= 1000
endif
ifneq (${words ${SRCLANGS}},1)
  FIT_DEVDATA_SIZE ?= 1000
endif

## maximum number of times the same data set is repeated
## during oversampling
MAX_OVER_SAMPLING ?= 50
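
## Example (illustration only): with FIT_DATA_SIZE=100000, a language pair
## with 20k training sentences is oversampled (5 copies), a pair with 1M
## sentences is downsampled to 100k, and a pair with only 1k sentences is
## repeated at most MAX_OVER_SAMPLING=50 times (i.e. 50k lines, not 100k):
##
##   make FIT_DATA_SIZE=100000 SRCLANGS="de fi da" TRGLANGS=en data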


##----------------------------------------------------------------------
## set CHECK_TRAINDATA_SIZE if you want to check that each
## bitext has equal number of lines in source and target
## ---> this only prints a warning if not
##----------------------------------------------------------------------

# CHECK_TRAINDATA_SIZE = 1


# sorted languages and langpair used to match resources in OPUS

SORTLANGS       = $(sort ${SRC} ${TRG})
SORTSRC         = ${firstword ${SORTLANGS}}
SORTTRG         = ${lastword ${SORTLANGS}}
LANGPAIR        = ${SORTSRC}-${SORTTRG}
SORTED_LANGPAIR = ${SORTSRC}-${SORTTRG}
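
## Example: SRC=sv TRG=fi gives SORTLANGS = "fi sv" and therefore
## LANGPAIR = SORTED_LANGPAIR = fi-sv, i.e. the language codes in sorted
## order, which is how bitexts are typically named in OPUS.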


## for monolingual things

LANGS   ?= ${SRCLANGS}
LANGID  ?= ${firstword ${LANGS}}
LANGSTR ?= ${subst ${SPACE},+,$(LANGS)}


## for same language pairs: add numeric extension
## (this is necessary to keep source and target files separate)

ifeq (${SRC},$(TRG))
  SRCEXT     = ${SRC}1
  TRGEXT     = ${SRC}2
  SORTSRCEXT = ${SORTSRC}1
  SORTTRGEXT = ${SORTSRC}2
else
  SRCEXT     = ${SRC}
  TRGEXT     = ${TRG}
  SORTSRCEXT = ${SORTSRC}
  SORTTRGEXT = ${SORTTRG}
endif


## set a flag to use target language labels
## in multi-target models

ifneq (${words ${TRGLANGS}},1)
  USE_TARGET_LABELS = 1
  TARGET_LABELS    ?= $(patsubst %,>>%<<,${TRGLANGS})
endif
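
## Example (illustration only): with TRGLANGS="de fr" the labels >>de<< and
## >>fr<< are generated, and one of them is prepended to each source sentence
## during preprocessing (presumably in the preprocessing recipes elsewhere in
## this repository) so the model knows which target language to produce.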


## size of dev data, test data and BPE merge operations
## NEW default size = 2500 (keep more for training for small languages)
## NOTE: size will be increased to 5000 for Tatoeba

DEVSIZE  ?= 2500
TESTSIZE ?= 2500

## set some additional thresholds for
## the size of test and dev data
## DEVMINSIZE is the absolute minimum we require
## to run any training procedures

DEVSMALLSIZE  ?= 1000
TESTSMALLSIZE ?= 1000
DEVMINSIZE    ?= 250


## set additional argument options for opus_read (if it is used)
## e.g. OPUSREAD_ARGS = -a certainty -tr 0.3
OPUSREAD_ARGS =


##----------------------------------------------------------------------------
## resources in OPUS
##----------------------------------------------------------------------------

## get available data from the OPUS-API

OPUSAPI      = http://opus.nlpl.eu/opusapi/
OPUSAPI_WGET = ${WGET} -qq --no-check-certificate -O - ${OPUSAPI}?

get-opus-mono      = ${shell ${OPUSAPI_WGET}source=${1}\&corpora=True | ${JQ} '.corpora[]' | tr '"' ' '}
get-opus-bitexts   = ${shell ${OPUSAPI_WGET}source=${1}\&target=${2}\&corpora=True | ${JQ} '.corpora[]' | tr '"' ' '}
get-bigger-bitexts = ${shell ${OPUSAPI_WGET}source=${1}\&target=${2}\&preprocessing=xml\&version=latest | \
		${JQ} -r '.corpora[1:] | .[] | select(.source!="") | select(.target!="") | select(.alignment_pairs>${3}) | .corpus' }
get-opus-langs     = ${shell ${OPUSAPI_WGET}languages=True | ${JQ} '.languages[]' | tr '"' ' '}
get-opus-version   = ${shell ${OPUSAPI_WGET}source=${1}\&target=${2}\&corpus=${3}\&preprocessing=xml\&version=latest | ${JQ} '.corpora[] | .version' | sed 's/"//g' | head -1}
get-elra-bitexts   = ${shell ${OPUSAPI_WGET}source=${1}\&target=${2}\&corpora=True | \
		${JQ} '.corpora[]' | tr '"' ' ' | grep '^ *ELR[CA][-_]'}
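
## Example (illustration only; the variable names below are hypothetical):
## these macros can be expanded like ordinary make functions, e.g.
##
##   ALL_DE_EN_CORPORA = ${call get-opus-bitexts,de,en}
##   TATOEBA_VERSION   = ${call get-opus-version,de,en,Tatoeba}
##
## the first returns the names of all OPUS corpora with German-English
## bitexts, the second the latest release of the Tatoeba corpus for de-en.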


## start of some functions to check whether there is a resource for downloading
## open question: links to the latest release do not exist in the storage
## --> would it be better to get that done via the OPUS API?

OPUS_STORE = https://object.pouta.csc.fi/OPUS-

url-status   = ${shell curl -Is -X HEAD ${1} | head -1}
url-exists   = ${shell if [ "${call url-status,${1}}" == "HTTP/1.1 200 OK" ]; then echo 1; else echo 0; fi}
resource-url = ${shell echo "${OPUS_STORE}${3}/${call get-opus-version,${1},${2},${3}}/moses/${1}-${2}.txt.zip"}
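
## Example (illustration only; the variable name below is hypothetical):
## check whether the latest Tatoeba de-en release is downloadable from the
## OPUS object storage before trying to fetch it:
##
##   TATOEBA_DE_EN_URL = ${call resource-url,de,en,Tatoeba}
##   ifeq (${call url-exists,${TATOEBA_DE_EN_URL}},1)
##     # ... safe to download ${TATOEBA_DE_EN_URL}
##   endif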


## exclude certain data sets

# EXCLUDE_CORPORA ?= WMT-News MPC1 ${call get-elra-bitexts,${SRC},${TRG}}
EXCLUDE_CORPORA ?= WMT-News MPC1

# all matching corpora in OPUS except for some that we want to exclude
OPUSCORPORA = $(filter-out ${EXCLUDE_CORPORA},${call get-opus-bitexts,${SRC},${TRG}})

## monolingual data
OPUSMONOCORPORA = $(filter-out ${EXCLUDE_CORPORA},${call get-opus-mono,${LANGID}})

## all languages in OPUS
## TODO: do we need this?
OPUSLANGS := ${call get-opus-langs}

OPUS_LANGS3            := ${sort ${filter-out xxx,${shell ${GET_ISO_CODE} ${OPUSLANGS}}}}
OPUS_LANG_PARENTS      := ${sort ${shell langgroup -p -n ${OPUS_LANGS3} 2>/dev/null}}
OPUS_LANG_GRANDPARENTS := ${sort ${shell langgroup -p -n ${OPUS_LANG_PARENTS} 2>/dev/null}}
OPUS_LANG_GROUPS       := ${sort ${OPUS_LANG_PARENTS} ${OPUS_LANG_GRANDPARENTS}}


##----------------------------------------------------------------------------
## train/dev/test data
##----------------------------------------------------------------------------


## select a suitable DEVSET
## - POTENTIAL_DEVSETS lists more or less reliable corpora (in order of priority)
## - BIGGER_BITEXTS lists all bitexts with more than DEVSMALLSIZE sentence pairs
## - SMALLER_BITEXTS lists potentially smaller bitexts that are at least DEVMINSIZE big
## - DEVSET is the first of the potential devsets that exists with sufficient size
## TODO: what do we do if there is no devset?

POTENTIAL_DEVSETS = Tatoeba GlobalVoices infopankki wikimedia TED2020 Europarl OpenSubtitles JW300 bible-uedin

BIGGER_BITEXTS  := ${call get-bigger-bitexts,${SRC},${TRG},${DEVSMALLSIZE}}
SMALLER_BITEXTS := ${call get-bigger-bitexts,${SRC},${TRG},${DEVMINSIZE}}

DEVSET ?= ${firstword ${filter ${POTENTIAL_DEVSETS},${BIGGER_BITEXTS}} \
		${filter ${POTENTIAL_DEVSETS},${SMALLER_BITEXTS}}}

print-potential-datasets:
	@echo "bigger  : ${BIGGER_BITEXTS}"
	@echo "smaller : ${SMALLER_BITEXTS}"
	@echo "selected: ${DEVSET}"


## increase dev/test sets for Tatoeba (very short sentences!)

ifeq (${DEVSET},Tatoeba)
  DEVSIZE  = 5000
  TESTSIZE = 5000
endif


## in case we want to use some additional data sets
# EXTRA_TRAINSET =

## TESTSET = DEVSET; TRAINSET = all OPUS corpora minus the excluded corpora, DEVSET and TESTSET

TESTSET  ?= ${DEVSET}
TRAINSET ?= $(filter-out ${EXCLUDE_CORPORA} ${DEVSET} ${TESTSET},${OPUSCORPORA} ${EXTRA_TRAINSET})
MONOSET  ?= $(filter-out ${EXCLUDE_CORPORA} ${DEVSET} ${TESTSET},${OPUSMONOCORPORA} ${EXTRA_TRAINSET})

## 1 = use the data remaining after dev/test extraction for training
USE_REST_DEVDATA ?= 1


## for model fine-tuning

TUNE_SRC ?= ${SRC}
TUNE_TRG ?= ${TRG}

TUNE_DOMAIN        ?= OpenSubtitles
TUNE_FIT_DATA_SIZE ?= 1000000

TUNE_VALID_FREQ     ?= 1000
TUNE_DISP_FREQ      ?= 1000
TUNE_SAVE_FREQ      ?= 1000
TUNE_EARLY_STOPPING ?= 5
TUNE_GPUJOB_SUBMIT  ?=


## existing projects in WORKHOME

ALL_LANG_PAIRS          := ${shell ls ${WORKHOME} 2>/dev/null | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS    := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
ALL_MULTILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -- '\+'}


##----------------------------------------------------------------------------
## pre-processing and vocabulary
##----------------------------------------------------------------------------

## joint source+target sentencepiece model
ifeq (${USE_JOINT_SUBWORD_MODEL},1)
  SUBWORDS = jointspm
endif

## type of subword segmentation (bpe|spm)
## model vocabulary size (NOTE: BPESIZE is used as default)

SUBWORDS   ?= spm
BPESIZE    ?= 32000
SRCBPESIZE ?= ${BPESIZE}
TRGBPESIZE ?= ${BPESIZE}

SUBWORD_VOCAB_SIZE    ?= ${BPESIZE}
SUBWORD_SRCVOCAB_SIZE ?= ${SUBWORD_VOCAB_SIZE}
SUBWORD_TRGVOCAB_SIZE ?= ${SUBWORD_VOCAB_SIZE}

SUBWORD_MODEL_NAME ?= opus

ifeq (${SUBWORDS},bpe)
  BPESRCMODEL       = ${WORKDIR}/train/${SUBWORD_MODEL_NAME}.src.bpe${SUBWORD_SRCVOCAB_SIZE:000=}k-model
  BPETRGMODEL       = ${WORKDIR}/train/${SUBWORD_MODEL_NAME}.trg.bpe${SUBWORD_TRGVOCAB_SIZE:000=}k-model
  BPE_MODEL         = ${WORKDIR}/train/${SUBWORD_MODEL_NAME}.bpe${SUBWORD_VOCAB_SIZE:000=}k-model
  SUBWORD_SRC_MODEL = ${BPESRCMODEL}
  SUBWORD_TRG_MODEL = ${BPETRGMODEL}
else
  SPMSRCMODEL       = ${WORKDIR}/train/${SUBWORD_MODEL_NAME}.src.${SUBWORDS}${SUBWORD_SRCVOCAB_SIZE:000=}k-model
  SPMTRGMODEL       = ${WORKDIR}/train/${SUBWORD_MODEL_NAME}.trg.${SUBWORDS}${SUBWORD_TRGVOCAB_SIZE:000=}k-model
  SPM_MODEL         = ${WORKDIR}/train/${SUBWORD_MODEL_NAME}.${SUBWORDS}${SUBWORD_VOCAB_SIZE:000=}k-model
  SUBWORD_SRC_MODEL = ${SPMSRCMODEL}
  SUBWORD_TRG_MODEL = ${SPMTRGMODEL}
  SUBWORD_SRC_VOCAB = ${SPMSRCMODEL}.vocab
  SUBWORD_TRG_VOCAB = ${SPMTRGMODEL}.vocab
endif


## don't delete subword models!
.PRECIOUS: ${SUBWORD_SRC_MODEL} ${SUBWORD_TRG_MODEL}

## size of the joined vocabulary
## TODO: heuristically adding 1,000 to cover language labels is a bit ad hoc
VOCABSIZE ?= $$((${SUBWORD_SRCVOCAB_SIZE} + ${SUBWORD_TRGVOCAB_SIZE} + 1000))
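
## Example: with the default subword vocabularies of 32,000 entries on each
## side, the joint vocabulary is capped at 32000 + 32000 + 1000 = 65000
## entries (the extra 1000 being the ad-hoc reserve for language labels).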


## for document-level models
CONTEXT_SIZE ?= 100


## pre-processing/data-cleanup type
## PRE .......... apply basic normalisation scripts
## CLEAN_TYPE ... clean  = simple noise filtering
##                strict = some additional cleanup based on test set stats
## CLEAN_TESTDATA_TYPE should stay as 'clean' because
## we need those data sets to get the parameters
## for the strict mode

PRE                  ?= simple
CLEAN_TRAINDATA_TYPE ?= strict
CLEAN_DEVDATA_TYPE   ?= strict
CLEAN_TESTDATA_TYPE  ?= clean


## subword splitting type
PRE_SRC = ${SUBWORDS}${SUBWORD_SRCVOCAB_SIZE:000=}k
PRE_TRG = ${SUBWORDS}${SUBWORD_TRGVOCAB_SIZE:000=}k
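
## Example: with SUBWORDS=spm and SUBWORD_SRCVOCAB_SIZE=32000 the substitution
## reference ${SUBWORD_SRCVOCAB_SIZE:000=} strips the trailing "000" so that
## "32000" becomes "32", giving PRE_SRC = spm32k and PRE_TRG = spm32k.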


## dev and test data come from one specific data set
## if we have a bilingual model

ifeq (${words ${SRCLANGS}},1)
ifeq (${words ${TRGLANGS}},1)
  DEVSET_NAME  ?= ${DEVSET}
  TESTSET_NAME ?= ${TESTSET}
endif
endif

## otherwise we give them a generic name

DEVSET_NAME  ?= opus-dev
TESTSET_NAME ?= opus-test


## DATADIR = directory where the train/dev/test data are
## TODO: MODELDIR still in use?
## TODO: SPMDIR still in use? (monolingual sp models)

DATADIR  = ${WORKHOME}/data
MODELDIR = ${WORKHOME}/models/${LANGPAIRSTR}
SPMDIR   = ${WORKHOME}/SentencePieceModels

## train data sets (word alignment for the guided alignment option)

TRAIN_BASE = ${WORKDIR}/train/${DATASET}
TRAIN_SRC  = ${TRAIN_BASE}.src
TRAIN_TRG  = ${TRAIN_BASE}.trg
TRAIN_ALG  = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.src-trg.alg.gz
TRAIN_S2T  = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.s2t.gz
TRAIN_T2S  = ${TRAIN_BASE}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.t2s.gz

## data sets that are pre-processed and ready to be used

TRAINDATA_SRC = ${TRAIN_SRC}.clean.${PRE_SRC}.gz
TRAINDATA_TRG = ${TRAIN_TRG}.clean.${PRE_TRG}.gz
DEVDATA_SRC   = ${DEV_SRC}.${PRE_SRC}
DEVDATA_TRG   = ${DEV_TRG}.${PRE_TRG}
TESTDATA_SRC  = ${TEST_SRC}.${PRE_SRC}
TESTDATA_TRG  = ${TEST_TRG}

## training data in local space

LOCAL_TRAIN_SRC = ${TMPWORKDIR}/${LANGPAIRSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPWORKDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
LOCAL_TRAIN     = ${TMPWORKDIR}/${LANGPAIRSTR}/train/${DATASET}
LOCAL_MONO_DATA = ${TMPWORKDIR}/${LANGSTR}/train/${DATASET}.mono

## dev and test data

DEV_SRC ?= ${WORKDIR}/val/${DEVSET_NAME}.src
DEV_TRG ?= ${WORKDIR}/val/${DEVSET_NAME}.trg

TEST_SRC ?= ${WORKDIR}/test/${TESTSET_NAME}.src
TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg


## home directories for back and forward translation

BACKTRANS_HOME    ?= backtranslate
FORWARDTRANS_HOME ?= ${BACKTRANS_HOME}
PIVOTTRANS_HOME   ?= pivoting


## model basename and optional sub-dir
## NR is used to create model ensembles
## NR is also used to generate a seed value for initialisation

MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}
NR    = 1

MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
MODEL_START    = ${WORKDIR}/${MODEL_BASENAME}.npz
MODEL_DONE     = ${WORKDIR}/${MODEL_BASENAME}.done
MODEL_FINAL    = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_DECODER  = ${MODEL_FINAL}.decoder.yml
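
## Example (illustration only, assuming MODEL_SUBDIR, TRAINSIZE and
## MODEL_VARIANT are left empty): for a de-en model trained with the defaults
## above (DATASET=opus, MODELTYPE=transformer-align, spm32k subwords, NR=1)
## the final model file would be
##   ${WORKHOME}/de-en/opus.spm32k-spm32k.transformer-align.model1.npz.best-perplexity.npz
## with its decoder configuration in the corresponding .decoder.yml file.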

## quantized models

MODEL_BIN              = ${WORKDIR}/${MODEL_BASENAME}.intgemm8.bin
MODEL_BIN_ALPHAS       = ${WORKDIR}/${MODEL_BASENAME}.intgemm8.alphas.bin
MODEL_BIN_TUNED        = ${WORKDIR}/${MODEL_BASENAME}.intgemm8tuned.bin
MODEL_BIN_TUNED_ALPHAS = ${WORKDIR}/${MODEL_BASENAME}.intgemm8tuned.alphas.bin
MODEL_INTGEMM8TUNED    = ${WORKDIR}/${MODEL_BASENAME}.intgemm8tuned.npz

## lexical short-lists

SHORTLIST_NRVOC     = 100
SHORTLIST_NRTRANS   = 100
MODEL_BIN_SHORTLIST = ${WORKDIR}/${MODEL}.lex-s2t-${SHORTLIST_NRVOC}-${SHORTLIST_NRTRANS}.bin


.PRECIOUS: ${MODEL_FINAL} ${MODEL_BIN}


## for sentence-piece models: get plain text vocabularies
## for others: extract vocabulary from training data with MarianNMT
## backwards compatibility: if there is already a vocab-file then use it

# ifeq (${SUBWORDS},spm)
# ifeq ($(wildcard ${WORKDIR}/${MODEL}.vocab.yml),)
#   USE_SPM_VOCAB ?= 1
# endif
# endif

## use vocab from sentence piece instead of
## marian_vocab from training data

ifeq ($(USE_SPM_VOCAB),1)
  MODEL_VOCAB    = ${WORKDIR}/${MODEL}.vocab.yml
  MODEL_SRCVOCAB = ${WORKDIR}/${MODEL}.src.vocab
  MODEL_TRGVOCAB = ${WORKDIR}/${MODEL}.trg.vocab
else
  MODEL_VOCAB    = ${WORKDIR}/${MODEL}.vocab.yml
  MODEL_SRCVOCAB = ${MODEL_VOCAB}
  MODEL_TRGVOCAB = ${MODEL_VOCAB}
endif


# find the latest model that has the same modeltype/modelvariant, with or without guided alignment,
# to be used if the flag CONTINUE_EXISTING is set to 1
# - without guided alignment (remove it if it is part of the current type): ${subst -align,,${MODELTYPE}}
# - with guided alignment (remove and add it again): ${subst -align,,${MODELTYPE}}-align
#
# Don't use models that are tuned for a specific language pair or domain!

ifeq (${CONTINUE_EXISTING},1)
  MODEL_LATEST = $(firstword \
	${shell ls -t	${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz \
			${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz \
			${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz.best-perplexity.npz \
			${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz.best-perplexity.npz \
		2>/dev/null | grep -v 'tuned4' })
  MODEL_LATEST_VOCAB = $(shell echo "${MODEL_LATEST}" | \
		sed 's|\.${PRE_SRC}-${PRE_TRG}\..*$$|.${PRE_SRC}-${PRE_TRG}.vocab.yml|')
  MARIAN_EARLY_STOPPING = 15
endif
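
## Example (illustration only, assuming the usual train goal defined elsewhere
## in this repository): resume from the newest compatible model instead of
## starting from scratch:
##
##   make CONTINUE_EXISTING=1 SRCLANGS=de TRGLANGS=en train
##
## MODEL_LATEST then points at the most recently modified matching model file
## and MODEL_LATEST_VOCAB at its vocabulary file.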


## test set translation and scores

TEST_TRANSLATION = ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
TEST_EVALUATION  = ${TEST_TRANSLATION}.eval
TEST_COMPARISON  = ${TEST_TRANSLATION}.compare


## parameters for running Marian NMT

MARIAN_GPUS             ?= 0
MARIAN_EXTRA             =
MARIAN_VALID_FREQ       ?= 10000
MARIAN_SAVE_FREQ        ?= ${MARIAN_VALID_FREQ}
MARIAN_DISP_FREQ        ?= ${MARIAN_VALID_FREQ}
MARIAN_EARLY_STOPPING   ?= 10
MARIAN_VALID_MINI_BATCH ?= 16
MARIAN_MAXI_BATCH       ?= 500
MARIAN_DROPOUT          ?= 0.1
MARIAN_MAX_LENGTH       ?= 500
MARIAN_ENC_DEPTH        ?= 6
MARIAN_DEC_DEPTH        ?= 6
MARIAN_ATT_HEADS        ?= 8
MARIAN_DIM_EMB          ?= 512
MARIAN_CLIP_NORM        ?= 5

## default = shuffle data and batches
## (set to batches or none to change this)
# MARIAN_SHUFFLE ?= data
MARIAN_SHUFFLE ?= batches

## default: use sqlite database to store data
## remove this to use regular temp data
## set to --shuffle-in-ram to keep all shuffled data in RAM
# MARIAN_DATA_STORAGE ?= --sqlite

## set to global for lower memory usage in multiprocess training
## TODO: does this parameter really work?
MARIAN_SHARDING ?= local


## TODO: currently marianNMT crashes with workspace > 26000 (does it?)
## TODO: move this to individual env settings?
## problem: we need to know MODELTYPE before we can set this

ifeq (${GPU},p100)
  MARIAN_WORKSPACE = 13000
else ifeq (${GPU},a100)
  ifeq ($(subst -align,,${MODELTYPE}),transformer-big)
    MARIAN_WORKSPACE = 15000
  else ifeq ($(subst -align,,${MODELTYPE}),transformer-small)
    MARIAN_WORKSPACE = 10000
  else ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny)
    MARIAN_WORKSPACE = 10000
  else ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny11)
    MARIAN_WORKSPACE = 10000
  else
    MARIAN_WORKSPACE = 20000
  endif
else ifeq (${GPU},v100)
  ifeq ($(subst -align,,${MODELTYPE}),transformer-big)
    MARIAN_WORKSPACE = 15000
  else ifeq ($(subst -align,,${MODELTYPE}),transformer-small)
    MARIAN_WORKSPACE = 10000
  else ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny)
    MARIAN_WORKSPACE = 10000
  else ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny11)
    MARIAN_WORKSPACE = 10000
  else
    MARIAN_WORKSPACE = 20000
  endif
else
  MARIAN_WORKSPACE = 10000
endif
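
## Example: GPU=v100 with MODELTYPE=transformer-big selects a Marian
## workspace of 15000 MB, GPU=v100 with the default transformer-align falls
## through to the 20000 MB branch, and any other GPU value gets the
## conservative 10000 MB default (the value is passed to Marian's -w option).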


## TODO: do we need to reduce workspace for decoding?
# MARIAN_DECODER_WORKSPACE = $$((${MARIAN_WORKSPACE} / 2))
MARIAN_DECODER_WORKSPACE = 10000


## weights associated with training examples
ifneq ("$(wildcard ${TRAIN_WEIGHTS})","")
  MARIAN_TRAIN_WEIGHTS = --data-weighting ${TRAIN_WEIGHTS}
endif


## NR allows training several models for proper ensembling
## (with a shared vocabulary)
## DANGER: if several models are started at the same time,
## there is a race condition when creating the vocabulary!

ifdef NR
  SEED=${NR}${NR}${NR}${NR}
else
  SEED=1234
endif
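
## Example: NR=2 gives SEED=2222, NR=3 gives SEED=3333, etc., so ensemble
## members trained with different NR values also get different random seeds;
## the fallback SEED=1234 only applies if NR is overridden with an empty value.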


## decoder flags (CPU and GPU variants)

MARIAN_BEAM_SIZE  = 4
MARIAN_MINI_BATCH = 256
MARIAN_MAXI_BATCH = 512
# MARIAN_MINI_BATCH = 512
# MARIAN_MAXI_BATCH = 1024
# MARIAN_MINI_BATCH = 768
# MARIAN_MAXI_BATCH = 2048

ifeq ($(GPU_AVAILABLE),1)
  MARIAN_SCORER_FLAGS = -n1 -d ${MARIAN_GPUS} \
		--quiet-translation -w ${MARIAN_DECODER_WORKSPACE} \
		--mini-batch ${MARIAN_MINI_BATCH} --maxi-batch ${MARIAN_MAXI_BATCH} --maxi-batch-sort src
  MARIAN_DECODER_FLAGS = -b ${MARIAN_BEAM_SIZE} -n1 -d ${MARIAN_GPUS} \
		--quiet-translation -w ${MARIAN_DECODER_WORKSPACE} \
		--mini-batch ${MARIAN_MINI_BATCH} --maxi-batch ${MARIAN_MAXI_BATCH} --maxi-batch-sort src \
		--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
#		--fp16
else
  MARIAN_SCORER_FLAGS = -n1 --cpu-threads ${HPC_CORES} \
		--quiet-translation \
		--mini-batch ${HPC_CORES} --maxi-batch 100 --maxi-batch-sort src
  MARIAN_DECODER_FLAGS = -b ${MARIAN_BEAM_SIZE} -n1 --cpu-threads ${HPC_CORES} \
		--quiet-translation \
		--mini-batch ${HPC_CORES} --maxi-batch 100 --maxi-batch-sort src \
		--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
  MARIAN_EXTRA = --cpu-threads ${HPC_CORES}
endif


## make some data size-specific configuration parameters
## TODO: is it OK to delete LOCAL_TRAIN data?

SMALLEST_TRAINSIZE ?= 10000
SMALL_TRAINSIZE    ?= 100000
MEDIUM_TRAINSIZE   ?= 500000
LARGE_TRAINSIZE    ?= 1000000
LARGEST_TRAINSIZE  ?= 10000000

${WORKDIR}/${MODELCONFIG}:
	@echo ".... create model configuration file '$@'"
	@mkdir -p ${dir $@}
	@if [ -e ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz ]; then \
	  ${MAKE} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq \
	          ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq; \
	  s=`${GZCAT} ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz | head -10000001 | wc -l`; \
	  S=`cat ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.charfreq | wc -l`; \
	  T=`cat ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.charfreq | wc -l`; \
	else \
	  ${MAKE} ${LOCAL_TRAIN_SRC}; \
	  ${MAKE} ${LOCAL_TRAIN_SRC}.charfreq ${LOCAL_TRAIN_TRG}.charfreq; \
	  s=`head -10000001 ${LOCAL_TRAIN_SRC} | wc -l`; \
	  S=`cat ${LOCAL_TRAIN_SRC}.charfreq | wc -l`; \
	  T=`cat ${LOCAL_TRAIN_TRG}.charfreq | wc -l`; \
	fi; \
	if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
	  echo "# ${LANGPAIRSTR} training data bigger than ${LARGEST_TRAINSIZE}" > $@; \
	  echo "GPUJOB_HPC_MEM = 16g" >> $@; \
	  echo "GPUJOB_SUBMIT = -gpu01" >> $@; \
	  echo "SUBWORD_VOCAB_SIZE = ${SUBWORD_VOCAB_SIZE}" >> $@; \
	  echo "DEVSIZE = ${DEVSIZE}" >> $@; \
	  echo "TESTSIZE = ${TESTSIZE}" >> $@; \
	  echo "DEVMINSIZE = ${DEVMINSIZE}" >> $@; \
	elif [ $$s -gt ${LARGE_TRAINSIZE} ]; then \
	  echo "# ${LANGPAIRSTR} training data bigger than ${LARGE_TRAINSIZE}" > $@; \
	  echo "GPUJOB_HPC_MEM = 12g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 2500" >> $@; \
	  echo "SUBWORD_VOCAB_SIZE = ${SUBWORD_VOCAB_SIZE}" >> $@; \
	  echo "DEVSIZE = ${DEVSIZE}" >> $@; \
	  echo "TESTSIZE = ${TESTSIZE}" >> $@; \
	  echo "DEVMINSIZE = ${DEVMINSIZE}" >> $@; \
	elif [ $$s -gt ${MEDIUM_TRAINSIZE} ]; then \
	  echo "# ${LANGPAIRSTR} training data bigger than ${MEDIUM_TRAINSIZE}" > $@; \
	  echo "GPUJOB_HPC_MEM = 8g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 2500" >> $@; \
	  echo "MARIAN_WORKSPACE = 10000" >> $@; \
	  echo "SUBWORD_VOCAB_SIZE = 12000" >> $@; \
	  echo "DEVSIZE = ${DEVSIZE}" >> $@; \
	  echo "TESTSIZE = ${TESTSIZE}" >> $@; \
	  echo "DEVMINSIZE = ${DEVMINSIZE}" >> $@; \
	elif [ $$s -gt ${SMALL_TRAINSIZE} ]; then \
	  echo "# ${LANGPAIRSTR} training data bigger than ${SMALL_TRAINSIZE}" > $@; \
	  echo "GPUJOB_HPC_MEM = 4g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 1000" >> $@; \
	  echo "MARIAN_WORKSPACE = 5000" >> $@; \
	  echo "MARIAN_VALID_MINI_BATCH = 8" >> $@; \
	  echo "SUBWORD_VOCAB_SIZE = 4000" >> $@; \
	  echo "DEVSIZE = 1000" >> $@; \
	  echo "TESTSIZE = 1000" >> $@; \
	  echo "DEVMINSIZE = 250" >> $@; \
	elif [ $$s -gt ${SMALLEST_TRAINSIZE} ]; then \
	  echo "# ${LANGPAIRSTR} training data bigger than ${SMALLEST_TRAINSIZE}" > $@; \
	  echo "GPUJOB_HPC_MEM = 4g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 1000" >> $@; \
	  echo "MARIAN_WORKSPACE = 3500" >> $@; \
	  echo "MARIAN_DROPOUT = 0.5" >> $@; \
	  echo "MARIAN_VALID_MINI_BATCH = 4" >> $@; \
	  echo "SUBWORD_VOCAB_SIZE = 1000" >> $@; \
	  echo "DEVSIZE = 500" >> $@; \
	  echo "TESTSIZE = 1000" >> $@; \
	  echo "DEVMINSIZE = 100" >> $@; \
	else \
	  echo "${LANGPAIRSTR} too small"; \
	fi; \
	if [ -e $@ ]; then \
	  if [ $$S -gt 1000 ]; then \
	    echo "SUBWORD_SRCVOCAB_SIZE = 32000" >> $@; \
	  fi; \
	  if [ $$T -gt 1000 ]; then \
	    echo "SUBWORD_TRGVOCAB_SIZE = 32000" >> $@; \
	  fi; \
	fi
	@echo "SRCLANGS = ${SRCLANGS}" >> $@
	@echo "TRGLANGS = ${TRGLANGS}" >> $@
	@echo "SKIPLANGS = ${SKIPLANGS}" >> $@
	@echo "LANGPAIRSTR = ${LANGPAIRSTR}" >> $@
	@echo "DATASET = ${DATASET}" >> $@
	@echo "TRAINSET = ${TRAINSET}" >> $@
	@echo "DEVSET = ${DEVSET}" >> $@
	@echo "TESTSET = ${TESTSET}" >> $@
	@echo "PRE = ${PRE}" >> $@
	@echo "SUBWORDS = ${SUBWORDS}" >> $@
ifdef SHUFFLE_DATA
	@echo "SHUFFLE_DATA = ${SHUFFLE_DATA}" >> $@
endif
ifdef FIT_DATA_SIZE
	@echo "FIT_DATA_SIZE = ${FIT_DATA_SIZE}" >> $@
endif
ifdef FIT_DEVDATA_SIZE
	@echo "FIT_DEVDATA_SIZE = ${FIT_DEVDATA_SIZE}" >> $@
endif
	@echo "MAX_OVER_SAMPLING = ${MAX_OVER_SAMPLING}" >> $@
	@echo "USE_REST_DEVDATA = ${USE_REST_DEVDATA}" >> $@
ifdef USE_TARGET_LABELS
	@echo "USE_TARGET_LABELS = ${USE_TARGET_LABELS}" >> $@
endif
ifdef USE_SPM_VOCAB
	@echo "USE_SPM_VOCAB = ${USE_SPM_VOCAB}" >> $@
endif
ifdef USE_JOINT_SUBWORD_MODEL
	@echo "USE_JOINT_SUBWORD_MODEL = ${USE_JOINT_SUBWORD_MODEL}" >> $@
endif


################################################################
### DEPRECATED? ################################################
################################################################

## list of all languages in OPUS
## TODO: do we still need this?
## --> see OPUSLANGS which is directly taken from the API

opus-langs.txt:
	${WGET} -O $@.tmp ${OPUSAPI}?languages=true
	grep '",' $@.tmp | tr '",' ' ' | sort | tr "\n" ' ' | sed 's/  */ /g' > $@
	rm -f $@.tmp


## all language pairs in opus in one file
## TODO: do we need this file?

opus-langpairs.txt:
	for l in ${OPUS_LANGS}; do \
	  ${WGET} -O $@.tmp ${OPUSAPI}?source=$$l\&languages=true; \
	  grep '",' $@.tmp | tr '",' ' ' | sort | tr "\n" ' ' | sed 's/  */ /g' > $@.tmp2; \
	  for t in `cat $@.tmp2`; do \
	    if [ $$t \< $$l ]; then \
	      echo "$$t-$$l" >> $@.all; \
	    else \
	      echo "$$l-$$t" >> $@.all; \
	    fi \
	  done; \
	  rm -f $@.tmp $@.tmp2; \
	done
	tr ' ' "\n" < $@.all |\
	sed 's/ //g' | sort -u | tr "\n" ' ' > $@
	rm -f $@.all