# Mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git (synced 2024-12-24)
# -*-makefile-*-
#
# generic implicit targets that make our life a bit easier
## extension -all: run something over all language pairs, e.g.
##
##   make wordalign-all
##
## this goes sequentially over all language pairs
## for the parallelizable version of this: look at %-all-parallel
##
## Each entry in ALL_LANG_PAIRS is split on '-' into a source and a
## target language group; the sed call rewrites '+'-joined groups into
## space-separated lists (e.g. "en-da+sv" -> SRCLANGS="en" TRGLANGS="da sv"
## — presumed from the variable naming; confirm against ALL_LANG_PAIRS).
## The original target without its "-all" suffix is then re-invoked once
## per pair via recursive $(MAKE).
%-all:
	for l in ${ALL_LANG_PAIRS}; do \
	  ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
		  TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-all=}; \
	done
# run something over all language pairs that have trained models
##  - make eval-allmodels
##  - make dist-allmodels
##
## For every model file "<dataset>.<pre-src>-<pre-trg>.<type>.best-perplexity.npz"
## found directly under ${WORKHOME}/<langpair> (-maxdepth 1), re-invoke the
## target without its "-allmodels" suffix, with SRCLANGS/TRGLANGS taken from
## the language-pair directory name and DATASET / PRE_SRC / PRE_TRG /
## MODELTYPE taken from the dot-separated filename fields.
%-allmodels:
	for l in ${ALL_LANG_PAIRS}; do \
	  m=`find ${WORKHOME}/$$l -maxdepth 1 -name '*.best-perplexity.npz' -printf "%f\n"`; \
	  for i in $$m; do \
	    s=`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`; \
	    t=`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`; \
	    d=`echo $$i | cut -f1 -d.`; \
	    x=`echo $$i | cut -f2 -d. | cut -f1 -d-`; \
	    y=`echo $$i | cut -f2 -d. | cut -f2 -d-`; \
	    v=`echo $$i | cut -f3 -d.`; \
	    echo "model = $$i"; \
	    echo "dataset = $$d"; \
	    echo "src-lang = $$s"; \
	    echo "trg-lang = $$t"; \
	    echo "pre-src = $$x"; \
	    echo "pre-trg = $$y"; \
	    echo "type = $$v"; \
	    ${MAKE} \
		SRCLANGS="$$s" TRGLANGS="$$t" \
		DATASET=$$d \
		PRE_SRC=$$x PRE_TRG=$$y \
		MODELTYPE=$$v ${@:-allmodels=}; \
	  done; \
	done
# NOTE: the ';' after the inner 'done' is required — the backslash-newlines
# join the recipe into one shell line, and "done done" without a separator
# is a shell syntax error (the previous version ended the inner loop with
# a bare 'done \').
## OLD: doesn't work for different model variants
##
# %-allmodels:
# 	for l in ${ALL_LANG_PAIRS}; do \
# 	  if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.best-perplexity.npz' | wc -l` -gt 0 ]; then \
# 	    ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
# 		    TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmodels=}; \
# 	  fi \
# 	done

## list all trained models found directly in ${WORKDIR} and decompose each
## filename "<dataset>.<pre-src>-<pre-trg>.<type>.best-perplexity.npz"
## into its fields.
##
## FIX: pre-trg previously used 'cut -f1 -d-' (copy-paste of the pre-src
## line), so it always printed the same value as pre-src; it must take
## field 2, matching the x/y extraction in %-allmodels.
listallmodels:
	@m=`find ${WORKDIR} -maxdepth 1 -name '*.best-perplexity.npz' -printf "%f\n"`; \
	for i in $$m; do \
	  d=`echo $$i | cut -f1 -d.`; \
	  s=`echo $$i | cut -f2 -d. | cut -f1 -d-`; \
	  t=`echo $$i | cut -f2 -d. | cut -f2 -d-`; \
	  v=`echo $$i | cut -f3 -d.`; \
	  echo "model = $$i"; \
	  echo "dataset = $$d"; \
	  echo "pre-src = $$s"; \
	  echo "pre-trg = $$t"; \
	  echo "type = $$v"; \
	done
## only bilingual models
##
## Run the target (without its "-allbilingual" suffix) for every entry of
## ALL_BILINGUAL_MODELS that actually has a trained model matching the
## current PRE_SRC/PRE_TRG preprocessing; pairs without a model are skipped.
%-allbilingual:
	for l in ${ALL_BILINGUAL_MODELS}; do \
	  if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.best-perplexity.npz' | wc -l` -gt 0 ]; then \
	    ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
		    TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allbilingual=}; \
	  fi; \
	done
# NOTE: ';' after 'fi' is required — the continuation backslashes join the
# recipe into one shell line, and "fi done" without a separator is a shell
# syntax error (the previous version ended the if with a bare 'fi \').
## only multilingual models
## (comment fixed: this loops over ALL_MULTILINGUAL_MODELS, not bilingual ones)
##
## Run the target (without its "-allmultilingual" suffix) for every entry of
## ALL_MULTILINGUAL_MODELS that has a trained model for the current
## PRE_SRC/PRE_TRG preprocessing; entries without a model are skipped.
%-allmultilingual:
	for l in ${ALL_MULTILINGUAL_MODELS}; do \
	  if [ `find ${WORKHOME}/$$l -name '*.${PRE_SRC}-${PRE_TRG}.*.best-perplexity.npz' | wc -l` -gt 0 ]; then \
	    ${MAKE} SRCLANGS="`echo $$l | cut -f1 -d'-' | sed 's/\\+/ /g'`" \
		    TRGLANGS="`echo $$l | cut -f2 -d'-' | sed 's/\\+/ /g'`" ${@:-allmultilingual=}; \
	  fi; \
	done
# NOTE: ';' after 'fi' is required — "fi done" on the joined shell line is a
# syntax error (the previous version ended the if with a bare 'fi \').
## run something over all language pairs but make it possible to do it in
## parallel, for example
##  - make dist-all-parallel
##
## Expands to one "<target>__<langpair>-run-for-langpair" goal per language
## pair, so "make -j" can build them concurrently instead of looping
## sequentially as %-all does.
%-all-parallel:
	${MAKE} $(subst -all-parallel,,${patsubst %,$@__%-run-for-langpair,${ALL_LANG_PAIRS}})
## run a command that includes the langpair, for example
##
##   make wordalign__en-da+sv-run-for-langpair
##      ...... runs wordalign with SRCLANGS="en" TRGLANGS="da sv"
##
## What is this good for?
## ---> can run many lang-pairs in parallel instead of having a for loop
##      and running sequentially
##
## How it works: after stripping the "-run-for-langpair" suffix, the part
## after the last "__" is the language pair; it is split on "-" into the
## source group (firstword) and target group (lastword), and '+' within a
## group becomes a space.  The actual goal to run is everything before the
## first "__" (extracted with sed at recipe-expansion time).
%-run-for-langpair:
	${MAKE} SRCLANGS='$(subst +, ,$(firstword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
		TRGLANGS='$(subst +, ,$(lastword $(subst -, ,${lastword ${subst __, ,${@:-run-for-langpair=}}})))' \
		${shell echo $@ | sed 's/__.*$$//'}
## right-to-left model: tag the model name with "-RL", add Marian's
## --right-left flag, and re-run the original target (suffix stripped).
%-RL:
	$(MAKE) MARIAN_EXTRA="$(MARIAN_EXTRA) --right-left" \
		MODEL=$(MODEL)-RL \
		$(patsubst %-RL,%,$@)
## include all backtranslation data as well in training
## start from the pre-trained opus model if it exists

# Names for the backtranslation-augmented ("opus+bt") model.  These are
# recursively expanded (=) on purpose: PRE_SRC, PRE_TRG, MODELTYPE, NR,
# WORKDIR etc. are overridden by other targets, and the values must be
# resolved at use time, not at this point in the file.
BT_MODEL = ${MODEL_SUBDIR}opus+bt${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
# model base name including model type and model number
BT_MODEL_BASE = ${BT_MODEL}.${MODELTYPE}.model${NR}
# weight file used to seed +bt training (copied from the plain model)
BT_MODEL_START = ${WORKDIR}/${BT_MODEL_BASE}.npz
# vocabulary file that accompanies BT_MODEL_START
BT_MODEL_VOCAB = ${WORKDIR}/${BT_MODEL}.vocab.${MODEL_VOCABTYPE}
# %-add-backtranslations:

## re-run a target with back-translated data added to the training set:
## DATASET gets a "+bt" suffix and BACKTRANS_SRC/BACKTRANS_TRG are appended
## to the clean training corpora.  If a final model for the plain dataset
## exists and no "+bt" starting point exists yet, its weights and vocab are
## copied so training continues from the pre-trained model.
##
## NOTE: the ifneq/ifeq below are GNU-make conditionals evaluated when the
## makefile is parsed (with the MODEL_FINAL/BT_MODEL_START values current
## at that time), not when the recipe runs.
%-bt:
ifneq (${wildcard ${MODEL_FINAL}},)
ifeq (${wildcard ${BT_MODEL_START}},)
	cp ${MODEL_FINAL} ${BT_MODEL_START}
	cp ${MODEL_VOCAB} ${BT_MODEL_VOCAB}
endif
endif
	rm -f ${WORKHOME}/${LANGPAIRSTR}/train.submit
	${MAKE} DATASET=${DATASET}+bt \
		CLEAN_TRAIN_SRC="${CLEAN_TRAIN_SRC} ${BACKTRANS_SRC}" \
		CLEAN_TRAIN_TRG="${CLEAN_TRAIN_TRG} ${BACKTRANS_TRG}" \
		MARIAN_EARLY_STOPPING=15 \
		${@:-bt=}
# (the rm above presumably clears a stale job-submission marker so the +bt
# training job gets submitted again — NOTE(review): confirm train.submit
# semantics against the job-submission targets elsewhere in the project)
## run a multigpu job (4 GPUs) on devices 0-3, e.g.
##   make train-multigpu  /  make train-0123
##
## FIX: the suffix removal previously used $(subst -gpu0123,,...), but the
## declared pattern is "%-0123" — "-gpu0123" never occurs in the target
## name, so "foo-0123" was re-invoked unchanged and recursed forever.
## Strip "-0123" instead (cf. the %-twogpu/%-gpu01 rule, which correctly
## strips "-gpu01").
%-multigpu %-0123:
	${MAKE} NR_GPUS=4 MARIAN_GPUS='0 1 2 3' $(subst -0123,,${@:-multigpu=})
## run a two-GPU job on devices 0 and 1 (train-twogpu / train-gpu01):
## strip whichever suffix matched, then re-run the base target.
%-twogpu %-gpu01:
	$(MAKE) NR_GPUS=2 MARIAN_GPUS='0 1' $(subst -gpu01,,$(patsubst %-twogpu,%,$@))
## run a two-GPU job on devices 2 and 3 (e.g. train-gpu23)
%-gpu23:
	$(MAKE) NR_GPUS=2 MARIAN_GPUS='2 3' $(patsubst %-gpu23,%,$@)
## run on CPUs (translate-cpu, eval-cpu, translate-ensemble-cpu, ...):
## switch to the CPU build of Marian, its module environment and its
## decoder flags, then re-run the base target.
%-cpu:
	$(MAKE) MARIAN=$(MARIANCPU) \
		MARIAN_DECODER_FLAGS="$(MARIAN_DECODER_CPU)" \
		LOADMODS='$(LOADCPU)' \
		$(patsubst %-cpu,%,$@)
## document-level models: run in a separate work directory (work-spm,
## resolved to an absolute path) with sentence-piece preprocessing whose
## suffix carries the document context size.
%-doc:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-spm) \
		PRE=norm \
		PRE_SRC=spm$(SRCBPESIZE:000=)k.doc$(CONTEXT_SIZE) \
		PRE_TRG=spm$(TRGBPESIZE:000=)k.doc$(CONTEXT_SIZE) \
		$(patsubst %-doc,%,$@)
## sentence-piece models: run in the work-spm directory (absolute path)
## with spm-based PRE_SRC/PRE_TRG (BPE size with its trailing "000"
## rewritten as "k", e.g. 32000 -> spm32k).
%-spm:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-spm) \
		PRE=norm \
		PRE_SRC=spm$(SRCBPESIZE:000=)k \
		PRE_TRG=spm$(TRGBPESIZE:000=)k \
		$(patsubst %-spm,%,$@)
## sentence-piece models with space-separated strings: same spm setup but
## in work-nospace, with the extra sentence-piece option
## --split_by_whitespace=false passed via SPMEXTRA.
%-nospace:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-nospace) \
		PRE=simple \
		SPMEXTRA=--split_by_whitespace=false \
		PRE_SRC=spm$(SRCBPESIZE:000=)k \
		PRE_TRG=spm$(TRGBPESIZE:000=)k \
		$(patsubst %-nospace,%,$@)
## with SPM models trained on monolingual data: requires the monolingual
## sentence-piece models as prerequisites and points SPMSRCMODEL /
## SPMTRGMODEL at them, running in work-monospm.
%-monospm: $(SPMSRCMONO) $(SPMTRGMONO)
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-monospm) \
		SPMSRCMODEL=$(SPMSRCMONO) \
		SPMTRGMODEL=$(SPMTRGMONO) \
		PRE_SRC=spm$(SRCBPESIZE:000=)k \
		PRE_TRG=spm$(TRGBPESIZE:000=)k \
		$(patsubst %-monospm,%,$@)
## sentence-piece models without guided alignment: forces the plain
## "transformer" model type and runs in work-spm-noalign.
%-spm-noalign:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-spm-noalign) \
		MODELTYPE=transformer \
		PRE=norm \
		PRE_SRC=spm$(SRCBPESIZE:000=)k \
		PRE_TRG=spm$(TRGBPESIZE:000=)k \
		$(patsubst %-spm-noalign,%,$@)
## sentence-piece models with langid-filtering (new default):
## runs in the work-langid directory with simple pre-processing.
%-langid:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-langid) \
		PRE=simple \
		PRE_SRC=spm$(SRCBPESIZE:000=)k \
		PRE_TRG=spm$(TRGBPESIZE:000=)k \
		$(patsubst %-langid,%,$@)
## sentence-piece models with langid-filtering, without guided alignment:
## same work-langid setup as %-langid but with the plain "transformer"
## model type.
%-langid-noalign:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-langid) \
		MODELTYPE=transformer \
		PRE=simple \
		PRE_SRC=spm$(SRCBPESIZE:000=)k \
		PRE_TRG=spm$(TRGBPESIZE:000=)k \
		$(patsubst %-langid-noalign,%,$@)
## BPE models: tokenized pre-processing, plain "transformer" model type,
## bpe-based PRE_SRC/PRE_TRG, running in the work-bpe directory.
%-bpe:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-bpe) \
		PRE=tok \
		MODELTYPE=transformer \
		PRE_SRC=bpe$(SRCBPESIZE:000=)k \
		PRE_TRG=bpe$(TRGBPESIZE:000=)k \
		$(patsubst %-bpe,%,$@)
## BPE models with the default (alignment-enabled) model type, running in
## work-bpe-align; unlike %-bpe, MODELTYPE is left untouched.
%-bpe-align:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-bpe-align) \
		PRE=tok \
		PRE_SRC=bpe$(SRCBPESIZE:000=)k \
		PRE_TRG=bpe$(TRGBPESIZE:000=)k \
		$(patsubst %-bpe-align,%,$@)
## BPE models for the MeMAD setup: same configuration as %-bpe but kept in
## its own work-bpe-memad directory.
%-bpe-memad:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-bpe-memad) \
		PRE=tok \
		MODELTYPE=transformer \
		PRE_SRC=bpe$(SRCBPESIZE:000=)k \
		PRE_TRG=bpe$(TRGBPESIZE:000=)k \
		$(patsubst %-bpe-memad,%,$@)
## legacy BPE models: same configuration as %-bpe but kept in the old
## work-bpe-old directory.
%-bpe-old:
	$(MAKE) WORKHOME=$(shell realpath $(PWD)/work-bpe-old) \
		PRE=tok \
		MODELTYPE=transformer \
		PRE_SRC=bpe$(SRCBPESIZE:000=)k \
		PRE_TRG=bpe$(TRGBPESIZE:000=)k \
		$(patsubst %-bpe-old,%,$@)
## for the inbuilt sentence-piece segmentation:
##
# PRE_SRC=txt PRE_TRG=txt
# MARIAN=${MARIAN}-spm
# MODEL_VOCABTYPE=spm