Merge branch 'puhti' of github.com:Helsinki-NLP/OPUS-MT-train into puhti

Joerg Tiedemann 2022-04-09 20:33:10 +03:00
commit 06e2590a1e
7 changed files with 105 additions and 7 deletions


@@ -557,8 +557,8 @@ ifeq (${CONTINUE_EXISTING},1)
MODEL_LATEST = $(firstword \
	${shell ls -t ${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz \
		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz \
		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.best-perplexity.npz \
		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.best-perplexity.npz \
		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz.best-perplexity.npz \
		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz.best-perplexity.npz \
		2>/dev/null | grep -v 'tuned4' })
MODEL_LATEST_VOCAB = $(shell echo "${MODEL_LATEST}" | \
	sed 's|\.${PRE_SRC}-${PRE_TRG}\..*$$|.${PRE_SRC}-${PRE_TRG}.vocab.yml|')
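The lookup above reduces to a simple pattern: `ls -t` sorts all checkpoint files that match the listed name patterns newest-first, `grep -v 'tuned4'` drops "tuned4..." fine-tuned variants, and `$(firstword ...)` keeps the most recent hit. A minimal standalone sketch of the same idea, with a hypothetical work directory and checkpoint layout:

# sketch only: assumes checkpoints like work/eng-deu/opus.spm32k-spm32k.transformer.model1.npz
LATEST = $(firstword $(shell ls -t work/eng-deu/*.model[0-9].npz 2>/dev/null | grep -v 'tuned4'))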


@@ -119,6 +119,14 @@ ifeq (${USE_PIVOTING},1)
endif
# additional data sets that might be available ...
ifeq (${USE_EXTRA_BITEXTS},1)
  EXTRA_BITEXTS_SRC = ${sort ${wildcard ${DATADIR}/extra/${SRC}-${TRG}/*.${SRCEXT}.gz}}
  EXTRA_BITEXTS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${EXTRA_BITEXTS_SRC}}
endif
print-datasets:
	-@for s in ${SRCLANGS}; do \
	  for t in ${TRGLANGS}; do \
@@ -144,6 +152,10 @@ print-datasets-current-langpair:
	@echo "pivot-based translation data:"
	@echo ${PIVOTING_SRC}
	@echo ${PIVOTING_TRG}
	@echo "extra bitexts:"
	@echo ${EXTRA_BITEXTS_SRC}
	@echo ${EXTRA_BITEXTS_TRG}
##-------------------------------------------------------------
## data sets (train/dev/test)
@@ -153,9 +165,9 @@ print-datasets-current-langpair:
## with some basic pre-processing (see lib/preprocess.mk)
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${SRCEXT}.gz,${TRAINSET}} \
		${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC}
		${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC} ${EXTRA_BITEXTS_SRC}
CLEAN_TRAIN_TRG = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${TRGEXT}.gz,${TRAINSET}} \
		${BACKTRANS_TRG} ${FORWARDTRANS_TRG} ${FORWARDTRANSMONO_TRG} ${PIVOTING_TRG}
		${BACKTRANS_TRG} ${FORWARDTRANS_TRG} ${FORWARDTRANSMONO_TRG} ${PIVOTING_TRG} ${EXTRA_BITEXTS_TRG}
CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_DEVDATA_TYPE}.${SRCEXT}.gz,${DEVSET}}
CLEAN_DEV_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_DEV_SRC}}
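The pairing convention behind EXTRA_BITEXTS_SRC/TRG: the wildcard collects every source-side file under ${DATADIR}/extra/${SRC}-${TRG}/, and patsubst derives the target-side names by swapping the language extension, so each extra bitext has to be provided as two gzipped files with matching basenames. A minimal sketch with hypothetical paths and language codes:

# sketch only: assumes a pair like data/extra/eng-deu/news.eng.gz + data/extra/eng-deu/news.deu.gz
EXTRA_SRC = $(sort $(wildcard data/extra/eng-deu/*.eng.gz))
EXTRA_TRG = $(patsubst %.eng.gz,%.deu.gz,$(EXTRA_SRC))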

lib/env/mahti.mk vendored

@@ -8,6 +8,7 @@ DATA_PREPARE_HPCPARAMS = CPUJOB_HPC_CORES=2 CPUJOB_HPC_MEM=16g
DATA_ALIGN_HPCPARAMS = CPUJOB_HPC_CORES=128 CPUJOB_HPC_JOBS=20 CPUJOB_HPC_MEM=128g
# CSCPROJECT = project_2002982
CSCPROJECT = project_2002688
# CSCPROJECT = project_2005625
WORKHOME = ${shell realpath ${PWD}/work}


@@ -245,6 +245,11 @@ listallmodels:
	${MAKE} DATASET=${DATASET}+bt USE_BACKTRANS=1 SHUFFLE_TRAINING_DATA=1 ${@:-bt=}
## include additional bitexts
%-xb:
	${MAKE} DATASET=${DATASET}+xb USE_EXTRA_BITEXTS=1 SHUFFLE_TRAINING_DATA=1 ${@:-xb=}
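The `${@:-xb=}` substitution strips the `-xb` suffix from the matched target name, so the recipe re-invokes the base goal with the extra-bitext switches turned on. A hypothetical invocation (the goal name and DATASET value are examples):

# make train-xb   is roughly equivalent to
# make DATASET=opus+xb USE_EXTRA_BITEXTS=1 SHUFFLE_TRAINING_DATA=1 train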
## adding a pivot language to the model
## --> add pivot language to each side (source and target)
## --> only start the task if the pivot language adds anything on either side
@@ -285,7 +290,8 @@ FT_SELECTED ?= 95
	@for s in ${SRCLANGS}; do \
	  for t in ${TRGLANGS}; do \
	    if [ -e ${FORWARDTRANS_HOME}/$$s-$$t/latest ]; then \
	      if [ ! -e `ls ${FORWARDTRANS_HOME}/$$s-$$t/latest/*.best${FT_SELECTED}.gz | head -1` ]; then \
	      if [ `ls ${FORWARDTRANS_HOME}/$$s-$$t/latest/ | grep "best${FT_SELECTED}.gz" | wc -l` -eq 0 ]; then \
	        echo "... extract best translations from $$s-$$t forward translations"; \
	        ${MAKE} -C ${FORWARDTRANS_HOME} SRC=$$s TRG=$$t \
	          RETAIN=${FT_SELECTED} extract-best-translations; \
	      fi \
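The second `if` line above replaces the first: instead of probing the result of a glob that may match nothing (an empty command substitution leaves `[ ! -e ]` without an operand, so the old test silently misfires when no best-translation file exists yet), the new test counts matching directory entries, which always yields a clean number. A standalone shell sketch of the check, with a hypothetical directory and threshold:

# sketch only: the directory and the "95" threshold are examples
dir=work/forward/eng-deu/latest
if [ `ls $dir | grep "best95.gz" | wc -l` -eq 0 ]; then
  echo "... extract best translations"
fi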


@@ -86,9 +86,14 @@ elg-ukr-students:
elg-test-tiny2:
	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=eng test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=eng TRGLANGS=ukr test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=deu STUDENT_DATA=ftbest-bt-nopar test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=eng TRGLANGS=ukr STUDENT_DATA=ftbest-bt-nopar test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=deu TRGLANGS=ukr test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=deu TRGLANGS=ukr STUDENT_DATA=ftbest-bt-nopar test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=deu TRGLANGS=ukr STUDENT_DATA=ftbest-ftmono-nopar test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=deu TRGLANGS=ukr STUDENT_DATA=pft-pbt-bt test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=deu test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=deu STUDENT_DATA=ftbest-bt-nopar test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=ukr TRGLANGS=deu STUDENT_DATA=ftbest-ftmono-nopar test-tiny11-student
	${MAKE} EMAIL= SRCLANGS="ces slk" TRGLANGS=ukr STUDENT_DATA=pft-pbt-bt test-tiny11-student
	${MAKE} EMAIL= SRCLANGS=gmq TRGLANGS=ukr STUDENT_DATA=pft-pbt-bt test-tiny11-student
@@ -185,6 +190,14 @@ elg-ukr2lit-tiny11:
	${MAKE} STUDENT_DATA=pft-pbt-bt SRCLANGS=ukr TRGLANGS=lit train-tiny11-student
elg-pol2ukr-student2:
	${MAKE} MARIAN_EXTRA=--no-restore-corpus MARIAN_EARLY_STOPPING=15 CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt-xb SRCLANGS=pol TRGLANGS=ukr train-tiny11-student
elg-ukr2pol-student2:
	${MAKE} MARIAN_EXTRA=--no-restore-corpus MARIAN_EARLY_STOPPING=15 CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=pft-pbt-bt-xb SRCLANGS=ukr TRGLANGS=pol train-tiny11-student
elg-ces_slk2ukr-tiny11:
@@ -228,6 +241,33 @@ elg-ukr2deu-student3:
elg-fin2ukr-student2:
	${MAKE} SUBWORD_VOCAB_SIZE=16000 MARIAN_EARLY_STOPPING=15 SRCLANGS=fin TRGLANGS=ukr CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
elg-fin2ukr-student:
	${MAKE} MARIAN_EARLY_STOPPING=15 SRCLANGS=fin TRGLANGS=ukr CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
elg-ukr2fin-student:
	${MAKE} MARIAN_EXTRA=--no-restore-corpus MARIAN_EARLY_STOPPING=15 SRCLANGS=ukr TRGLANGS=fin CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
elg-zle2fin-student:
	${MAKE} MARIAN_EXTRA=--no-restore-corpus MARIAN_EARLY_STOPPING=15 \
		DATA_PREPARE_HPCPARAMS='${DATA_PREPARE_HPCPARAMS} CPUJOB_HPC_DISK=1000' \
		DATA_ALIGN_HPCPARAMS="${DATA_ALIGN_HPCPARAMS} CPUJOB_HPC_DISK=1000" \
		CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 \
		STUDENT_DATA=ftbest-ftmono-nopar SRCLANGS="ukr rus" TRGLANGS=fin \
		LANGPAIRSTR="zle-fin-tiny" train-tiny11-student
elg-fin2rus-student:
	${MAKE} MARIAN_EARLY_STOPPING=15 SRCLANGS=fin TRGLANGS=rus CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
elg-rus2fin-student:
	${MAKE} MARIAN_EARLY_STOPPING=15 SRCLANGS=rus TRGLANGS=fin CHECK_TRAINDATA_SIZE=1 CLEAN_CORPUS_TRAINING_DATA=1 STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
elg-spa2ukr-student:
	${MAKE} MARIAN_EARLY_STOPPING=15 SRCLANGS=spa TRGLANGS=ukr train-tiny11-student
@@ -252,6 +292,9 @@ elg-ukr2eng-student:
elg-eng2ukr-student2:
	${MAKE} MARIAN_EARLY_STOPPING=15 STUDENT_DATA=ftbest-bt-nopar SRCLANGS=eng TRGLANGS=ukr train-tiny11-student
elg-ukr2eng-student2:
	${MAKE} CONTINUE_EXISTING=1 MARIAN_EARLY_STOPPING=15 SRCLANGS=ukr TRGLANGS=eng STUDENT_DATA=ftbest-ftmono-nopar train-tiny11-student
## missing evaluations and dist packages
@@ -282,6 +325,14 @@ elg-dist-missing:
elg-zle2fin-pivot:
	${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-zle2fin-trainjob-pbt-pft-bt
elg-fin2zle-pivot:
	${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big tatoeba-fin2zle-trainjob-pbt-pft-bt
elg-continue-missing:
	for l in deu fra ita por spa; do \
	  ${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2zle-trainjob; \
@@ -373,6 +424,12 @@ elg-eval-big2zle:
	  ${MAKE} MODELTYPE=transformer-big tatoeba-$${l}2zle-eval-testsets; \
	done
elg-eng2zle-xb:
	${MAKE} MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big CONTINUE_EXISTING=1 tatoeba-eng2zle-trainjob-bt-xb
elg-zle2eng-xb:
	${MAKE} MARIAN_EARLY_STOPPING=25 MARIAN_EXTRA=--no-restore-corpus MODELTYPE=transformer-big CONTINUE_EXISTING=1 tatoeba-zle2eng-trainjob-bt-xb
elg-pivot-eval:
	for l in dan swe fin deu ron tur; do \


@@ -345,3 +345,15 @@ job1-step3:
	${MAKE} all-bt
print-info:
	@echo "model file: ${MODEL_START}"
	@echo "source vocab: ${MODEL_SRCVOCAB}"
	@echo "target vocab: ${MODEL_TRGVOCAB}"
	@echo "final model file: ${MODEL_FINAL}"
	@echo "latest compatible model: ${MODEL_LATEST}"
	ls -t ${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz \
		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz \
		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.best-perplexity.npz \
		${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.best-perplexity.npz
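print-info is purely diagnostic: it echoes the resolved model, vocabulary and checkpoint paths for the current language settings. A hypothetical direct call (language codes are examples):

# make SRCLANGS=ukr TRGLANGS=eng print-info

The tatoeba-%-info wrapper added further down exposes the same information through a language-pair target name.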


@@ -348,7 +348,6 @@ tatoeba-%-train:
	  TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
	  all-groupsize-limits; )
## start the training job
## - create config file
## - create data sets
@@ -383,6 +382,17 @@ tatoeba-%-pivotlang:
	fi
tatoeba-%-info:
	-( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-info,%,$@))); \
	   t=$(lastword $(subst 2, ,$(patsubst tatoeba-%-info,%,$@))); \
	   S="${call find-srclanggroup,${patsubst tatoeba-%-info,%,$@},${PIVOT}}"; \
	   T="${call find-trglanggroup,${patsubst tatoeba-%-info,%,$@},${PIVOT}}"; \
	   ${MAKE} LANGPAIRSTR=$$s-$$t SRCLANGS="$$S" TRGLANGS="$$T" \
	     TATOEBA_SRCLANG_GROUP="`langgroup -n $$s`" \
	     TATOEBA_TRGLANG_GROUP="`langgroup -n $$t`" \
	     print-info; )
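The pattern rule splits the language-pair part of the target name on "2": it resolves the first and last language code, expands both sides to their language groups via find-srclanggroup/find-trglanggroup, and then runs print-info with those settings. A hypothetical invocation (the pair name is an example):

# make tatoeba-deu2ukr-info   ->  s=deu, t=ukr, then print-info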
## evaluate with the model-specific test set
tatoeba-%-eval:
	( s=$(firstword $(subst 2, ,$(patsubst tatoeba-%-eval,%,$@))); \