small changes

Joerg Tiedemann 2022-02-01 15:05:31 +02:00
parent 7de13c11f0
commit 39f6b1cec6
6 changed files with 61 additions and 81 deletions


@@ -445,7 +445,7 @@ TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
# MODEL_SUBDIR =
# MODEL_VARIANT =
MODEL = ${MODEL_SUBDIR}${DATASET}${MODEL_VARIANT}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
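For clarity, a minimal naming sketch showing how swapping TRAINSIZE and MODEL_VARIANT changes the composed model name; all values below are hypothetical placeholders, not taken from this commit:

# hypothetical values, naming illustration only
DATASET       = opus
MODEL_VARIANT = -small
TRAINSIZE     = -1m
PRE_SRC       = spm32k
PRE_TRG       = spm32k

$(info old order: ${DATASET}${MODEL_VARIANT}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG})
$(info new order: ${DATASET}${TRAINSIZE}${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG})

With these placeholders the name changes from opus-small-1m.spm32k-spm32k to opus-1m-small.spm32k-spm32k, i.e. the variant tag now follows the training-size tag.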
@@ -497,21 +497,23 @@ endif
## latest model with the same pre-processing but any data or modeltype
## except for models that include the string 'tuned4' (fine-tuned models)
## also allow models that are of the same type but with/without guided alignment
## --> this will be used if the flag CONTINUE_EXISTING is switched on
# find the latest model that has the same modeltype/modelvariant with or without guided alignment
# to be used if the flag CONTINUE_EXISTING is set to 1
# - without guided alignment (remove if part of the current): ${subst -align,,${MODELTYPE}}
# - with guided alignment (remove and add again): ${subst -align,,${MODELTYPE}}-align
#
# Don't use the ones that are tuned for a specific language pair or domain!
ifeq (${CONTINUE_EXISTING},1)
MODEL_LATEST = $(firstword \
${shell ls -t ${WORKDIR}/*.${PRE_SRC}-${PRE_TRG}.*.model[0-9].npz \
${WORKDIR}/*.${PRE_SRC}-${PRE_TRG}.*.best-perplexity.npz \
2>/dev/null | grep -v 'tuned4' | \
egrep '${MODELTYPE}|${MODELTYPE}-align|${subst -align,,${MODELTYPE}}' })
${shell ls -t ${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.model[0-9].npz \
${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.model[0-9].npz \
${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}.best-perplexity.npz \
${WORKDIR}/*${MODEL_VARIANT}.${PRE_SRC}-${PRE_TRG}.${subst -align,,${MODELTYPE}}-align.best-perplexity.npz \
2>/dev/null | grep -v 'tuned4' })
MODEL_LATEST_VOCAB = $(shell echo "${MODEL_LATEST}" | \
sed 's|\.${PRE_SRC}-${PRE_TRG}\..*$$|.${PRE_SRC}-${PRE_TRG}.vocab.yml|')
MODEL_LATEST_OPTIMIZER = $(shell echo "${MODEL_LATEST}" | \
sed 's|.best-perplexity.npz|.optimizer.npz|')
MARIAN_EARLY_STOPPING = 15
endif
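As a rough sketch of what the new lookup matches: stripping and re-adding the -align suffix makes both the plain and the guided-alignment checkpoints of the same model type eligible. The values below are assumptions for illustration only and are not defined in this hunk:

# hypothetical values, pattern illustration only
MODELTYPE = transformer-align
PRE_SRC   = spm32k
PRE_TRG   = spm32k
BASETYPE  = ${subst -align,,${MODELTYPE}}

$(info plain:   *.${PRE_SRC}-${PRE_TRG}.${BASETYPE}.model[0-9].npz)
$(info aligned: *.${PRE_SRC}-${PRE_TRG}.${BASETYPE}-align.model[0-9].npz)

The lookup only takes effect when a run is started with CONTINUE_EXISTING=1 on the make command line.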


@@ -89,6 +89,13 @@ ifneq (${USE_FORWARDTRANS_SELECTED},)
FORWARDTRANS_TRG += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${TRGEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}}
endif
## selected by "raw" (unnormalised) scores
ifneq (${USE_FORWARDTRANS_SELECTED_RAW},)
FORWARDTRANS_SRC += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.rawbest${USE_FORWARDTRANS_SELECTED_RAW}.gz}}
FORWARDTRANS_TRG += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${TRGEXT}.rawbest${USE_FORWARDTRANS_SELECTED_RAW}.gz}}
endif
# forward translations of monolingual data (source-to-target)
ifeq (${USE_FORWARDTRANSMONO},1)
FORWARDTRANSMONO_SRC = ${sort ${wildcard ${BACKTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.gz}}
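For the new raw-score branch above, a small sketch of the file pattern that gets collected; FORWARDTRANS_HOME and the language codes below are hypothetical placeholders:

# hypothetical values, pattern illustration only
FORWARDTRANS_HOME = /path/to/forward-translations
USE_FORWARDTRANS_SELECTED_RAW = 95

$(info ${FORWARDTRANS_HOME}/fin-eng/latest/*.fin.rawbest${USE_FORWARDTRANS_SELECTED_RAW}.gz)
$(info ${FORWARDTRANS_HOME}/fin-eng/latest/*.eng.rawbest${USE_FORWARDTRANS_SELECTED_RAW}.gz)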

lib/env/puhti.mk

@@ -5,8 +5,8 @@
CSCPROJECT = project_2003288
# CSCPROJECT = project_2002688
# CSCPROJECT = project_2003288
CSCPROJECT = project_2002688
# CSCPROJECT = project_2000309
# CSCPROJECT = project_2002982
WORKHOME = ${shell realpath ${PWD}/work}


@@ -240,33 +240,9 @@ listallmodels:
## include all backtranslation data as well in training
## start from the pre-trained opus model if it exists
BT_MODEL = ${MODEL_SUBDIR}${DATASET}+bt${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
BT_MODEL_BASE = ${BT_MODEL}.${MODELTYPE}.model${NR}
BT_MODEL_START = ${WORKDIR}/${BT_MODEL_BASE}.npz
BT_MODEL_VOCAB = ${WORKDIR}/${BT_MODEL}.vocab.yml
BT_MARIAN_EARLY_STOPPING = 15
BT_CONTINUE_EXISTING = 1
# %-add-backtranslations:
%-bt:
ifneq (${wildcard ${MODEL_FINAL}},)
ifeq (${wildcard ${BT_MODEL_START}},)
cp ${MODEL_FINAL} ${BT_MODEL_START}
ifeq (${wildcard ${BT_MODEL_VOCAB}},)
cp ${MODEL_VOCAB} ${BT_MODEL_VOCAB}
endif
endif
endif
${MAKE} DATASET=${DATASET}+bt \
USE_BACKTRANS=1 \
CONTINUE_EXISTING=${BT_CONTINUE_EXISTING} \
MARIAN_EARLY_STOPPING=${BT_MARIAN_EARLY_STOPPING} \
${@:-bt=}
# MODELCONFIG=config-bt.mk \
${MAKE} DATASET=${DATASET}+bt USE_BACKTRANS=1 ${@:-bt=}
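A hedged usage sketch for the backtranslation goals: the %-bt pattern rule is meant to be invoked by appending -bt to an existing goal; the base goal name train below is an assumption:

# hypothetical invocation (the base goal `train` is an assumption):
#
#   make SRCLANGS=fin TRGLANGS=eng train-bt
#
# if a final plain model exists, it is copied as the starting checkpoint and
# training restarts with DATASET=${DATASET}+bt and USE_BACKTRANS=1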
## adding a pivot language to the model
@@ -291,36 +267,11 @@ PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG}
${@:-pivotlang=}; \
fi
# MODELCONFIG=${MODELCONFIG:.mk=+${PIVOT_LANG}.mk} \
## add forward translations
FT_MODEL = ${MODEL_SUBDIR}${DATASET}+ft${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
FT_MODEL_BASE = ${FT_MODEL}.${MODELTYPE}.model${NR}
FT_MODEL_START = ${WORKDIR}/${FT_MODEL_BASE}.npz
FT_MODEL_VOCAB = ${WORKDIR}/${FT_MODEL}.vocab.yml
FT_MARIAN_EARLY_STOPPING = 15
%-ft:
ifneq (${wildcard ${MODEL_FINAL}},)
ifeq (${wildcard ${FT_MODEL_START}},)
cp ${MODEL_FINAL} ${FT_MODEL_START}
ifeq (${wildcard ${FT_MODEL_VOCAB}},)
cp ${MODEL_VOCAB} ${FT_MODEL_VOCAB}
endif
endif
endif
${MAKE} DATASET=${DATASET}+ft \
USE_FORWARDTRANS=1 \
CONTINUE_EXISTING=1 \
MARIAN_EARLY_STOPPING=${FT_MARIAN_EARLY_STOPPING} \
${@:-ft=}
# MODELCONFIG=config-ft.mk \
${MAKE} DATASET=${DATASET}+ft USE_FORWARDTRANS=1 ${@:-ft=}
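The forward-translation goals work the same way; a hedged usage sketch, again assuming train as the base goal:

# hypothetical invocation (the base goal `train` is an assumption):
#
#   make SRCLANGS=fin TRGLANGS=eng train-ft
#
# restarts training with DATASET=${DATASET}+ft, USE_FORWARDTRANS=1 and
# CONTINUE_EXISTING=1, reusing the final plain model as the starting point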
# use a selected set of forward translations
@@ -340,8 +291,18 @@ FT_SELECTED ?= 95
USE_FORWARDTRANS_SELECTED=${FT_SELECTED} \
${@:-ftbest=}
%-ftrawbest:
@for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ -e ${FORWARDTRANS_HOME}/$$s-$$t/latest ]; then \
${MAKE} -C ${FORWARDTRANS_HOME} SRC=$$s TRG=$$t \
RETAIN=${FT_SELECTED} extract-rawbest-translations; \
fi \
done \
done
${MAKE} DATASET=${DATASET}+ftraw${FT_SELECTED} \
USE_FORWARDTRANS_SELECTED_RAW=${FT_SELECTED} \
${@:-ftrawbest=}
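A hedged usage sketch for the new raw-score selection goal; only the FT_SELECTED default (95) comes from this file, the rest of the invocation is an assumption:

# hypothetical invocation (the base goal `train` is an assumption):
#
#   make FT_SELECTED=90 SRCLANGS=fin TRGLANGS=eng train-ftrawbest
#
# this first extracts the best raw-scored translations per language pair and
# then trains on DATASET=${DATASET}+ftraw90 with USE_FORWARDTRANS_SELECTED_RAW=90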
## add forward translation of monolingual data


@ -3,6 +3,7 @@
#
STUDENT_DATA = ftbest-nopar
STUDENT_CEFILTER = 95
STUDENT_VOCAB = separate-spm
# STUDENT_VOCAB = joint-spm
@@ -41,37 +42,46 @@ fineng-test-student:
train-student:
make ${STUDENT_HPCPARAMS} FT_SELECTED=${STUDENT_CEFILTER} \
all-job-ftbest-nopar-${STUDENT_VOCAB}-tatoeba
all-job-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba
test-student:
make FT_SELECTED=${STUDENT_CEFILTER} HPC_MEM=20g WALLTIME=2 \
eval-ftbest-nopar-${STUDENT_VOCAB}-tatoeba.submit \
eval-testsets-ftbest-nopar-${STUDENT_VOCAB}-tatoeba.submit
eval-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba.submit \
eval-testsets-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba.submit
release-student:
make ${STUDENT_HPCPARAMS} FT_SELECTED=${STUDENT_CEFILTER} \
release-ftbest-nopar-${STUDENT_VOCAB}-tatoeba
release-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba
quantizeonly-student:
make FT_SELECTED=${STUDENT_CEFILTER} quantize-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba
quantize-student:
make FT_SELECTED=${STUDENT_CEFILTER} HPC_MEM=20g WALLTIME=2 \
lexical-shortlist-ftbest-nopar-${STUDENT_VOCAB}-tatoeba \
quantize-ftbest-nopar-${STUDENT_VOCAB}-tatoeba \
quantize-alphas-ftbest-nopar-${STUDENT_VOCAB}-tatoeba
lexical-shortlist-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba \
quantize-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba \
quantize-alphas-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba
quantize-finetuned-student:
make FT_SELECTED=${STUDENT_CEFILTER} HPC_MEM=20g WALLTIME=2 \
quantize-tuned-alphas-ftbest-nopar-${STUDENT_VOCAB}-tatoeba
quantize-tuned-alphas-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba
test-quantized-student:
make FT_SELECTED=${STUDENT_CEFILTER} HPC_MEM=20g WALLTIME=2 \
test-intgemm8-all-ftbest-nopar-${STUDENT_VOCAB}-tatoeba \
test-intgemm8-all-shortlist-ftbest-nopar-${STUDENT_VOCAB}-tatoeba
test-intgemm8-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba \
test-intgemm8-shortlist-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba
test-quantized-finetuned-student:
test-quantized-all-student:
make FT_SELECTED=${STUDENT_CEFILTER} HPC_MEM=20g WALLTIME=2 \
test-intgemm8-alltuned-ftbest-nopar-${STUDENT_VOCAB}-tatoeba \
test-intgemm8-alltuned-shortlist-ftbest-nopar-${STUDENT_VOCAB}-tatoeba
test-intgemm8-all-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba \
test-intgemm8-all-shortlist-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba
test-quantized-all-finetuned-student:
make FT_SELECTED=${STUDENT_CEFILTER} HPC_MEM=20g WALLTIME=2 \
test-intgemm8-alltuned-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba \
test-intgemm8-alltuned-shortlist-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba
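For reference, a small sketch of how the new variables expand into the actual target names; the values are the defaults defined at the top of this file:

# defaults copied from above, name-composition illustration only
STUDENT_DATA  = ftbest-nopar
STUDENT_VOCAB = separate-spm

$(info train:   all-job-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba)
$(info eval:    eval-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba.submit)
$(info release: release-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba)

Changing STUDENT_DATA or STUDENT_VOCAB therefore switches every student target consistently, which is the point of factoring the names out.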


@@ -21,7 +21,7 @@ endif
## (most of them relate to CPU resources like MEM, CORES, ...)
## typically we assume single-node jobs, which can still have multiple GPUs!
GPUJOB_HPC_QUEUE ?= ${HPC_GPUQUEUE}
GPUJOB_HPC_MEM ?= 4g
GPUJOB_HPC_MEM ?= 8g
GPUJOB_HPC_NODES ?= 1
GPUJOB_HPC_CORES ?= 1
GPUJOB_HPC_THREADS ?= ${GPUJOB_HPC_CORES}
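Since all of these use ?=, they only set defaults and can be overridden per invocation; a hedged example (the goal name is an assumption, the .submit suffix is the one used elsewhere in this repository):

# hypothetical override on the command line:
#
#   make GPUJOB_HPC_MEM=16g GPUJOB_HPC_CORES=4 train.submit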
@@ -111,7 +111,7 @@ endif
echo '${HPC_CPU_EXTRA1}' >> $@
echo '${HPC_CPU_EXTRA2}' >> $@
echo '${HPC_CPU_EXTRA3}' >> $@
echo '${LOAD_GPU_ENV}' >> $@
echo '${LOAD_CPU_ENV}' >> $@
echo 'cd $${SLURM_SUBMIT_DIR:-.}' >> $@
echo 'pwd' >> $@
echo 'echo "Starting at `date`"' >> $@
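For context, LOAD_CPU_ENV and LOAD_GPU_ENV are make variables holding the shell command that the generated batch script runs to set up its environment, so the CPU job template should echo the CPU variant. A hypothetical definition (module names are placeholders, not the actual lib/env/* settings):

# hypothetical environment hooks, placeholders only
LOAD_CPU_ENV = module load gcc intel-mkl
LOAD_GPU_ENV = module load gcc cuda cudnn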