24x12 transformer model added

Joerg Tiedemann 2023-03-20 23:55:58 +02:00
parent fd13234eb2
commit 7174c98af4
12 changed files with 152 additions and 27 deletions

@@ -1 +1 @@
Subproject commit 899f4b6c0abc66013d0546d36a6681f69e40bcbb
Subproject commit 50ac071d6d3c85efc2aa7ab379ba1863c7322d5d


@@ -64,6 +64,8 @@ MODELTYPES = transformer \
transformer-base-align \
transformer-big \
transformer-big-align \
transformer-24x12 \
transformer-24x12-align \
transformer-small \
transformer-small-align \
transformer-tiny \

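The two new entries make transformer-24x12 (and its -align variant) selectable through the MODELTYPE variable like any other model type. The fin2eng-24x12 recipe added later in this commit uses it essentially as follows, shown here as a sketch of a direct invocation (language pair and HPC settings taken from that recipe):

    make MODELTYPE=transformer-24x12 \
         GPUJOB_HPC_CORES=4 GPUJOB_HPC_MEM=32g \
         MARIAN_WORKSPACE=15000 \
         tatoeba-fin2eng-trainjob-bt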

@@ -105,6 +105,7 @@ export TMPWORKDIR
SCRIPTDIR ?= ${REPOHOME}scripts
TOOLSDIR ?= ${REPOHOME}tools
MONITOR ?= ${shell which monitor 2>/dev/null || echo ${TOOLSDIR}/monitor}
ISO639 ?= ${shell which iso639 2>/dev/null || echo 'perl ${TOOLSDIR}/LanguageCodes/ISO-639-3/bin/iso639'}
PIGZ ?= ${shell which pigz 2>/dev/null || echo ${TOOLSDIR}/pigz/pigz}
TERASHUF ?= ${shell which terashuf 2>/dev/null || echo ${TOOLSDIR}/terashuf/terashuf}
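The new ISO639 variable (and the MONITOR variable above it) follows the same lookup-with-fallback idiom as PIGZ and TERASHUF: use the tool from PATH if it is installed, otherwise fall back to the bundled copy under ${TOOLSDIR}. In plain shell, the MONITOR lookup amounts to roughly this (paths shown are illustrative):

    MONITOR=$(which monitor 2>/dev/null || echo tools/monitor)
    echo "using monitor wrapper: ${MONITOR}"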


@@ -287,7 +287,7 @@ PIVOT_LANG ?= ${DEFAULT_PIVOT_LANG}
FT_SELECTED ?= 95
%-ftbest:
@for s in ${SRCLANGS}; do \
@-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ -e ${FORWARDTRANS_HOME}/$$s-$$t/latest ]; then \
if [ `ls ${FORWARDTRANS_HOME}/$$s-$$t/latest/ | grep "best${FT_SELECTED}.gz" | wc -l` -eq 0 ]; then \
@@ -304,7 +304,7 @@ FT_SELECTED ?= 95
${@:-ftbest=}
%-ftrawbest:
@for s in ${SRCLANGS}; do \
@-for s in ${SRCLANGS}; do \
for t in ${TRGLANGS}; do \
if [ -e ${FORWARDTRANS_HOME}/$$s-$$t/latest ]; then \
${MAKE} -C ${FORWARDTRANS_HOME} SRC=$$s TRG=$$t \

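The only change in the two loops above is the added '-' prefix on the recipe line, which makes make log a non-zero exit status and continue instead of aborting the whole target. A tiny self-contained demo of the prefix, using a throwaway makefile (file name is illustrative):

    printf 'demo:\n\t-@exit 1\n\t@echo "still reached after the ignored failure"\n' > /tmp/ignore-errors.mk
    make -f /tmp/ignore-errors.mk demo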

@@ -38,32 +38,43 @@ fineng-test-student:
HPLTLANGS = eus swa glg zho
euseng-train-tinystudent:
make SRCLANGS=eus TRGLANGS=eng train-tiny11-student
hplt-train-tinystudents:
for l in ${HPLTLANGS}; do \
make SRCLANGS=$$l TRGLANGS=eng train-tiny11-student; \
make SRCLANGS=$$l TRGLANGS=eng train-small-student; \
done
swaeng-train-tinystudent:
make SRCLANGS=swa TRGLANGS=eng train-tiny11-student
# make HPC_MEM=32g GPUJOB_HPC_MEM=32g SRCLANGS=$$l TRGLANGS=eng train-base-student; \
glgeng-train-tinystudent:
make SRCLANGS=glg TRGLANGS=eng train-tiny11-student
hplt-quantize-students:
for l in ${HPLTLANGS}; do \
make SRCLANGS=$$l TRGLANGS=eng quantize-tiny11-student; \
make SRCLANGS=$$l TRGLANGS=eng quantize-small-student; \
done
hplt-test-quantized-students:
for l in ${HPLTLANGS}; do \
make SRCLANGS=$$l TRGLANGS=eng test-quantized-tiny11-student; \
make SRCLANGS=$$l TRGLANGS=eng test-quantized-small-student; \
done
euseng-train-smallstudent:
make SRCLANGS=eus TRGLANGS=eng train-small-student
swaeng-train-smallstudent:
make SRCLANGS=swa TRGLANGS=eng train-small-student
glgeng-train-smallstudent:
make SRCLANGS=glg TRGLANGS=eng train-small-student
hplt-release-students:
for l in ${HPLTLANGS}; do \
make SRCLANGS=$$l TRGLANGS=eng release-tiny11-student; \
make SRCLANGS=$$l TRGLANGS=eng release-small-student; \
done
## generic recipes for training and testing student models
data-student:
make ${STUDENT_HPCPARAMS} FT_SELECTED=${STUDENT_CEFILTER} \
data-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba
train-student:
make ${STUDENT_HPCPARAMS} FT_SELECTED=${STUDENT_CEFILTER} \
all-job-${STUDENT_DATA}-${STUDENT_VOCAB}-tatoeba

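The generic data-student / train-student recipes simply forward to the dataset- and vocabulary-specific tatoeba targets, with FT_SELECTED taken from STUDENT_CEFILTER. Assuming the STUDENT_DATA, STUDENT_VOCAB and related variables keep the defaults defined elsewhere in this file, a single language pair would be prepared and trained with something like:

    make SRCLANGS=eus TRGLANGS=eng data-student
    make SRCLANGS=eus TRGLANGS=eng train-student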

@@ -79,6 +79,42 @@ roa2eng:
fin2eng-extended:
${MAKE} MODELTYPE=transformer-big CONTINUE_EXISTING=1 DATASET=${DATASET}+news tatoeba-fin2eng-trainjob-bt
eng2fin-extended:
${MAKE} MODELTYPE=transformer-big CONTINUE_EXISTING=1 DATASET=${DATASET}+news tatoeba-eng2fin-trainjob-bt
swe2fin-extended:
${MAKE} MODELTYPE=transformer-big CONTINUE_EXISTING=1 DATASET=${DATASET}+news tatoeba-swe2fin-trainjob-bt-pbt
fin2swe-extended:
${MAKE} MODELTYPE=transformer-big CONTINUE_EXISTING=1 tatoeba-fin2swe-trainjob-bt-pbt
fin2eng-24x12:
${MAKE} MODELTYPE=transformer-24x12 DATASET=${DATASET}+news \
GPUJOB_HPC_CORES=4 GPUJOB_HPC_MEM=32g \
GPUJOB_SUBMIT=-gpu0123 \
MARIAN_WORKSPACE=15000 tatoeba-fin2eng-trainjob-bt
fin2swe-24x12:
${MAKE} MODELTYPE=transformer-24x12 \
GPUJOB_HPC_CORES=4 GPUJOB_HPC_MEM=32g \
GPUJOB_SUBMIT=-gpu0123 \
MARIAN_WORKSPACE=15000 tatoeba-fin2swe-trainjob-bt-pbt
elg-release-models:
make MODELTYPE=transformer-big release-all-improved-models-bt
make MODELTYPE=transformer-big release-all-improved-models


@@ -1,6 +1,4 @@
#-------------------------------------------------------------------
# important secondary langs in Finland
#-------------------------------------------------------------------


@@ -150,6 +150,7 @@ ifneq ($(subst -align,,${MODELTYPE}),${MODELTYPE})
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG}
endif
ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny)
MARIAN_ENC_DEPTH = 3
MARIAN_DEC_DEPTH = 2
@@ -211,6 +212,17 @@ ifeq ($(subst -align,,${MODELTYPE}),transformer-big)
endif
ifeq ($(subst -align,,${MODELTYPE}),transformer-24x12)
MARIAN_ENC_DEPTH = 24
MARIAN_DEC_DEPTH = 12
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 2024
MARIAN_EXTRA += --optimizer-delay 2 --fp16
GPUJOB_HPC_MEM = 32g
endif
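Assuming the MARIAN_* variables above are mapped onto the usual Marian command-line options (as for the other model types in this file), the new 24x12 configuration corresponds roughly to a training call along these lines:

    # sketch only; the actual call is assembled by the Makefile and also includes model, data and vocabulary arguments
    marian --enc-depth 24 --dec-depth 12 \
           --transformer-heads 8 --dim-emb 2024 \
           --optimizer-delay 2 --fp16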
##------------------------------------------------
## set training parameters
@@ -280,7 +292,7 @@ endif
## load anyway before calling make? It is already set in the
## SLURM scripts ...
##--------------------------------------------------------------------
${LOAD_ENV} && ${MARIAN_TRAIN} \
${LOAD_ENV} && ${MONITOR} ${MARIAN_TRAIN} \
${MARIAN_TRAINING_PARAMETER} \
${MARIAN_EXTRA} \
${MARIAN_STOP_CRITERIA} \
@@ -298,7 +310,7 @@ endif
--shuffle ${MARIAN_SHUFFLE} \
--sharding ${MARIAN_SHARDING} \
--overwrite \
--keep-best
--keep-best 2>>$(@:.done=.log) 1>&2
touch $@
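Two things change in this recipe: the training command is now wrapped in ${MONITOR} (the tools/monitor script added at the end of this commit), and Marian's console output is appended to a per-model log file instead of going to the terminal. The redirection pattern works like this (command and file name are illustrative):

    some_training_command 2>> train.log 1>&2    # stderr appends to the log; stdout is then sent to wherever stderr points, i.e. the same file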


@@ -3,4 +3,4 @@
# USAGE postprocess.sh < input > output
#
sed 's/ //g;s/▁/ /g'
sed 's/ //g;s/▁/ /g;s/^ *//'
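The added s/^ *// expression strips the leading space that the SentencePiece detokenization step otherwise leaves at the beginning of each line. A quick check with a made-up input:

    echo '▁Hello ▁world !' | sed 's/ //g;s/▁/ /g;s/^ *//'
    # prints: Hello world!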


@@ -98,6 +98,11 @@ MODELNAME := ${patsubst %.zip,%,${notdir ${MODELZIP}}}
MULTI_TARGET_MODEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep 'use-target-labels' | wc -l}
ifneq (${MULTI_TARGET_MODEL},0)
TARGET_LANG_LABEL := ${shell ${WGET} -qq -O - ${MODELINFO} | grep -o '>>${TRG}.*<<' | head -1}
ifeq (${TARGET_LANG_LABEL},)
ifneq ($(wildcard ${LANGPAIR}/${MODELNAME}/*.vocab.yml),)
TARGET_LANG_LABEL := $(shell grep -o '>>${TRG}.*<<' $(wildcard ${LANGPAIR}/${MODELNAME}/*.vocab.yml) | head -1)
endif
endif
endif
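The new fallback handles models whose online model-info file does not list the target-language label: if the downloaded model directory contains a *.vocab.yml, the >>xxx<< token is taken from there instead. For TRG=eng this boils down to something like the following (the vocabulary file name is illustrative):

    grep -o '>>eng.*<<' opus.spm32k-spm32k.vocab.yml | head -1    # e.g. prints >>eng<<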
RELEASED_BITEXTS := $(patsubst %.tar,%,${shell ${WGET} -qq -O - ${TATOEBA_GITRAW}/Wiki.md | \
@@ -137,6 +142,7 @@ all: prepare
${MAKE} translate-all-parts
${MAKE} score-translations
${MAKE} sort-scored-translations
${MAKE} extract-best-translations
.PHONY: mtmodel
@@ -192,6 +198,12 @@ else
REV_TRG_PREPROCESS_ARGS = ${SRC} ${REV_LANGPAIR}/${REV_MODELNAME}/target.spm
endif
print-reverse-modelinfo:
@echo ${REV_MODELNAME}
@echo ${REV_MODELZIP}
@echo ${MODELINFO}
@echo "multi-target model: ${REV_MULTI_TARGET_MODEL}"
## score translations with reverse translation model
## normalize scores (see https://github.com/browsermt/students)


@@ -42,7 +42,8 @@ BT_WORK_CONTAINER = project-Tatoeba-MT-bt
## split size in nr-of-lines
## default part to be selected = aa
SPLIT_SIZE ?= 1000000
SPLIT_SIZE ?= 1000000
MAX_NR_OF_PARTS ?= 50
## maximum input length (number of sentence piece segments)
## maximum number of sentences to be translated (top N lines)
@@ -97,8 +98,8 @@ BITEXT_LATEST_TRG = ${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}
BITEXT_LATEST_README = ${OUTPUT_DIR}/latest/README.md
## all parts of the bitext
PARTS = $(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}})
## all parts of the bitext (restricted to a specific max number of parts)
PARTS = $(wordlist 1,${MAX_NR_OF_PARTS},$(subst .,,${suffix ${basename ${wildcard ${BITEXT_PRE:${PART}.gz=}??.gz}}}))
ALL_BITEXT_LATEST_SRC = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${SRC}.gz,${PARTS}}
ALL_BITEXT_LATEST_TRG = ${patsubst %,${OUTPUT_DIR}/latest/Tatoeba-train.${PIVOT}-${SRC}-${TRG}.%.${TRG}.gz,${PARTS}}
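PARTS is now clipped with $(wordlist ...), so at most MAX_NR_OF_PARTS split files are picked up for translation. A throwaway makefile shows the clipping behaviour of that function (file name and values are illustrative):

    printf 'PARTS := aa ab ac ad\nMAX := 2\ndemo:\n\t@echo $(wordlist 1,$(MAX),$(PARTS))\n' > /tmp/wordlist.mk
    make -f /tmp/wordlist.mk demo    # prints: aa ab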
@@ -140,7 +141,7 @@ ${MODELDIR}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
${WGET} -O ${dir $@}/model.zip ${MODELZIP}
cd ${dir $@} && unzip model.zip
cd ${dir $@} && unzip -n model.zip
rm -f ${dir $@}/model.zip
mv ${dir $@}/preprocess.sh ${dir $@}/preprocess-old.sh
sed 's#perl -C -pe.*$$#perl -C -pe "s/(?!\\n)\\p{C}/ /g;" |#' \

tools/monitor (new executable file, 52 lines)

@@ -0,0 +1,52 @@
#!/usr/bin/bash
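# Wrap an arbitrary command with basic resource monitoring:
#  - run it under GNU time -v to record runtime and peak memory use
#  - if nvidia-smi is present, sample GPU load, memory and power draw once
#    per second and report NVML energy counters before and after the run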
energy_counter() {
python3 << END
import sys
from pynvml import (
nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
nvmlDeviceGetTotalEnergyConsumption, nvmlShutdown
)
nvmlInit()
deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
handle = nvmlDeviceGetHandleByIndex(i)
energy = nvmlDeviceGetTotalEnergyConsumption(handle)
print(f" energy counter GPU {i}: {energy} mJ", file=sys.stderr)
nvmlShutdown()
END
}
COMMAND=$@
TIME=$(which time || echo "time")
NVIDIA_GPU_QUERY=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,power.draw,memory.total,memory.free,memory.used
tmpfile=$(mktemp)
if command -v nvidia-smi &> /dev/null
then
nvidia-smi --query-gpu=${NVIDIA_GPU_QUERY} --format=csv -l 1 > ${tmpfile}.gpu &
echo " - energy-comsumption counter (start): " >&2
energy_counter
fi
${TIME} -v -o ${tmpfile} "$@"
echo " - resources used according to time:" >&2
cat ${tmpfile} >&2
rm -f ${tmpfile}
if command -v nvidia-smi &> /dev/null
then
kill %1
echo " - energy-comsumption counter (end): " >&2
energy_counter
echo " - GPU utlization:" >&2
cat ${tmpfile}.gpu >&2
rm -f ${tmpfile}.gpu
fi
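Example invocation of the wrapper (the wrapped command is arbitrary); the time(1) summary and, on GPU nodes, the nvidia-smi samples and energy counters are written to stderr:

    tools/monitor sleep 5          # trivial smoke test
    tools/monitor marian ...       # or wrap a full training command, as the training recipe above now does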