Joerg Tiedemann 2021-12-10 19:19:51 +02:00
parent a941317cef
commit 17ecdf2719
15 changed files with 3022 additions and 344 deletions

Makefile

@@ -141,40 +141,11 @@
#
#--------------------------------------------------------------------
## model-specific configuration file
MODELCONFIG = config.mk
# check and adjust lib/env.mk and lib/config.mk
include lib/env.mk
.PHONY: install
install: install-prerequisites
# If we need prerequisites, that has to happen before including e.g. config.mk
include lib/config.mk
# load model-specific configuration parameters
# if they exist in the work directory
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
include ${WORKDIR}/${MODELCONFIG}
endif
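## a work-directory config.mk typically pins the parameters of one model;
## an illustrative sketch (values are examples only) of what such a file
## might contain:
##
##   SRCLANGS  = da no sv
##   TRGLANGS  = fi da
##   MODELTYPE = transformer-align
##
## anything set there overrides the defaults from lib/config.mk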
include lib/data.mk
include lib/train.mk
include lib/test.mk
include lib/misc.mk
include lib/dist.mk
include lib/slurm.mk
include lib/allas.mk
include lib/generic.mk
include lib/langsets.mk
include lib/tasks.mk
include lib/projects.mk
.PHONY: all
all: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
@@ -183,247 +154,3 @@ all: ${WORKDIR}/${MODELCONFIG}
${MAKE} compare
${MAKE} eval-testsets
#---------------------------------------------------------------------
# run everything including backtranslation of wiki-data
#
## TODO: need to refresh backtranslate/index.html from time to time!
## ---> necessary for fetching latest wikidump with the correct link
#---------------------------------------------------------------------
.PHONY: all-and-backtranslate
all-and-backtranslate: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
-for t in ${TRGLANGS}; do \
for s in ${SRCLANGS}; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} -C backtranslate \
SRC=$$s TRG=$$t \
MODELHOME=${MODELDIR} \
MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
all; \
fi \
done \
done
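## note on MAX_SENTENCES above: the embedded shell call counts at most
## 1,000,000 lines of ${TRAINDATA_SRC}, i.e. it caps the number of sentences
## to backtranslate at min(1M, size of the training data); roughly this
## shell sketch (illustrative only):
##
##   nr=$(zcat $TRAINDATA_SRC | head -1000000 | wc -l)
##   make -C backtranslate SRC=$s TRG=$t MODELHOME=$MODELDIR MAX_SENTENCES=$nr all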
.PHONY: all-and-backtranslate-allwikis
all-and-backtranslate-allwikis: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
-for t in ${TRGLANGS}; do \
for s in ${SRCLANGS}; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} -C backtranslate SRC=$$s TRG=$$t all-wikitext; \
${MAKE} -C backtranslate \
SRC=$$s TRG=$$t \
MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
MODELHOME=${MODELDIR} \
translate-all-wikis; \
fi \
done \
done
.PHONY: all-and-backtranslate-allwikiparts
all-and-backtranslate-allwikiparts: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
-for t in ${TRGLANGS}; do \
for s in ${SRCLANGS}; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} -C backtranslate SRC=$$s TRG=$$t all-wikitext; \
${MAKE} -C backtranslate \
SRC=$$s TRG=$$t \
MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
MODELHOME=${MODELDIR} \
translate-all-wikiparts; \
fi \
done \
done
## train a model with backtranslations of wikipedia data
## (1) train a model in the opposite direction and backtranslate wikipedia data
## (2) train a model with backtranslated data
.PHONY: all-with-bt
all-with-bt:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate
${MAKE} all-bt
## train a model with backtranslations of ALL wikimedia wiki data
.PHONY: all-with-bt-all
all-with-bt-all:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate-allwikis
${MAKE} all-bt
## and now with all parts of all wikis
.PHONY: all-with-bt-allparts
all-with-bt-allparts:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate-allwikiparts
${MAKE} all-bt
## job1: submit jobs to create data, train models, backtranslate all, and train again
job1: ${WORKDIR}/${MODELCONFIG}
${MAKE} HPC_MEM=12g HPC_CORES=4 job1-step1.submitcpu
job1-step1:
${MAKE} data
${MAKE} reverse-data
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" data
-for t in ${TRGLANGS}; do \
${MAKE} -C backtranslate SRC=${SRC} TRG=$$t all-wikitext; \
done
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} job1-step2.submit${GPUJOB_SUBMIT}
job1-step2:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
all-and-backtranslate-allwikis
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} job1-step3.submit${GPUJOB_SUBMIT}
job1-step3:
${MAKE} all-bt
#------------------------------------------------------------------------
# create slurm jobs
#------------------------------------------------------------------------
.PHONY: all-job
all-job: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train-and-eval-job
.PHONY: train-job
train-job:
${MAKE} HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} train.submit${GPUJOB_SUBMIT}
.PHONY: train-and-eval-job
train-and-eval-job:
${MAKE} HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} train-and-eval.submit${GPUJOB_SUBMIT}
#------------------------------------------------------------------------
# make various data sets (and word alignment)
#------------------------------------------------------------------------
.PHONY: data
data: ${TRAINDATA_SRC} ${TRAINDATA_TRG}
${MAKE} ${DEVDATA_SRC} ${DEVDATA_TRG}
${MAKE} ${TESTDATA_SRC} ${TESTDATA_TRG}
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
ifeq ($(filter align,${subst -, ,${MODELTYPE}}),align)
${MAKE} ${TRAIN_ALG}
endif
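## the condition above splits MODELTYPE at '-' and checks for an 'align'
## token; for example (illustrative):
##
##   MODELTYPE = transformer-big-align
##   ${subst -, ,${MODELTYPE}}   -> "transformer big align"
##   $(filter align,...)         -> "align"   => word alignment is built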
traindata: ${TRAINDATA_SRC} ${TRAINDATA_TRG}
testdata: ${TESTDATA_SRC} ${TESTDATA_TRG}
devdata: ${DEVDATA_SRC} ${DEVDATA_TRG}
devdata-raw: ${DEV_SRC} ${DEV_TRG}
wordalign: ${TRAIN_ALG}
#------------------------------------------------------------------------
# train, translate and evaluate
#------------------------------------------------------------------------
## other model types
vocab: ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
train: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
translate: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
eval: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.eval
compare: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
## ensemble of models (assumes they can be found in subdirectories of WORKDIR)
translate-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}
eval-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}.eval
## combined tasks:
## train and evaluate
train-and-eval: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
${MAKE} eval-testsets
## train model and start back-translation jobs once the model is ready
## (requires creating a dist package)
train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
${MAKE} local-dist
${MAKE} -C backtranslate MODELHOME=${MODELDIR} translate-all-wikis-jobs
## OBSOLETE
# ALL_RELEASED_MODELS = ${wildcard models-tatoeba/*/*.zip}
# ALL_VOCABS_FIXED = ${patsubst %.zip,%.fixed-vocab,${ALL_RELEASED_MODELS}}
# fix-released-vocabs: ${ALL_VOCABS_FIXED}
# %.fixed-vocab: %.zip
# @( v=`unzip -l $< | grep 'vocab.yml$$' | sed 's/^.* //'`; \
# if [ "$$v" != "" ]; then \
# unzip $< $$v; \
# python3 scripts/fix_vocab.py $$v; \
# if [ -e $$v.bak ]; then \
# echo "update $$v in $<"; \
# zip $< $$v $$v.bak; \
# else \
# echo "vocab $$v is fine in $<"; \
# fi; \
# rm -f $$v $$v.bak; \
# fi )
# ALL_VOCABS_REFIXED = ${patsubst %.zip,%.refixed-vocab,${ALL_RELEASED_MODELS}}
# refix-released-vocabs: ${ALL_VOCABS_REFIXED}
# %.refixed-vocab: %.zip
# @echo "checking $<"
# @( v=`unzip -l $< | grep 'vocab.yml.bak$$' | sed 's/^.* //'`; \
# if [ "$$v" != "" ]; then \
# unzip -o $< $$v; \
# if [ `grep -v '^"' $$v | wc -l` -gt 0 ]; then \
# echo "$</$$v has items that do not start with quotes!"; \
# o=`echo $$v | sed 's/.bak//'`; \
# unzip -o $< $$o; \
# if [ `diff $$o $$v | wc -l` -gt 0 ]; then \
# mv $$o $$o.bak2; \
# mv $$v $$o; \
# zip $< $$o $$o.bak2; \
# rm -f $$o $$.bak2; \
# else \
# echo "vocabs are the same"; \
# fi \
# else \
# echo "$$v fix was fine"; \
# fi; \
# rm -f $$v; \
# fi )


@@ -5,11 +5,13 @@
#
PWD := ${shell pwd}
TOOLSDIR := ${PWD}/../tools
REPOHOME := ${PWD}/../
TOOLSDIR := ${REPOHOME}tools
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
include ../lib/env.mk
include ../lib/config.mk
include ../lib/slurm.mk
SRC ?= af
TRG ?= en


@@ -5,11 +5,13 @@
#
PWD := ${shell pwd}
TOOLSDIR := ${PWD}/../tools
REPOHOME := ${PWD}/../
TOOLSDIR := ${REPOHOME}tools
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
include ../lib/env.mk
include ../lib/config.mk
include ../lib/slurm.mk
SRC = fin
TRG = eng


@@ -1,7 +1,8 @@
include ../lib/env.mk
include ../lib/config.mk
include ../lib/slurm.mk
REPOHOME := ${PWD}/../
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
SRC = en


@@ -41,9 +41,10 @@
# --> need to adjust preprocess-scripts for those models
#
include ../lib/env.mk
include ../lib/config.mk
include ../lib/slurm.mk
REPOHOME := ${PWD}/../
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
MODEL = news


@@ -1,15 +1,40 @@
# -*-makefile-*-
#
# model configurations
# model and environment configurations
#
## some pre-defined language sets
include ${REPOHOME}lib/langsets.mk
## supported model types
## configuration for each type is in lib/train.mk
MODELTYPES = transformer \
transformer-align \
transformer-base \
transformer-base-align \
transformer-big \
transformer-big-align \
transformer-small-align \
transformer-tiny \
transformer-tiny-align
## default model type
MODELTYPE = transformer-align
NR = 1
## name of the model-specific configuration file
MODELCONFIG ?= config.mk
## various ways of setting the model languages
##
## (1) explicitly set source and target languages, for example:
## SRCLANGS="da no sv" TRGLANGS="fi da"
##
@@ -377,26 +402,6 @@ TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
MODEL_SUBDIR =
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
## supported model types
## configuration for each type is in lib/train.mk
MODELTYPES = transformer \
transformer-align \
transformer-base \
transformer-base-align \
transformer-big \
transformer-big-align \
transformer-small-align \
transformer-tiny \
transformer-tiny-align
## default model type
MODELTYPE = transformer-align
NR = 1
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
@@ -472,7 +477,20 @@ MARIAN_ENC_DEPTH ?= 6
MARIAN_DEC_DEPTH ?= 6
MARIAN_ATT_HEADS ?= 8
MARIAN_DIM_EMB ?= 512
MARIAN_CLIP_NORM ?= 5
## default = shuffle data and batches
## (set to batches or none to change this)
MARIAN_SHUFFLE ?= data
## default: use sqlite database to store data
## remove this to use regular temp data
## set to --shuffle-in-ram to keep all shuffled data in RAM
MARIAN_DATA_STORAGE ?= --sqlite
## set to global for lower memory usage in multiprocess training
## TODO: does this parameter really work?
MARIAN_SHARDING ?= local
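## all three of the settings above can be overridden on the command line;
## for example (illustrative), to shuffle only batches and skip the sqlite
## storage for a small bilingual model:
##
##   make MARIAN_SHUFFLE=batches MARIAN_DATA_STORAGE= train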
## TODO: currently marianNMT crashes with workspace > 26000 (does it?)
@@ -483,24 +501,24 @@ ifeq (${GPU},p100)
else ifeq (${GPU},a100)
ifeq ($(subst -align,,${MODELTYPE}),transformer-big)
MARIAN_WORKSPACE = 20000
MARIAN_WORKSPACE = 15000
else ifeq ($(subst -align,,${MODELTYPE}),transformer-small)
MARIAN_WORKSPACE = 10000
else ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny)
MARIAN_WORKSPACE = 10000
else
MARIAN_WORKSPACE = 30000
MARIAN_WORKSPACE = 25000
endif
else ifeq (${GPU},v100)
ifeq ($(subst -align,,${MODELTYPE}),transformer-big)
MARIAN_WORKSPACE = 20000
MARIAN_WORKSPACE = 15000
else ifeq ($(subst -align,,${MODELTYPE}),transformer-small)
MARIAN_WORKSPACE = 10000
else ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny)
MARIAN_WORKSPACE = 10000
else
MARIAN_WORKSPACE = 24000
MARIAN_WORKSPACE = 25000
endif
else
@@ -508,6 +526,10 @@ else
endif
## TODO: do we need to reduce workspace for decoding?
# MARIAN_DECODER_WORKSPACE = $$((${MARIAN_WORKSPACE} / 2))
MARIAN_DECODER_WORKSPACE = 10000
## weights associated with training examples
ifneq ("$(wildcard ${TRAIN_WEIGHTS})","")
@@ -531,7 +553,7 @@ endif
ifeq ($(GPU_AVAILABLE),1)
MARIAN_DECODER_FLAGS = -b 4 -n1 -d ${MARIAN_GPUS} --fp16 \
--quiet-translation -w ${MARIAN_WORKSPACE} \
--quiet-translation -w ${MARIAN_DECODER_WORKSPACE} \
--mini-batch 768 --maxi-batch 2048 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
else
@@ -543,6 +565,16 @@ else
endif
# load model-specific configuration parameters
# if they exist in the work directory
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
include ${WORKDIR}/${MODELCONFIG}
endif
## make some data size-specific configuration parameters
## TODO: is it OK to delete LOCAL_TRAIN data?
@@ -662,6 +694,8 @@ endif
################################################################
### DEPRECATED? ################################################
################################################################
@@ -694,4 +728,3 @@ opus-langpairs.txt:
rm -f $@.all


@@ -6,6 +6,7 @@
#
SHELL := /bin/bash
PWD ?= ${shell pwd}
# job-specific settings (overwrite if necessary)
# HPC_EXTRA: additional SBATCH commands
@@ -154,8 +155,6 @@ endif
MULTEVALHOME = ${APPLHOME}/multeval
## install prerequisites
PREREQ_TOOLS := $(lastword ${ISO639}) ${ATOOLS} ${PIGZ} ${TERASHUF} ${JQ} ${MARIAN} ${EFLOMAL} ${TMX2MOSES}
@@ -177,8 +176,8 @@ export PERL_MB_OPT := --install_base "${HOME}/perl5"
export PERL_MM_OPT := INSTALL_BASE=${HOME}/perl5
PHONY: install-prerequisites install-prereq install-requirements
install-prerequisites install-prereq install-requirements:
PHONY: install install-prerequisites install-prereq install-requirements
install install-prerequisites install-prereq install-requirements:
${PIP} install --user -r requirements.txt
${MAKE} install-perl-modules
${MAKE} ${PREREQ_TOOLS}
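## with the added 'install' alias, setting up all prerequisites presumably
## reduces to a single call from the repository root (illustrative):
##
##   make install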

lib/env/mahti.mk

@@ -43,7 +43,7 @@ GPU_MODULES = gcc/10.3.0 cuda/11.4.2 cudnn/8.0.4.30-11.0-linux-x64 openblas/0.
LOAD_CPU_ENV = module load ${CPU_MODULES}
LOAD_GPU_ENV = module load ${GPU_MODULES}
ifdef HPC_DISK
ifneq (${HPC_DISK},)
HPC_GPU_ALLOCATION = --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}
endif

lib/env/puhti.mk

@@ -30,7 +30,7 @@ GPU_MODULES = gcc/8.3.0 cuda/10.1.168 cudnn/7.6.1.34-10.1 intel-mkl/2019.0.4 pyt
LOAD_CPU_ENV = module load ${CPU_MODULES} && module list
LOAD_GPU_ENV = module load ${GPU_MODULES} && module list
ifdef HPC_DISK
ifneq (${HPC_DISK},)
HPC_GPU_ALLOCATION = --gres=gpu:${GPU}:${NR_GPUS},nvme:${HPC_DISK}
HPC_CPU_EXTRA1 = \#SBATCH --gres=nvme:${HPC_DISK}
endif


@@ -112,3 +112,56 @@ fix-spm-models:
fi; \
cd ../..; \
done
## OBSOLETE
# ALL_RELEASED_MODELS = ${wildcard models-tatoeba/*/*.zip}
# ALL_VOCABS_FIXED = ${patsubst %.zip,%.fixed-vocab,${ALL_RELEASED_MODELS}}
# fix-released-vocabs: ${ALL_VOCABS_FIXED}
# %.fixed-vocab: %.zip
# @( v=`unzip -l $< | grep 'vocab.yml$$' | sed 's/^.* //'`; \
# if [ "$$v" != "" ]; then \
# unzip $< $$v; \
# python3 scripts/fix_vocab.py $$v; \
# if [ -e $$v.bak ]; then \
# echo "update $$v in $<"; \
# zip $< $$v $$v.bak; \
# else \
# echo "vocab $$v is fine in $<"; \
# fi; \
# rm -f $$v $$v.bak; \
# fi )
# ALL_VOCABS_REFIXED = ${patsubst %.zip,%.refixed-vocab,${ALL_RELEASED_MODELS}}
# refix-released-vocabs: ${ALL_VOCABS_REFIXED}
# %.refixed-vocab: %.zip
# @echo "checking $<"
# @( v=`unzip -l $< | grep 'vocab.yml.bak$$' | sed 's/^.* //'`; \
# if [ "$$v" != "" ]; then \
# unzip -o $< $$v; \
# if [ `grep -v '^"' $$v | wc -l` -gt 0 ]; then \
# echo "$</$$v has items that do not start with quotes!"; \
# o=`echo $$v | sed 's/.bak//'`; \
# unzip -o $< $$o; \
# if [ `diff $$o $$v | wc -l` -gt 0 ]; then \
# mv $$o $$o.bak2; \
# mv $$v $$o; \
# zip $< $$o $$o.bak2; \
# rm -f $$o $$.bak2; \
# else \
# echo "vocabs are the same"; \
# fi \
# else \
# echo "$$v fix was fine"; \
# fi; \
# rm -f $$v; \
# fi )


@@ -203,6 +203,22 @@ TATOEBA_PARAMS := DATASET=${TATOEBA_DATASET} \
DEFAULT_PIVOT_LANG=${TATOEBA_PIVOT} \
MIN_BLEU_SCORE=${TATOEBA_MIN_BLEU}
MARIAN_SHUFFLE=data
MARIAN_DATA_STORAGE=--sqlite
HPC_DISK=500
## unless we have multilingual models:
## no need to shuffle data again, just shuffle batches
## no need to store data in sqlite databases
ifeq (${words ${SRCLANGS}},1)
ifeq (${words ${TRGLANGS}},1)
# TATOEBA_PARAMS += MARIAN_SHUFFLE=batches MARIAN_DATA_STORAGE= HPC_DISK=
MARIAN_SHUFFLE=batches
MARIAN_DATA_STORAGE=
HPC_DISK=
endif
endif
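## illustrative effect of the ${words ...} check above:
##
##   SRCLANGS="fin"     TRGLANGS="eng"  -> bilingual:    shuffle batches only,
##                                         no sqlite storage, no extra disk
##   SRCLANGS="fin est" TRGLANGS="eng"  -> multilingual: keep data shuffling,
##                                         sqlite storage and HPC_DISK=500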
@@ -1834,7 +1850,9 @@ KEEP_LANGIDS = bos_Cyrl cmn cnr cnr_Latn csb diq dnj dty fas fqs ful fur
nor nor_Latn oss_Latn pan plt pnb_Guru pob prs qug quw quy quz qvi rmn rmy ruk san swa swc \
syr syr_Syrc tgk_Latn thy tlh tmh toi tuk_Cyrl urd_Deva xal_Latn yid_Latn zho zlm
SKIP_LANGIDS = ${filter-out ${KEEP_LANGIDS},${TRAIN_ONLY_LANGIDS}} \
ang ara_Latn arq_Latn apc_Latn bul_Latn ell_Latn eng_Tibt eng_Zinh heb_Latn hun_Zinh nob_Hebr rus_Latn
ang ara_Latn arq_Latn apc_Latn bul_Latn ell_Latn eng_Tibt \
eng_Zinh heb_Latn hun_Zinh nob_Hebr rus_Latn \
..._Qa[ab][a-x] ..._Zinh ..._Zmth ..._Zsym ..._Zxxx ..._Zyyy ..._Zzzz
SKIP_LANGIDS_PATTERN = ^\(${subst ${SPACE},\|,${SKIP_LANGIDS}}\)$$
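## the subst above joins the space-separated SKIP_LANGIDS list into one
## anchored alternation, roughly (abbreviated, illustrative):
##
##   SKIP_LANGIDS_PATTERN = ^\(ang\|ara_Latn\|arq_Latn\|...\|rus_Latn\)$
##
## i.e. a pattern that matches complete language-ID labels, e.g. for
## filtering them out with grep -v (assumed usage)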
## modify language IDs in training data to adjust them to test sets
@@ -1847,16 +1865,16 @@ FIXLANGIDS = | sed 's/zho\(.*\)_HK/yue\1/g;s/zho\(.*\)_CN/cmn\1/g;s/zho\(.*\)_T
| sed 's/\_[A-Z][A-Z]//g' \
| sed 's/\-[a-z]*//g' \
| sed 's/\_Brai//g' \
| sed 's/bul_Latn/bul/g' \
| sed 's/jpn_[A-Za-z]*/jpn/g' \
| sed 's/kor_[A-Za-z]*/kor/g' \
| sed 's/nor_Latn/nor/g' \
| sed 's/non_Latn/non/g' \
| sed 's/nor/nob/g' \
| sed 's/bul_Latn/bul/g' \
| sed 's/syr_Syrc/syr/g' \
| sed 's/yid_Latn/yid/g' \
| perl -pe 'if (/(cjy|cmn|gan|lzh|nan|wuu|yue|zho)_([A-Za-z]{4})/){if ($$2 ne "Hans" && $$2 ne "Hant"){s/(cjy|cmn|gan|lzh|nan|wuu|yue|zho)_([A-Za-z]{4})/$$1/} }'
# | sed 's/ara_Latn/ara/;s/arq_Latn/arq/;' \
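## illustrative effect of the pipeline above on a few language IDs:
##
##   jpn_Hira -> jpn        kor_Hang -> kor
##   bul_Latn -> bul        syr_Syrc -> syr
##   nor_Latn -> nor -> nob (via two rules)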


@@ -1,2 +1,213 @@
# -*-makefile-*-
#
# recipes for specific tasks
#
include ${REPOHOME}lib/data.mk
include ${REPOHOME}lib/train.mk
include ${REPOHOME}lib/test.mk
include ${REPOHOME}lib/slurm.mk
include ${REPOHOME}lib/generic.mk
include ${REPOHOME}lib/misc.mk
include ${REPOHOME}lib/allas.mk
include ${REPOHOME}lib/dist.mk
#------------------------------------------------------------------------
# make various data sets (and word alignment)
#------------------------------------------------------------------------
.PHONY: data
data: ${TRAINDATA_SRC} ${TRAINDATA_TRG}
${MAKE} ${DEVDATA_SRC} ${DEVDATA_TRG}
${MAKE} ${TESTDATA_SRC} ${TESTDATA_TRG}
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
ifeq ($(filter align,${subst -, ,${MODELTYPE}}),align)
${MAKE} ${TRAIN_ALG}
endif
traindata: ${TRAINDATA_SRC} ${TRAINDATA_TRG}
testdata: ${TESTDATA_SRC} ${TESTDATA_TRG}
devdata: ${DEVDATA_SRC} ${DEVDATA_TRG}
devdata-raw: ${DEV_SRC} ${DEV_TRG}
wordalign: ${TRAIN_ALG}
#------------------------------------------------------------------------
# train, translate and evaluate
#------------------------------------------------------------------------
## other model types
vocab: ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
train: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
translate: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
eval: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.eval
compare: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
## ensemble of models (assumes they can be found in subdirectories of WORKDIR)
translate-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}
eval-ensemble: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.${TRG}.eval
## combined tasks:
## train and evaluate
train-and-eval: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
${MAKE} eval-testsets
## train model and start back-translation jobs once the model is ready
## (requires creating a dist package)
train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.compare
${MAKE} local-dist
${MAKE} -C backtranslate MODELHOME=${MODELDIR} translate-all-wikis-jobs
#------------------------------------------------------------------------
# create slurm jobs
#------------------------------------------------------------------------
.PHONY: all-job
all-job: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train-and-eval-job
.PHONY: train-job
train-job:
${MAKE} HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} train.submit${GPUJOB_SUBMIT}
.PHONY: train-and-eval-job
train-and-eval-job:
${MAKE} HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} train-and-eval.submit${GPUJOB_SUBMIT}
#---------------------------------------------------------------------
# run everything including backtranslation of wiki-data
#
## TODO: need to refresh backtranslate/index.html from time to time!
## ---> necessary for fetching latest wikidump with the correct link
#---------------------------------------------------------------------
.PHONY: all-and-backtranslate
all-and-backtranslate: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
-for t in ${TRGLANGS}; do \
for s in ${SRCLANGS}; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} -C backtranslate \
SRC=$$s TRG=$$t \
MODELHOME=${MODELDIR} \
MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
all; \
fi \
done \
done
.PHONY: all-and-backtranslate-allwikis
all-and-backtranslate-allwikis: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
-for t in ${TRGLANGS}; do \
for s in ${SRCLANGS}; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} -C backtranslate SRC=$$s TRG=$$t all-wikitext; \
${MAKE} -C backtranslate \
SRC=$$s TRG=$$t \
MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
MODELHOME=${MODELDIR} \
translate-all-wikis; \
fi \
done \
done
.PHONY: all-and-backtranslate-allwikiparts
all-and-backtranslate-allwikiparts: ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
${MAKE} compare
${MAKE} local-dist
-for t in ${TRGLANGS}; do \
for s in ${SRCLANGS}; do \
if [ "$$s" != "$$t" ]; then \
${MAKE} -C backtranslate SRC=$$s TRG=$$t all-wikitext; \
${MAKE} -C backtranslate \
SRC=$$s TRG=$$t \
MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
MODELHOME=${MODELDIR} \
translate-all-wikiparts; \
fi \
done \
done
## train a model with backtranslations of wikipedia data
## (1) train a model in the opposite direction and backtranslate wikipedia data
## (2) train a model with backtranslated data
.PHONY: all-with-bt
all-with-bt:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate
${MAKE} all-bt
## train a model with backtranslations of ALL wikimedia wiki data
.PHONY: all-with-bt-all
all-with-bt-all:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate-allwikis
${MAKE} all-bt
## and now with all parts of all wikis
.PHONY: all-with-bt-allparts
all-with-bt-allparts:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" all-and-backtranslate-allwikiparts
${MAKE} all-bt
## job1: submit jobs to create data, train models, backtranslate all, and train again
job1: ${WORKDIR}/${MODELCONFIG}
${MAKE} HPC_MEM=12g HPC_CORES=4 job1-step1.submitcpu
job1-step1:
${MAKE} data
${MAKE} reverse-data
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" data
-for t in ${TRGLANGS}; do \
${MAKE} -C backtranslate SRC=${SRC} TRG=$$t all-wikitext; \
done
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} job1-step2.submit${GPUJOB_SUBMIT}
job1-step2:
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
MAX_SENTENCES=${shell zcat ${TRAINDATA_SRC} | head -1000000 | wc -l} \
all-and-backtranslate-allwikis
${MAKE} SRCLANGS="${TRGLANGS}" TRGLANGS="${SRCLANGS}" \
HPC_CORES=1 HPC_MEM=${GPUJOB_HPC_MEM} job1-step3.submit${GPUJOB_SUBMIT}
job1-step3:
${MAKE} all-bt


@@ -95,13 +95,14 @@ MARIAN_TRAIN_PREREQS = ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
## define validation and early-stopping parameters
## as well as pre-requisites for training the model
## TODO: do we want to add valid-metrics "ce-mean-words" and "bleu-detok"?
ifndef SKIP_VALIDATION
MARIAN_TRAIN_PREREQS += ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
MARIAN_STOP_CRITERIA = --early-stopping ${MARIAN_EARLY_STOPPING} \
--valid-freq ${MARIAN_VALID_FREQ} \
--valid-sets ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG} \
--valid-metrics perplexity ce-mean-words bleu-detok \
--valid-metrics perplexity \
--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
--valid-max-length 100 \
--valid-log ${WORKDIR}/${MODEL}.${MODELTYPE}.valid${NR}.log \
@@ -152,21 +153,39 @@ ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny)
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 256
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --transformer-decoder-autoreg rnn \
--dec-cell ssru \
--fp16
--dec-cell ssru # --fp16
endif
## difference to student model in bergamot (tiny11):
# --transformer-dim-ffn 1536 --enc-depth 6 --transformer-ffn-activation relu
# 32000 vocab in total (tied source and target)
# --mini-batch-fit -w 9000 --mini-batch 1000 --maxi-batch 1000 --devices $GPUS --sync-sgd --optimizer-delay 2 \
# --learn-rate 0.0003 --lr-report --lr-warmup 16000 --lr-decay-inv-sqrt 32000 \
# --cost-type ce-mean-words \
# --optimizer-params 0.9 0.98 1e-09 --clip-norm 0
ifeq ($(subst -align,,${MODELTYPE}),transformer-tiny11)
MARIAN_ENC_DEPTH = 6
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 256
MARIAN_CLIP_NORM = 0
MARIAN_EXTRA += --transformer-dim-ffn 1536 \
--transformer-decoder-autoreg rnn \
--dec-cell ssru --optimizer-delay 2
# --fp16
endif
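## to train with this configuration, MODELTYPE presumably has to be set to
## transformer-tiny11 (or transformer-tiny11-align, which the -align strip
## above also matches), e.g. (illustrative):
##
##   make MODELTYPE=transformer-tiny11-align train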
ifeq ($(subst -align,,${MODELTYPE}),transformer-small)
MARIAN_ENC_DEPTH = 3
MARIAN_DEC_DEPTH = 2
MARIAN_ATT_HEADS = 8
MARIAN_DIM_EMB = 256
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --transformer-decoder-autoreg rnn \
--dec-cell ssru \
--fp16
--dec-cell ssru
# --fp16
endif
##------------------------------------------------
@@ -178,13 +197,12 @@ endif
##------------------------------------------------
ifeq ($(subst -align,,${MODELTYPE}),transformer-base)
MARIAN_TRAINING_PARAMETER = --task transformer-base
MARIAN_TRAINING_PARAMETER = --task transformer-base # --fp16
endif
ifeq ($(subst -align,,${MODELTYPE}),transformer-big)
MARIAN_TRAINING_PARAMETER = \
--task transformer-big \
--optimizer-delay 2
MARIAN_TRAINING_PARAMETER = --task transformer-big \
--optimizer-delay 2 # --fp16
GPUJOB_HPC_MEM = 16g
endif
@@ -215,7 +233,7 @@ MARIAN_TRAINING_PARAMETER ?= \
--lr-decay-inv-sqrt 16000 \
--lr-report \
--optimizer-params 0.9 0.98 1e-09 \
--clip-norm 5 \
--clip-norm ${MARIAN_CLIP_NORM} \
--sync-sgd \
--exponential-smoothing
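## the gradient clipping value is now taken from MARIAN_CLIP_NORM
## (default 5 in lib/config.mk, 0 for the tiny11 student setup) and can be
## overridden per run, e.g. (illustrative):
##
##   make MARIAN_CLIP_NORM=0 train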
@@ -262,6 +280,7 @@ endif
${MARIAN_TRAINING_PARAMETER} \
${MARIAN_EXTRA} \
${MARIAN_STOP_CRITERIA} \
${MARIAN_DATA_STORAGE} \
--workspace ${MARIAN_WORKSPACE} \
--model $(@:.done=.npz) \
--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
@@ -272,8 +291,9 @@ endif
--devices ${MARIAN_GPUS} \
--seed ${SEED} \
--tempdir ${TMPDIR} \
--shuffle ${MARIAN_SHUFFLE} \
--sharding ${MARIAN_SHARDING} \
--overwrite \
--keep-best \
--sqlite
--keep-best
touch $@


@@ -7,7 +7,9 @@
#
PWD := ${shell pwd}
TOOLSDIR := ${PWD}/../tools
REPOHOME := ${PWD}/../
TOOLSDIR := ${REPOHOME}tools
## language (SRC->TRG) pair we need
SRC = fi
@@ -27,10 +29,10 @@ INCLUDE = OpenSubtitles
ORIGINAL_LANGPAIR = ${firstword ${sort ${PIVOT} ${TRG}}}-${lastword ${sort ${PIVOT} ${TRG}}}
PIVOT_LANGPAIR = ${PIVOT}-${SRC}
include ../lib/env.mk
include ../lib/config.mk
include ../lib/slurm.mk
include ../lib/dist.mk
include ${REPOHOME}lib/env.mk
include ${REPOHOME}lib/config.mk
include ${REPOHOME}lib/slurm.mk
include ${REPOHOME}lib/dist.mk
include lib/models.mk

tatoeba/Makefile

File diff suppressed because it is too large