diff --git a/Makefile b/Makefile
index 5c4214f9..ebdb50ba 100644
--- a/Makefile
+++ b/Makefile
@@ -378,3 +378,22 @@ train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
 	${MAKE} local-dist
 	${MAKE} -C backtranslate MODELHOME=${MODELDIR} translate-all-wikis-jobs
 
+
+ALL_RELEASED_MODELS = ${wildcard models-tatoeba/*/*.zip}
+ALL_VOCABS_FIXED = ${patsubst %.zip,%.fixed-vocab,${ALL_RELEASED_MODELS}}
+
+fix-released-vocabs: ${ALL_VOCABS_FIXED}
+
+%.fixed-vocab: %.zip
+	@( v=`unzip -l $< | grep 'vocab.yml$$' | sed 's/^.* //'`; \
+	   if [ "$$v" != "" ]; then \
+	     unzip $< $$v; \
+	     python3 scripts/fix_vocab.py $$v; \
+	     if [ -e $$v.bak ]; then \
+	       echo "update $$v in $<"; \
+	       zip $< $$v $$v.bak; \
+	     else \
+	       echo "vocab $$v is fine in $<"; \
+	     fi; \
+	     rm -f $$v $$v.bak; \
+	   fi )
diff --git a/lib/allas.mk b/lib/allas.mk
index 8b912b21..b28a377a 100644
--- a/lib/allas.mk
+++ b/lib/allas.mk
@@ -20,6 +20,9 @@ WORK_DESTDIR ?= ${WORKHOME}
 WORK_CONTAINER ?= OPUS-MT-train_${notdir ${WORKHOME}}-${WHOAMI}
 WORK_CONTAINER_JT ?= OPUS-MT-train_${notdir ${WORKHOME}}-tiedeman
 
+ALLAS_STORAGE_URL = https://object.pouta.csc.fi/
+
+
 ## store workdir on allas
 store:
 	cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --nc --follow-links --override ${LANGPAIRSTR}
@@ -43,3 +46,41 @@ fetch-data:
 	mkdir -p ${WORK_DESTDIR}
 	cd ${WORK_DESTDIR} && a-get ${WORK_CONTAINER}/data.tar
 
+
+
+## generic recipe for storing work data and removing it from the file system
+## DANGEROUS --- this really deletes the data!
+## NOTE: also makes the container world-readable (see the swift post command)
+##  --> this makes it easier to fetch things without login credentials
+##  --> should not store sensitive data here!
+%.stored: %
+	if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \
+	  b=OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}; \
+	  cd $(dir $@); \
+	  a-put -b $$b --nc --follow-links --override $(notdir $<); \
+	  rm -fr $(notdir $<); \
+	  touch $(notdir $@); \
+	  rm -f $(notdir $(@:.stored=.fetched)); \
+	  swift post $$b --read-acl ".r:*"; \
+	fi
+
+
+## TODO: fetch with wget instead of using a-commands
+## fetch work data from allas
+%.fetched:
+	if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \
+	  cd $(dir $@); \
+	  a-get OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/$(notdir $(@:.fetched=.tar)); \
+	  touch $(notdir $@); \
+	  rm -f $(notdir $(@:.fetched=.stored)); \
+	fi
+
+## another way of fetching work data
+## requires setting SRCLANGS and TRGLANGS (or LANGPAIRSTR directly)
+work-%/${LANGPAIRSTR}:
+	mkdir -p $(dir $@)
+	cd $(dir $@) && a-get OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/${LANGPAIRSTR}.tar
+
+
+UPLOAD_MODELS=$(patsubst %,%.stored,${wildcard work-tatoeba/[dg-rt-z]*})
+upload-workfiles: ${UPLOAD_MODELS}
diff --git a/lib/config.mk b/lib/config.mk
index 1befd5da..9eb54686 100644
--- a/lib/config.mk
+++ b/lib/config.mk
@@ -281,8 +281,6 @@ TUNE_GPUJOB_SUBMIT ?=
 
 
 
-
-
 ## existing projects in WORKHOME
 ALL_LANG_PAIRS := ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
 ALL_BILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
@@ -293,6 +291,8 @@ ALL_MULTILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep
 ## pre-processing and vocabulary
 ##----------------------------------------------------------------------------
 
+## type of subword segmentation (bpe|spm)
+## model size (NOTE: BPESIZE is also used for sentencepiece!)
 SUBWORDS ?= spm
 BPESIZE ?= 32000
 SRCBPESIZE ?= ${BPESIZE}
@@ -306,10 +306,12 @@ BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-mode
 SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
 SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
 
+## don't delete BPE/sentencepiece models!
 .PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
 .PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
 
-
+## size of the joined vocabulary
+## TODO: adding 1,000 to cover language labels is a bit ad hoc
 VOCABSIZE ?= $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
 
 ## for document-level models
@@ -353,7 +355,7 @@ WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
 MODELDIR = ${WORKHOME}/models/${LANGPAIRSTR}
 SPMDIR = ${WORKHOME}/SentencePieceModels
 
-## data sets
+## train data sets (and word alignment for the guided alignment option)
 TRAIN_BASE = ${WORKDIR}/train/${DATASET}
 TRAIN_SRC = ${TRAIN_BASE}.src
 TRAIN_TRG = ${TRAIN_BASE}.trg
@@ -364,7 +366,7 @@ LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
 LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
 LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
 
-
+## dev and test data
 DEV_SRC ?= ${WORKDIR}/val/${DEVSET_NAME}.src
 DEV_TRG ?= ${WORKDIR}/val/${DEVSET_NAME}.trg
@@ -372,8 +374,15 @@ TEST_SRC ?= ${WORKDIR}/test/${TESTSET_NAME}.src
 TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
 
+## model basename and optional sub-dir
+
 MODEL_SUBDIR =
 MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
+
+
+## supported model types
+## configuration for each type is in lib/train.mk
+
 MODELTYPES = transformer \
 	transformer-big \
 	transformer-align \
diff --git a/lib/generic.mk b/lib/generic.mk
index b3d6a5df..060af28c 100644
--- a/lib/generic.mk
+++ b/lib/generic.mk
@@ -372,6 +372,12 @@ endif
 		${@:-pivot=}
 
 
+%-big-align:
+	${MAKE} PRE_TRAINED_MODEL=${MODEL_FINAL} \
+		MODELTYPE=transformer-big-align \
+		${@:-big-align=}
+
+
 ## run a multigpu job (2 or 4 GPUs)
diff --git a/lib/projects/distill.mk b/lib/projects/distill.mk
index c81ceaea..05434fbf 100644
--- a/lib/projects/distill.mk
+++ b/lib/projects/distill.mk
@@ -74,6 +74,37 @@ afreng-bt-tiny:
 
 
 
+afreng:
+	make TATOEBA_VERSION=v2020-07-28 \
+	     SRCLANGS=afr TRGLANGS=eng \
+	     all-job-tatoeba
+
+
+afreng-small:
+	make TATOEBA_VERSION=v2020-07-28 \
+	     BT_CONTINUE_EXISTING=0 \
+	     SRCLANGS=afr TRGLANGS=eng \
+	     MODELTYPE=transformer-small-align \
+	     MARIAN_WORKSPACE=10000 \
+	     all-job-tatoeba
+
+afreng-tiny:
+	make TATOEBA_VERSION=v2020-07-28 \
+	     BT_CONTINUE_EXISTING=0 \
+	     SRCLANGS=afr TRGLANGS=eng \
+	     MODELTYPE=transformer-tiny-align \
+	     MARIAN_WORKSPACE=10000 \
+	     all-job-tatoeba
+
+afreng-small-eval:
+	make TATOEBA_VERSION=v2020-07-28 \
+	     BT_CONTINUE_EXISTING=0 \
+	     SRCLANGS=afr TRGLANGS=eng \
+	     MODELTYPE=transformer-small-align \
+	     MARIAN_WORKSPACE=10000 \
+	     eval-tatoeba
+
+
 
 
 
diff --git a/lib/train.mk b/lib/train.mk
index 9ebb0ed4..7823db2f 100644
--- a/lib/train.mk
+++ b/lib/train.mk
@@ -6,35 +6,26 @@
 #------------------------------------------------------------------------
 
 
+
+## extract vocabulary from the sentence piece models
+
+${WORKDIR}/${MODEL}.src.vocab: ${SPMSRCMODEL}
+	cut -f1 < $<.vocab > $@
+ifeq (${USE_TARGET_LABELS},1)
+	echo "${TARGET_LABELS}" | tr ' ' "\n" >> $@
+endif
+
+${WORKDIR}/${MODEL}.trg.vocab: ${SPMTRGMODEL}
+	cut -f1 < $<.vocab > $@
+
+
 ifeq (${SUBWORDS},spm)
 
 ## make vocabulary from the source and target language specific
 ## sentence piece models (concatenate and yamlify)
-## TODO: verify that this becomes valid YAML!
-${MODEL_VOCAB}: ${SPMSRCMODEL} ${SPMTRGMODEL}
-ifneq (${MODEL_LATEST_VOCAB},)
-ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
-	cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
-endif
-else
-	cut -f1 < ${word 1,$^}.vocab > ${@:.vocab.yml=.src.vocab}
-	cut -f1 < ${word 2,$^}.vocab > ${@:.vocab.yml=.trg.vocab}
-ifeq (${USE_TARGET_LABELS},1)
-	echo "${TARGET_LABELS}" | tr ' ' "\n" >> ${@:.vocab.yml=.src.vocab}
-endif
-	cat ${@:.vocab.yml=.src.vocab} ${@:.vocab.yml=.trg.vocab} | \
-	sort -u | scripts/vocab2yaml.py > $@
-
-## old buggy style ...
-#	cat ${@:.vocab.yml=.src.vocab} ${@:.vocab.yml=.trg.vocab} | \
-#	sort -u | nl -v 0 | sed 's/^ *//'> $@.numbered
-#	cut -f1 $@.numbered > $@.ids
-#	cut -f2 $@.numbered | sed 's/\\/\\\\/g;s/\"/\\\"/g;s/^\(.*\)$$/"\1"/;s/$$/:/'> $@.tokens
-#	paste -d ' ' $@.tokens $@.ids > $@
-#	rm -f $@.tokens $@.ids $@.numbered
-
-endif
+${WORKDIR}/${MODEL}.vocab.yml: ${WORKDIR}/${MODEL}.src.vocab ${WORKDIR}/${MODEL}.trg.vocab
+	cat $^ | sort -u | scripts/vocab2yaml.py > $@
 
 else
@@ -42,12 +33,12 @@ else
 ## - no new vocabulary is created if the file already exists!
 ## - need to delete the file if you want to create a new one!
 
-${MODEL_VOCAB}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
-		${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
-ifeq ($(wildcard ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}),)
-ifneq (${MODEL_LATEST_VOCAB},)
-ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
-	cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
+${WORKDIR}/${MODEL}.vocab.yml: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
+		${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
+ifeq ($(wildcard $@),)
+ifneq ($(wildcard ${MODEL_LATEST_VOCAB}),)
+ifneq (${MODEL_LATEST_VOCAB},$@)
+	cp ${MODEL_LATEST_VOCAB} $@
 endif
 else
 	mkdir -p ${dir $@}
@@ -59,35 +50,12 @@ else
 	@echo "WARNING! Delete the file if you want to start from scratch!"
 	touch $@
 endif
-
 endif
-
-## if USE_SPM_VOCAB is set:
-## get separate source and target language vocabularies
-## from the two individual sentence piece models
-
-ifeq ($(USE_SPM_VOCAB),1)
-${MODEL_SRCVOCAB}: ${SPMSRCMODEL}
-	cut -f1 < $<.vocab > $@
-ifeq (${USE_TARGET_LABELS},1)
-	echo "${TARGET_LABELS}" | tr ' ' "\n" >> $@
-endif
-
-${MODEL_TRGVOCAB}: ${SPMTRGMODEL}
-	cut -f1 < $<.vocab > $@
-endif
-
-
-
-
 print-latest:
-ifneq (${wildcard ${MODEL_LATEST}},)
-ifeq (${wildcard ${MODEL_START}},)
-	@echo "cp ${MODEL_LATEST} ${MODEL_START}"
-endif
-endif
+	@echo "latest model: ${MODEL_LATEST}"
+	@echo "start model: ${MODEL_START}"
 
 
@@ -100,15 +68,12 @@ endif
 MARIAN_MODELS_DONE = ${patsubst %,${WORKDIR}/${MODEL}.%.model${NR}.done,${MODELTYPES}}
 
 MARIAN_TRAIN_PREREQS = ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
-			${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
+			${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
+			$(sort ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB})
 
 ## define validation and early-stopping parameters
 ## as well as pre-requisites for training the model
-##
-## NEW: take away dependency on ${MODEL_VOCAB}
-##      (will be created by marian if it does not exist)
-## TODO: should we create the dependency again?
 
 ifndef SKIP_VALIDATION
   MARIAN_TRAIN_PREREQS += ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
@@ -137,9 +102,15 @@ else
 endif
 
 
+# start weights with a pre-trained model
+
+ifneq (${wildcard ${PRE_TRAINED_MODEL}},)
+  MARIAN_EXTRA += --pretrained-model ${PRE_TRAINED_MODEL}
+endif
+
 ## dependencies and extra parameters
-## for models with guided alignment
+## for different models and guided alignment
 
 ifeq (${MODELTYPE},transformer-align)
   MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
@@ -176,7 +147,6 @@ endif
 ifeq (${MODELTYPE},transformer-big-align)
   MARIAN_ENC_DEPTH = 12
   MARIAN_ATT_HEADS = 16
-  MARIAN_DIM_EMB = 1024
   MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
   MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG}
   GPUJOB_HPC_MEM = 16g
@@ -185,10 +155,10 @@ endif
 ifeq (${MODELTYPE},transformer-big)
   MARIAN_ENC_DEPTH = 12
   MARIAN_ATT_HEADS = 16
-  MARIAN_DIM_EMB = 1024
   GPUJOB_HPC_MEM = 16g
 endif
+# MARIAN_DIM_EMB = 1024
 
 
 ## finally: recipe for training transformer model
@@ -200,19 +170,13 @@ ${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS}
 ## (check lib/config.mk to see how the latest model is found)
 ##--------------------------------------------------------------------
 ifeq (${wildcard ${MODEL_START}},)
-ifneq (${MODEL_LATEST},)
-ifneq (${MODEL_LATEST_VOCAB},)
-ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
-	cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
-endif
+ifneq (${wildcard ${MODEL_LATEST}},)
 ifneq (${MODEL_LATEST},${MODEL_START})
 	cp ${MODEL_LATEST} ${MODEL_START}
 endif
 endif
-endif
 ##--------------------------------------------------------------------
-	${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
 	${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
 		${MARIAN_STOP_CRITERIA} \
 		--model $(@:.done=.npz) \
diff --git a/scripts/fix_vocab.py b/scripts/fix_vocab.py
index c6965880..5e34d0fb 100755
--- a/scripts/fix_vocab.py
+++ b/scripts/fix_vocab.py
@@ -10,7 +10,7 @@ filename = sys.argv[1]
 
 try:
     input = open(filename, 'r')
-    yaml.load(input)
+    yaml.safe_load(input)
 except:
     print('YAML file is broken - try to fix it!')
     print(f'copy {filename} to {filename}.bak')
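
Usage sketch (not part of the patch): the shell commands below mirror what the new %.fixed-vocab recipe in the Makefile does for a single released archive; on the patched tree the whole sweep over models-tatoeba/*/*.zip is driven by "make fix-released-vocabs". The zip path is a placeholder, and the snippet assumes it is run from the OPUS-MT-train root so that scripts/fix_vocab.py is found.

# sketch only -- one archive, placeholder path
zipfile=models-tatoeba/afr-eng/model.zip
v=$(unzip -l "$zipfile" | grep 'vocab.yml$' | sed 's/^.* //')
if [ "$v" != "" ]; then
    unzip "$zipfile" "$v"                # extract the vocab file from the archive
    python3 scripts/fix_vocab.py "$v"    # rewrites it and leaves $v.bak if it was broken
    if [ -e "$v.bak" ]; then
        zip "$zipfile" "$v" "$v.bak"     # put the repaired vocab (and the backup) back
    fi
    rm -f "$v" "$v.bak"                  # clean up the extracted copies
fi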