Merge branch 'master' of github.com:Helsinki-NLP/OPUS-MT-train

Joerg Tiedemann 2021-11-04 10:52:52 +02:00
commit 0e7b3e173a
7 changed files with 146 additions and 76 deletions


@@ -378,3 +378,22 @@ train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
${MAKE} local-dist
${MAKE} -C backtranslate MODELHOME=${MODELDIR} translate-all-wikis-jobs
ALL_RELEASED_MODELS = ${wildcard models-tatoeba/*/*.zip}
ALL_VOCABS_FIXED = ${patsubst %.zip,%.fixed-vocab,${ALL_RELEASED_MODELS}}
fix-released-vocabs: ${ALL_VOCABS_FIXED}
%.fixed-vocab: %.zip
@( v=`unzip -l $< | grep 'vocab.yml$$' | sed 's/^.* //'`; \
if [ "$$v" != "" ]; then \
unzip $< $$v; \
python3 scripts/fix_vocab.py $$v; \
if [ -e $$v.bak ]; then \
echo "update $$v in $<"; \
zip $< $$v $$v.bak; \
else \
echo "vocab $$v is fine in $<"; \
fi; \
rm -f $$v $$v.bak; \
fi )
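In words: for every released zip under models-tatoeba/, the recipe extracts the vocab.yml (if there is one), runs scripts/fix_vocab.py on it, and re-zips the file only when the script left a .bak behind, i.e. when it actually changed something. A usage sketch (the release name below is hypothetical):

make fix-released-vocabs                                   # sweep all models-tatoeba/*/*.zip
make models-tatoeba/afr-eng/opus-2021-02-18.fixed-vocab    # check a single release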


@@ -20,6 +20,9 @@ WORK_DESTDIR ?= ${WORKHOME}
WORK_CONTAINER ?= OPUS-MT-train_${notdir ${WORKHOME}}-${WHOAMI}
WORK_CONTAINER_JT ?= OPUS-MT-train_${notdir ${WORKHOME}}-tiedeman
ALLAS_STORAGE_URL = https://object.pouta.csc.fi/
## store workdir on allas
store:
cd ${WORK_SRCDIR} && a-put -b ${WORK_CONTAINER} --nc --follow-links --override ${LANGPAIRSTR}
@@ -43,3 +46,41 @@ fetch-data:
mkdir -p ${WORK_DESTDIR}
cd ${WORK_DESTDIR} && a-get ${WORK_CONTAINER}/data.tar
## generic recipe for storing work data and removing it from the file system
## DANGEROUS --- this really deletes the data!
## NOTE: also makes the container world-readable (see the swift post command)
## --> this makes it easier to fetch things without login credentials
## --> should not store sensitive data here!
%.stored: %
if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \
b=OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}; \
cd $(dir $@); \
a-put -b $$b --nc --follow-links --override $(notdir $<); \
rm -fr $(notdir $<); \
touch $(notdir $@); \
rm -f $(notdir $(@:stored=.fetched)); \
swift post $$b --read-acl ".r:*"
fi
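A usage sketch (the work directory is hypothetical): make work-tatoeba/afr-eng.stored should upload work-tatoeba/afr-eng (stored by a-put as afr-eng.tar) to the container OPUS-MT-train_work-tatoeba-${WHOAMI}, delete the local copy, leave an empty .stored stamp behind, and open the container for anonymous reads via swift post.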
## TODO: fetch with wget instead of using a-commands
## fetch work data from allas
%.fetched:
if [ "$(firstword $(subst -, ,$(subst /, ,$@)))" == "work" ]; then \
cd $(dir $@); \
a-get OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/$(notdir $(@:.fetched=.tar)); \
touch $(notdir $@); \
rm -f $(notdir $(@:fetched=.stored)); \
fi
## another way of fetching work data
## requires setting SRCLANGS and TRGLANGS (or LANGPAIRSTR directly)
work-%/${LANGPAIRSTR}:
mkdir -p $(dir $@)
cd $(dir $@) && a-get OPUS-MT-train_$(subst /,-,$(dir $@))${WHOAMI}/${LANGPAIRSTR}.tar
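A usage sketch (language pair hypothetical): make SRCLANGS=afr TRGLANGS=eng work-tatoeba/afr-eng should fetch afr-eng.tar from the container OPUS-MT-train_work-tatoeba-${WHOAMI} into work-tatoeba/, assuming LANGPAIRSTR expands to afr-eng for these settings.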
UPLOAD_MODELS=$(patsubst %,%.stored,${wildcard work-tatoeba/[dg-rt-z]*})
upload-workfiles: ${UPLOAD_MODELS}
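Note that the wildcard work-tatoeba/[dg-rt-z]* only matches work directories whose names start with d, g-r or t-z; directories starting with a-c, e, f or s are excluded from the upload batch.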


@@ -281,8 +281,6 @@ TUNE_GPUJOB_SUBMIT ?=
## existing projects in WORKHOME
ALL_LANG_PAIRS := ${shell ls ${WORKHOME} | grep -- '-' | grep -v old}
ALL_BILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep -v -- '\+'}
@@ -293,6 +291,8 @@ ALL_MULTILINGUAL_MODELS := ${shell echo '${ALL_LANG_PAIRS}' | tr ' ' "\n" | grep
## pre-processing and vocabulary
##----------------------------------------------------------------------------
## type of subword segmentation (bpe|spm)
## model size (NOTE: BPESIZE is also used for sentencepiece!)
SUBWORDS ?= spm
BPESIZE ?= 32000
SRCBPESIZE ?= ${BPESIZE}
@@ -306,10 +306,12 @@ BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
## don't delete BPE/sentencepiece models!
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
## size of the joint vocabulary
## TODO: heuristically adding 1,000 to cover language labels is a bit ad hoc
VOCABSIZE ?= $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
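With the defaults above (SRCBPESIZE = 32000, and assuming TRGBPESIZE also defaults to ${BPESIZE}) this evaluates to 32000 + 32000 + 1000 = 65000 entries.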
## for document-level models
@@ -353,7 +355,7 @@ WORKDIR = ${WORKHOME}/${LANGPAIRSTR}
MODELDIR = ${WORKHOME}/models/${LANGPAIRSTR}
SPMDIR = ${WORKHOME}/SentencePieceModels
## data sets
## train data sets (plus word alignments for the guided-alignment option)
TRAIN_BASE = ${WORKDIR}/train/${DATASET}
TRAIN_SRC = ${TRAIN_BASE}.src
TRAIN_TRG = ${TRAIN_BASE}.trg
@@ -364,7 +366,7 @@ LOCAL_TRAIN_SRC = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.src
LOCAL_TRAIN_TRG = ${TMPDIR}/${LANGPAIRSTR}/train/${DATASET}.trg
LOCAL_MONO_DATA = ${TMPDIR}/${LANGSTR}/train/${DATASET}.mono
## dev and test data
DEV_SRC ?= ${WORKDIR}/val/${DEVSET_NAME}.src
DEV_TRG ?= ${WORKDIR}/val/${DEVSET_NAME}.trg
@@ -372,8 +374,15 @@ TEST_SRC ?= ${WORKDIR}/test/${TESTSET_NAME}.src
TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
## model basename and optional sub-dir
MODEL_SUBDIR =
MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
## supported model types
## configuration for each type is in lib/train.mk
MODELTYPES = transformer \
transformer-big \
transformer-align \


@@ -372,6 +372,12 @@ endif
${@:-pivot=}
%-big-align:
${MAKE} PRE_TRAINED_MODEL=${MODEL_FINAL} \
MODELTYPE=transformer-big-align \
${@:-big-align=}
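A usage sketch (the stem foo is hypothetical): if foo is an existing training target, make foo-big-align should re-run it with MODELTYPE=transformer-big-align, initializing the weights from ${MODEL_FINAL} via marian's --pretrained-model option (see the PRE_TRAINED_MODEL handling in the training hunks below).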
## run a multigpu job (2 or 4 GPUs)


@@ -74,6 +74,37 @@ afreng-bt-tiny:
afreng:
make TATOEBA_VERSION=v2020-07-28 \
SRCLANGS=afr TRGLANGS=eng \
all-job-tatoeba
afreng-small:
make TATOEBA_VERSION=v2020-07-28 \
BT_CONTINUE_EXISTING=0 \
SRCLANGS=afr TRGLANGS=eng \
MODELTYPE=transformer-small-align \
MARIAN_WORKSPACE=10000 \
all-job-tatoeba
afreng-tiny:
make TATOEBA_VERSION=v2020-07-28 \
BT_CONTINUE_EXISTING=0 \
SRCLANGS=afr TRGLANGS=eng \
MODELTYPE=transformer-tiny-align \
MARIAN_WORKSPACE=10000 \
all-job-tatoeba
afreng-small-eval:
make TATOEBA_VERSION=v2020-07-28 \
BT_CONTINUE_EXISTING=0 \
SRCLANGS=afr TRGLANGS=eng \
MODELTYPE=transformer-small-align \
MARIAN_WORKSPACE=10000 \
eval-tatoeba


@@ -6,35 +6,26 @@
#------------------------------------------------------------------------
## extract vocabulary from sentence piece model
${WORKDIR}/${MODEL}.src.vocab: ${SPMSRCMODEL}
cut -f1 < $<.vocab > $@
ifeq (${USE_TARGET_LABELS},1)
echo "${TARGET_LABELS}" | tr ' ' "\n" >> $@
endif
${WORKDIR}/${MODEL}.trg.vocab: ${SPMTRGMODEL}
cut -f1 < $<.vocab > $@
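Background for the cut -f1 calls: the *.vocab file that SentencePiece writes next to a model lists one entry per line as token<TAB>score, so keeping the first field yields the plain token list.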
ifeq (${SUBWORDS},spm)
## make vocabulary from the source and target language specific
## sentence piece models (concatenate and yamlify)
## TODO: verify that this becomes valid YAML!
${MODEL_VOCAB}: ${SPMSRCMODEL} ${SPMTRGMODEL}
ifneq (${MODEL_LATEST_VOCAB},)
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
endif
else
cut -f1 < ${word 1,$^}.vocab > ${@:.vocab.yml=.src.vocab}
cut -f1 < ${word 2,$^}.vocab > ${@:.vocab.yml=.trg.vocab}
ifeq (${USE_TARGET_LABELS},1)
echo "${TARGET_LABELS}" | tr ' ' "\n" >> ${@:.vocab.yml=.src.vocab}
endif
cat ${@:.vocab.yml=.src.vocab} ${@:.vocab.yml=.trg.vocab} | \
sort -u | scripts/vocab2yaml.py > $@
## old buggy style ...
# cat ${@:.vocab.yml=.src.vocab} ${@:.vocab.yml=.trg.vocab} | \
# sort -u | nl -v 0 | sed 's/^ *//'> $@.numbered
# cut -f1 $@.numbered > $@.ids
# cut -f2 $@.numbered | sed 's/\\/\\\\/g;s/\"/\\\"/g;s/^\(.*\)$$/"\1"/;s/$$/:/'> $@.tokens
# paste -d ' ' $@.tokens $@.ids > $@
# rm -f $@.tokens $@.ids $@.numbered
endif
${WORKDIR}/${MODEL}.vocab.yml: ${WORKDIR}/${MODEL}.src.vocab ${WORKDIR}/${MODEL}.trg.vocab
cat $^ | sort -u | scripts/vocab2yaml.py > $@
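The "old buggy style" kept in the comments above shows why a dedicated script is used here: quoting tokens with sed broke on edge cases like quotes and backslashes. As a rough illustration of what a vocab2yaml-style filter has to do (a hypothetical sketch in Python, not the actual scripts/vocab2yaml.py):

import json
import sys

# read one token per line from stdin and print a YAML mapping from
# token to integer ID; json.dumps escapes quotes and backslashes,
# and a JSON string is a valid YAML scalar
for idx, line in enumerate(sys.stdin):
    print(json.dumps(line.rstrip('\n')) + ': ' + str(idx))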
else
@@ -42,12 +33,12 @@ else
## - no new vocabulary is created if the file already exists!
## - need to delete the file if you want to create a new one!
${MODEL_VOCAB}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
ifeq ($(wildcard ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},)
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
${WORKDIR}/${MODEL}.vocab.yml: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
ifeq ($(wildcard $@),)
ifneq ($(wildcard ${MODEL_LATEST_VOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},$@)
cp ${MODEL_LATEST_VOCAB} $@
endif
else
mkdir -p ${dir $@}
@@ -59,35 +50,12 @@ else
@echo "WARNING! Delete the file if you want to start from scratch!"
touch $@
endif
endif
## if USE_SPM_VOCAB is set:
## get separate source and target language vocabularies
## from the two individual sentence piece models
ifeq ($(USE_SPM_VOCAB),1)
${MODEL_SRCVOCAB}: ${SPMSRCMODEL}
cut -f1 < $<.vocab > $@
ifeq (${USE_TARGET_LABELS},1)
echo "${TARGET_LABELS}" | tr ' ' "\n" >> $@
endif
${MODEL_TRGVOCAB}: ${SPMTRGMODEL}
cut -f1 < $<.vocab > $@
endif
print-latest:
ifneq (${wildcard ${MODEL_LATEST}},)
ifeq (${wildcard ${MODEL_START}},)
@echo "cp ${MODEL_LATEST} ${MODEL_START}"
endif
endif
@echo "latest model: ${MODEL_LATEST}"
@echo "start model: ${MODEL_START}"
@@ -100,15 +68,12 @@ endif
MARIAN_MODELS_DONE = ${patsubst %,${WORKDIR}/${MODEL}.%.model${NR}.done,${MODELTYPES}}
MARIAN_TRAIN_PREREQS = ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
$(sort ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB})
## define validation and early-stopping parameters
## as well as pre-requisites for training the model
##
## NEW: dropped the dependency on ${MODEL_VOCAB}
## (it will be created by marian if it does not exist)
## TODO: should we add the dependency back?
ifndef SKIP_VALIDATION
MARIAN_TRAIN_PREREQS += ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
@@ -137,9 +102,15 @@ else
endif
# initialize weights from a pre-trained model
ifneq (${wildcard ${PRE_TRAINED_MODEL}},)
MARIAN_EXTRA += --pretrained-model ${PRE_TRAINED_MODEL}
endif
## dependencies and extra parameters
## for models with guided alignment
## for different models and guided alignment
ifeq (${MODELTYPE},transformer-align)
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
@@ -176,7 +147,6 @@ endif
ifeq (${MODELTYPE},transformer-big-align)
MARIAN_ENC_DEPTH = 12
MARIAN_ATT_HEADS = 16
MARIAN_DIM_EMB = 1024
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG}
GPUJOB_HPC_MEM = 16g
@@ -185,10 +155,10 @@ endif
ifeq (${MODELTYPE},transformer-big)
MARIAN_ENC_DEPTH = 12
MARIAN_ATT_HEADS = 16
MARIAN_DIM_EMB = 1024
GPUJOB_HPC_MEM = 16g
endif
# MARIAN_DIM_EMB = 1024
## finally: recipe for training transformer model
@@ -200,19 +170,13 @@ ${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS}
## (check lib/config.mk to see how the latest model is found)
##--------------------------------------------------------------------
ifeq (${wildcard ${MODEL_START}},)
ifneq (${MODEL_LATEST},)
ifneq (${MODEL_LATEST_VOCAB},)
ifneq (${MODEL_LATEST_VOCAB},${MODEL_VOCAB})
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
endif
ifneq (${wildcard ${MODEL_LATEST}},)
ifneq (${MODEL_LATEST},${MODEL_START})
cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
endif
##--------------------------------------------------------------------
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
${MARIAN_STOP_CRITERIA} \
--model $(@:.done=.npz) \


@@ -10,7 +10,7 @@ filename = sys.argv[1]
try:
input = open(filename, 'r')
yaml.load(input)
yaml.safe_load(input)
except:
print('YAML file is broken - try to fix it!')
print(f'copy {filename} to {filename}.bak')
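The hunk ends before the repair code, but the messages suggest the script copies the broken file to {filename}.bak and rewrites it in place. A hypothetical sketch of such a repair step (assuming the breakage comes from badly quoted token keys, as produced by the old sed pipeline; the function name is made up):

import json
import shutil
import sys

def fix_vocab(filename):
    # keep a backup, then re-quote every key with json.dumps so the
    # 'token: id' lines become valid YAML again
    shutil.copy(filename, filename + '.bak')
    fixed = []
    with open(filename + '.bak') as infile:
        for line in infile:
            token, _, idx = line.rstrip('\n').rpartition(': ')
            fixed.append(json.dumps(token.strip('"')) + ': ' + idx)
    with open(filename, 'w') as outfile:
        outfile.write('\n'.join(fixed) + '\n')

if __name__ == '__main__':
    fix_vocab(sys.argv[1])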