plain text vocab files from spm models

Jörg Tiedemann 2020-09-13 22:17:21 +03:00
parent 24e92de56a
commit c2798e9758
7 changed files with 71 additions and 187 deletions
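
In short: for SentencePiece-segmented models, the Marian vocabularies are now taken directly from the plain-text .vocab files written by the SentencePiece trainer instead of letting MarianNMT build a joint .yml vocabulary from the training data. A minimal sketch of the idea (shell; the paths are only an illustration of the naming pattern set in lib/config.mk):

# the SentencePiece trainer writes <model>.vocab with one "piece<TAB>log-probability" per line;
# keeping the first column gives a plain one-piece-per-line vocabulary that Marian can read
cut -f1 < work/en-de/train/opus.src.32k-model.spm.vocab > work/en-de/opus.src.vocab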


@@ -323,7 +323,7 @@ train-and-eval-job:
data: ${TRAIN_SRC}.clean.${PRE_SRC}.gz ${TRAIN_TRG}.clean.${PRE_TRG}.gz \
${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
${MAKE} ${TEST_SRC}.${PRE_SRC} ${TEST_TRG}
${MAKE} ${MODEL_VOCAB}
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
ifeq (${MODELTYPE},transformer-align)
${MAKE} ${TRAIN_ALG}
endif
@@ -344,7 +344,7 @@ wordalign: ${TRAIN_ALG}
## other model types
vocab: ${MODEL_VOCAB}
vocab: ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
train: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
translate: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
eval: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}.eval
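
For context, these targets are normally invoked per language pair; a hedged usage sketch (SRCLANGS/TRGLANGS as used elsewhere in these makefiles, language codes only as an example):

make SRCLANGS=en TRGLANGS=de vocab
make SRCLANGS=en TRGLANGS=de train
make SRCLANGS=en TRGLANGS=de eval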


@@ -219,7 +219,7 @@ DEVSET ?= ${firstword ${foreach c,${POTENTIAL_DEVSETS},${filter ${c},${BIGGER_B
## increase dev/test sets for Tatoeba (very short sentences!)
ifeq (${DEVSET},Tatoeba)
DEVSIZE = 5000
DEVSIZE = 5000
TESTSIZE = 5000
endif
@@ -254,19 +254,18 @@ TRGBPESIZE ?= ${BPESIZE}
BPEMODELNAME ?= opus
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
# BPESRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
# BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
BPESRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.${SRCBPESIZE:000=}k-model.bpe
BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.${TRGBPESIZE:000=}k-model.bpe
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
# SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
# SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.${SRCBPESIZE:000=}k-model.spm
SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.${TRGBPESIZE:000=}k-model.spm
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
VOCABSIZE ?= $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
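A worked instance of the arithmetic above, for illustration:
## e.g. SRCBPESIZE = TRGBPESIZE = 32000  ->  VOCABSIZE = 32000 + 32000 + 1000 = 65000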
@@ -341,25 +340,29 @@ MODEL = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
MODELTYPE = transformer
NR = 1
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
MODEL_VALIDLOG = ${MODEL}.${MODELTYPE}.valid${NR}.log
MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
ifeq (${MODELTYPE},transformer-spm)
MODEL_VOCABTYPE = spm
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
MODEL_SRCVOCAB = ${SPMSRCMODEL}
MODEL_TRGVOCAB = ${SPMTRGMODEL}
PRE_SRC = plain
PRE_TRG = plain
# MODEL_SRCVOCAB = ${MODEL_VOCAB}
# MODEL_TRGVOCAB = ${MODEL_VOCAB}
## for sentence-piece models: get plain text vocabularies
## for others: extract vocabulary from training data with MarianNMT
## backwards compatibility: if there is already a vocab-file then use it
ifeq (${SUBWORDS},spm)
ifneq ($(wildcard ${WORKDIR}/${MODEL}.vocab.yml),)
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.yml
MODEL_SRCVOCAB = ${MODEL_VOCAB}
MODEL_TRGVOCAB = ${MODEL_VOCAB}
else
MODEL_VOCABTYPE = yml
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab
MODEL_SRCVOCAB = ${WORKDIR}/${MODEL}.src.vocab
MODEL_TRGVOCAB = ${WORKDIR}/${MODEL}.trg.vocab
endif
else
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.yml
MODEL_SRCVOCAB = ${MODEL_VOCAB}
MODEL_TRGVOCAB = ${MODEL_VOCAB}
endif
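
To check which of these branches a given configuration ends up in, a small debugging target along these lines could be added (hypothetical; not part of this commit):

print-vocab-config:
	@echo "SUBWORDS:       ${SUBWORDS}"
	@echo "MODEL_VOCAB:    ${MODEL_VOCAB}"
	@echo "MODEL_SRCVOCAB: ${MODEL_SRCVOCAB}"
	@echo "MODEL_TRGVOCAB: ${MODEL_TRGVOCAB}"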
@@ -367,13 +370,11 @@ endif
## latest model with the same pre-processing but any data or modeltype
ifdef CONTINUE_EXISTING
MODEL_LATEST = $(firstword ${shell ls -t ${WORKDIR}/*.${PRE_SRC}-${PRE_TRG}.*.best-perplexity.npz 2>/dev/null})
MODEL_LATEST_VOCAB = $(shell echo "${MODEL_LATEST}" | \
sed 's|\.${PRE_SRC}-${PRE_TRG}\..*$$|.${PRE_SRC}-${PRE_TRG}.vocab.${MODEL_VOCABTYPE}|')
sed 's|\.${PRE_SRC}-${PRE_TRG}\..*$$|.${PRE_SRC}-${PRE_TRG}.vocab.yml|')
endif
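
As an illustration of what that sed rewrite does (assuming DATASET = opus and PRE_SRC = PRE_TRG = spm32k; the paths are made up):

echo work/en-de/opus.spm32k-spm32k.transformer.model1.npz.best-perplexity.npz | \
	sed 's|\.spm32k-spm32k\..*$|.spm32k-spm32k.vocab.yml|'
# -> work/en-de/opus.spm32k-spm32k.vocab.yml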
@@ -398,12 +399,15 @@ MARIAN_MAXI_BATCH = 500
MARIAN_DROPOUT = 0.1
MARIAN_MAX_LENGTH = 500
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
MARIAN_DECODER_GPU = -b 12 -n1 -d ${MARIAN_GPUS} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} --mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
MARIAN_DECODER_CPU = -b 12 -n1 --cpu-threads ${HPC_CORES} \
--mini-batch 8 --maxi-batch 32 --maxi-batch-sort src \
--max-length ${MARIAN_MAX_LENGTH} --max-length-crop
MARIAN_DECODER_FLAGS = ${MARIAN_DECODER_GPU}
## TODO: currently marianNMT crashes with workspace > 26000
ifeq (${GPU},p100)
MARIAN_WORKSPACE = 13000


@@ -152,14 +152,6 @@ ifneq ("$(wildcard ${BPESRCMODEL})","")
PREPROCESS_SRCMODEL = ${BPESRCMODEL}
PREPROCESS_TRGMODEL = ${BPETRGMODEL}
PREPROCESS_DESCRIPTION = normalization + tokenization + BPE (${PRE_SRC},${PRE_TRG})
else ifeq (${MODELTYPE},transformer-spm)
PREPROCESS_TYPE = txt
SUBWORD_TYPE = spm
RELEASE_SRCVOCAB = source.spm
RELEASE_TRGVOCAB = target.spm
PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
PREPROCESS_TRGMODEL = ${SPMTRGMODEL}
PREPROCESS_DESCRIPTION = normalization + built-in SentencePiece (${PRE_SRC},${PRE_TRG})
else
PREPROCESS_TYPE = spm
SUBWORD_TYPE = spm
@@ -468,7 +460,7 @@ LASTTRG = ${lastword ${TRGLANGS}}
MODEL_OLD = ${MODEL_SUBDIR}${DATASET}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}.${LASTSRC}${LASTTRG}
MODEL_OLD_BASENAME = ${MODEL_OLD}.${MODELTYPE}.model${NR}
MODEL_OLD_FINAL = ${WORKDIR}/${MODEL_OLD_BASENAME}.npz.best-perplexity.npz
MODEL_OLD_VOCAB = ${WORKDIR}/${MODEL_OLD}.vocab.${MODEL_VOCABTYPE}
MODEL_OLD_VOCAB = ${WORKDIR}/${MODEL_OLD}.vocab.yml
MODEL_OLD_DECODER = ${MODEL_OLD_FINAL}.decoder.yml
MODEL_TRANSLATE = ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}
MODEL_OLD_TRANSLATE = ${WORKDIR}/${TESTSET_NAME}.${MODEL_OLD}${NR}.${MODELTYPE}.${SRC}.${TRG}


@@ -221,7 +221,7 @@ endif
.PHONY: install-prerequisites install-prereq install-requirements
install-prerequisites install-prereq install-requirements:
${PIP} install --user -r requirements.txt
${MAKE} install-perl-modules:
${MAKE} install-perl-modules
${MAKE} ${PREREQ_TOOLS}
.PHONY: install-perl-modules
@@ -285,5 +285,5 @@ ${TOOLSDIR}/protobuf/bin/protoc:
${TOOLSDIR}/eflomal/eflomal:
${MAKE} -C ${dir $@} all
cd ${dir $@} && python3 setup.py install
cd ${dir $@} && python3 setup.py install --user
# python3 setup.py install --install-dir ${HOME}/.local


@@ -245,7 +245,7 @@ listallmodels:
BT_MODEL = ${MODEL_SUBDIR}${DATASET}+bt${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
BT_MODEL_BASE = ${BT_MODEL}.${MODELTYPE}.model${NR}
BT_MODEL_START = ${WORKDIR}/${BT_MODEL_BASE}.npz
BT_MODEL_VOCAB = ${WORKDIR}/${BT_MODEL}.vocab.${MODEL_VOCABTYPE}
BT_MODEL_VOCAB = ${WORKDIR}/${BT_MODEL}.vocab.yml
# %-add-backtranslations:
%-bt:
@@ -268,7 +268,7 @@ endif
PIVOT_MODEL = ${MODEL_SUBDIR}${DATASET}+pivot${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
PIVOT_MODEL_BASE = ${PIVOT_MODEL}.${MODELTYPE}.model${NR}
PIVOT_MODEL_START = ${WORKDIR}/${PIVOT_MODEL_BASE}.npz
PIVOT_MODEL_VOCAB = ${WORKDIR}/${PIVOT_MODEL}.vocab.${MODEL_VOCABTYPE}
PIVOT_MODEL_VOCAB = ${WORKDIR}/${PIVOT_MODEL}.vocab.yml
%-pivot:
ifneq (${wildcard ${MODEL_FINAL}},)


@@ -1,8 +1,6 @@
# -*-makefile-*-
## resume training on an existing model
resume:
if [ -e ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.npz.best-perplexity.npz ]; then \
@@ -15,10 +13,9 @@ resume:
#------------------------------------------------------------------------
# training MarianNMT models
# vocabulary
#------------------------------------------------------------------------
## make vocabulary
## - no new vocabulary is created if the file already exists!
## - you need to delete the file if you want to create a new one!
@@ -29,11 +26,9 @@ ifeq ($(wildcard ${MODEL_VOCAB}),)
ifneq (${MODEL_LATEST_VOCAB},)
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
else
ifneq (${MODEL_VOCABTYPE},spm)
mkdir -p ${dir $@}
${LOADMODS} && ${ZCAT} $^ | ${MARIAN_VOCAB} --max-size ${VOCABSIZE} > $@
endif
endif
else
@echo "$@ already exists!"
@echo "WARNING! No new vocabulary is created even though the data has changed!"
@@ -42,6 +37,16 @@ else
endif
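
Because of the guard above, an existing vocabulary file is never overwritten; to rebuild it, remove the file first. A hedged usage sketch (paths and language codes are only an example):

rm -f work/en-de/opus.spm32k-spm32k.vocab.yml
make SRCLANGS=en TRGLANGS=de vocab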
## get vocabulary from sentence piece model
ifeq (${SUBWORDS},spm)
${MODEL_SRCVOCAB}: ${SPMSRCMODEL}
cut -f1 < $<.vocab > $@
${MODEL_TRGVOCAB}: ${SPMTRGMODEL}
cut -f1 < $<.vocab > $@
endif
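
For reference, the .vocab file written next to a SentencePiece model is tab-separated, one piece and its log-probability per line, so cut -f1 keeps just the pieces; an illustrative excerpt (scores made up):

<unk>	0
▁the	-3.48
▁and	-4.11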
print-latest:
ifneq (${wildcard ${MODEL_LATEST}},)
ifeq (${wildcard ${MODEL_START}},)
@@ -50,14 +55,32 @@ endif
endif
#------------------------------------------------------------------------
# training MarianNMT models
#------------------------------------------------------------------------
## NEW: take away dependency on ${MODEL_VOCAB}
## (will be created by marian if it does not exist)
## possible model variants
MARIAN_MODELS_DONE = ${WORKDIR}/${MODEL}.transformer.model${NR}.done \
${WORKDIR}/${MODEL}.transformer-align.model${NR}.done
MARIAN_TRAIN_PREREQS = ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
## dependencies and extra parameters
ifeq (${MODELTYPE},transformer-align)
MARIAN_TRAIN_PREREQS += ${TRAIN_ALG}
MARIAN_EXTRA += --guided-alignment ${TRAIN_ALG}
endif
## train transformer model
${WORKDIR}/${MODEL}.transformer.model${NR}.done ${WORKDIR}/${MODEL}.transformer-spm.model${NR}.done: \
${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
${MARIAN_MODELS_DONE}: ${MARIAN_TRAIN_PREREQS}
mkdir -p ${dir $@}
##--------------------------------------------------------------------
## in case we want to continue training from the latest existing model
@@ -108,139 +131,3 @@ endif
--tempdir ${TMPDIR} \
--exponential-smoothing
touch $@
## NEW: take away dependency on ${MODEL_VOCAB}
## train transformer model with guided alignment
${WORKDIR}/${MODEL}.transformer-align.model${NR}.done: \
${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
${TRAIN_ALG} \
${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
mkdir -p ${dir $@}
##--------------------------------------------------------------------
## in case we want to continue training from the latest existing model
## (check lib/config.mk to see how the latest model is found)
##--------------------------------------------------------------------
ifeq (${wildcard ${MODEL_START}},)
ifneq (${MODEL_LATEST},)
ifneq (${MODEL_LATEST_VOCAB},)
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
##--------------------------------------------------------------------
${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
--model $(@:.done=.npz) \
--type transformer \
--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
--max-length 500 \
--vocabs ${MODEL_VOCAB} ${MODEL_VOCAB} \
--mini-batch-fit \
-w ${MARIAN_WORKSPACE} \
--maxi-batch ${MARIAN_MAXI_BATCH} \
--early-stopping ${MARIAN_EARLY_STOPPING} \
--valid-freq ${MARIAN_VALID_FREQ} \
--save-freq ${MARIAN_SAVE_FREQ} \
--disp-freq ${MARIAN_DISP_FREQ} \
--valid-sets ${word 4,$^} ${word 5,$^} \
--valid-metrics perplexity \
--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
--beam-size 12 --normalize 1 --allow-unk \
--log $(@:.model${NR}.done=.train${NR}.log) \
--valid-log $(@:.model${NR}.done=.valid${NR}.log) \
--enc-depth 6 --dec-depth 6 \
--transformer-heads 8 \
--transformer-postprocess-emb d \
--transformer-postprocess dan \
--transformer-dropout ${MARIAN_DROPOUT} \
--label-smoothing 0.1 \
--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
--tied-embeddings-all \
--overwrite --keep-best \
--devices ${MARIAN_GPUS} \
--sync-sgd --seed ${SEED} \
--sqlite \
--tempdir ${TMPDIR} \
--exponential-smoothing \
--guided-alignment ${word 3,$^}
touch $@
# ${TRAIN_SRC}.clean${TRAINSIZE}.gz: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz
# ${ZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
# sed 's/\@\@ //g;s/ \@\@//g;s/ \@\-\@ /-/g' | ${GZIP} -c > $@
# ${TRAIN_TRG}.clean${TRAINSIZE}.gz: ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
# ${ZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
# sed 's/\@\@ //g;s/ \@\@//g;s/ \@\-\@ /-/g' | ${GZIP} -c > $@
# ## train transformer model
# ${WORKDIR}/${MODEL}.transformer-spm.model${NR}.done: \
# ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
# ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
# ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG}
# mkdir -p ${dir $@}
# ##--------------------------------------------------------------------
# ## in case we want to continue training from the latest existing model
# ## (check lib/config.mk to see how the latest model is found)
# ##--------------------------------------------------------------------
# ifeq (${wildcard ${MODEL_START}},)
# ifneq (${MODEL_LATEST},)
# ifneq (${MODEL_LATEST_VOCAB},)
# cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
# cp ${MODEL_LATEST} ${MODEL_START}
# endif
# endif
# endif
# ##--------------------------------------------------------------------
# ${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
# ${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
# --model $(@:.done=.npz) \
# --type transformer \
# --train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
# --max-length 500 \
# --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
# --mini-batch-fit \
# -w ${MARIAN_WORKSPACE} \
# --maxi-batch ${MARIAN_MAXI_BATCH} \
# --early-stopping ${MARIAN_EARLY_STOPPING} \
# --valid-freq ${MARIAN_VALID_FREQ} \
# --save-freq ${MARIAN_SAVE_FREQ} \
# --disp-freq ${MARIAN_DISP_FREQ} \
# --valid-sets ${word 3,$^} ${word 4,$^} \
# --valid-metrics perplexity \
# --valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
# --beam-size 12 --normalize 1 --allow-unk \
# --log $(@:.model${NR}.done=.train${NR}.log) \
# --valid-log $(@:.model${NR}.done=.valid${NR}.log) \
# --enc-depth 6 --dec-depth 6 \
# --transformer-heads 8 \
# --transformer-postprocess-emb d \
# --transformer-postprocess dan \
# --transformer-dropout ${MARIAN_DROPOUT} \
# --label-smoothing 0.1 \
# --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
# --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
# --tied-embeddings-all \
# --overwrite --keep-best \
# --devices ${MARIAN_GPUS} \
# --sync-sgd --seed ${SEED} \
# --sqlite \
# --tempdir ${TMPDIR} \
# --exponential-smoothing
# touch $@


@@ -3,3 +3,4 @@ iso-639
opustools
subword-nmt
sacrebleu
Cython