internal sentence piece models in transformers

Jörg Tiedemann 2020-09-12 16:16:01 +03:00
parent ddafb43d66
commit 666b2b8462
11 changed files with 248 additions and 37 deletions


@@ -76,7 +76,6 @@ This command will in the standard setup
* train sentence-piece models (separate for source and target language)
* segment data sets using those sentence-piece models (see the segmentation sketch after this list)
* apply some additional bitext cleaning (using Moses scripts)
* word-align all training data (used for guided alignment)
* create a vocabulary file for Marian-NMT
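
For illustration, the segmentation step amounts to running SentencePiece's `spm_encode` with the trained models over both sides of the bitext; a minimal sketch with hypothetical file names (the makefile derives the real ones from its variables):

    # hypothetical file names; one model per language side
    spm_encode --model=opus.src.32k-model.spm < train.src > train.src.spm32k
    spm_encode --model=opus.trg.32k-model.spm < train.trg > train.trg.spm32k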
@@ -165,7 +164,8 @@ Currently, the makefile looks at the local copy of released OPUS data to find av
Most settings can be adjusted by setting the corresponding variables to new values; an example invocation follows this list. Common changes are:
* don't run word-alignment: set `MODELTYPE=transformer`
* run word-alignment and train with guided alignment: set `MODELTYPE=transformer-align`
* use sentence-piece models internally to define vocabularies: set `MODELTYPE=transformer-spm`
* change the vocabulary size: set `BPESIZE=<yourvalue>`, for example `BPESIZE=4000` (this is also used for sentence-piece models)
* vocabulary sizes can also be set for source and target language independently (`SRCBPESIZE` and `TRGBPESIZE`)
* use BPE instead of sentence-piece (not recommended): set `SUBWORDS=bpe`
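
As an example, a call that selects the internal sentence-piece vocabularies with a smaller subword inventory could look as follows (language pair and goal are placeholders; adjust to your setup):

    # hypothetical invocation, assuming the usual SRCLANGS/TRGLANGS variables
    # and train target of this repository
    make MODELTYPE=transformer-spm BPESIZE=4000 SRCLANGS=en TRGLANGS=de train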


@@ -15,18 +15,6 @@ bpe-models: ${BPESRCMODEL} ${BPETRGMODEL}
## ---> need to delete the old ones if we want to create new BPE models
# BPESRCMODEL = ${TRAIN_SRC}.bpe${SRCBPESIZE:000=}k-model
# BPETRGMODEL = ${TRAIN_TRG}.bpe${TRGBPESIZE:000=}k-model
## NEW: always use the same name for the BPE models
## --> avoid overwriting validation/test data with new segmentation models
## if a new data set is used
BPESRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
## we keep the dependency on LOCAL_TRAIN_SRC
## to make multi-threaded make calls behave properly
## --> otherwise there can be multiple threads writing to the same file!


@@ -252,6 +252,23 @@ BPESIZE ?= 32000
SRCBPESIZE ?= ${BPESIZE}
TRGBPESIZE ?= ${BPESIZE}
BPEMODELNAME ?= opus
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
# BPESRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
# BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
BPESRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.${SRCBPESIZE:000=}k-model.bpe
BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.${TRGBPESIZE:000=}k-model.bpe
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
# SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
# SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.${SRCBPESIZE:000=}k-model.spm
SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.${TRGBPESIZE:000=}k-model.spm
VOCABSIZE ?= $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
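
With the defaults above (`SRCBPESIZE` = `TRGBPESIZE` = 32000), the `$$((...))` shell arithmetic evaluates at recipe time to 65000:

    # plain-shell form of the makefile arithmetic (the makefile doubles the $)
    echo $((32000 + 32000 + 1000))   # 65000, the default VOCABSIZE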
## for document-level models
@@ -268,15 +285,7 @@ PRE_TRG = ${SUBWORDS}${TRGBPESIZE:000=}k
## default name of the data set (and the model)
##-------------------------------------
ifndef DATASET
DATASET = opus
endif
ifndef BPEMODELNAME
BPEMODELNAME = opus
endif
DATASET ?= opus
## DATADIR = directory where the train/dev/test data are
## WORKDIR = directory used for training
@@ -338,8 +347,24 @@ MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
MODEL_VOCABTYPE = yml
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
ifeq (${MODELTYPE},transformer-spm)
MODEL_VOCABTYPE = spm
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
MODEL_SRCVOCAB = ${SPMSRCMODEL}
MODEL_TRGVOCAB = ${SPMTRGMODEL}
# MODEL_SRCVOCAB = ${MODEL_VOCAB}
# MODEL_TRGVOCAB = ${MODEL_VOCAB}
else
MODEL_VOCABTYPE = yml
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
MODEL_SRCVOCAB = ${MODEL_VOCAB}
MODEL_TRGVOCAB = ${MODEL_VOCAB}
endif
## latest model with the same pre-processing but any data or modeltype


@@ -151,6 +151,11 @@ ifneq ("$(wildcard ${BPESRCMODEL})","")
PREPROCESS_SRCMODEL = ${BPESRCMODEL}
PREPROCESS_TRGMODEL = ${BPETRGMODEL}
PREPROCESS_DESCRIPTION = normalization + tokenization + BPE (${PRE_SRC},${PRE_TRG})
else ifeq (${MODELTYPE},transformer-spm)
PREPROCESS_TYPE = txt
PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
PREPROCESS_TRGMODEL = ${SPMTRGMODEL}
PREPROCESS_DESCRIPTION = normalization + built-in SentencePiece (${PRE_SRC},${PRE_TRG})
else
PREPROCESS_TYPE = spm
PREPROCESS_SRCMODEL = ${SPMSRCMODEL}


@@ -22,19 +22,9 @@
spm-models: ${SPMSRCMODEL} ${SPMTRGMODEL}
# SPMSRCMODEL = ${TRAIN_SRC}.spm${SRCBPESIZE:000=}k-model
# SPMTRGMODEL = ${TRAIN_TRG}.spm${TRGBPESIZE:000=}k-model
## NEW: always use the same name for the SPM models
## --> avoid overwriting validation/test data with new segmentation models
## if a new data set is used
SPMSRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
# SPMEXTRA = --split_by_whitespace=false
SPMEXTRA =
.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
## set to 1 if you want to generate SPM vocab file
GENERATE_SPM_VOC = 0
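
For reference, SentencePiece models of this kind can be trained with the stock `spm_train` tool; a minimal sketch with hypothetical paths (`SPMEXTRA` would carry any extra flags, and `spm_export_vocab` corresponds to the optional vocab generation):

    # hypothetical paths; vocab_size corresponds to SRCBPESIZE/TRGBPESIZE
    spm_train --input=train.src --model_prefix=opus.src.4k-model --vocab_size=4000
    spm_export_vocab --model=opus.src.4k-model.model > opus.src.4k-model.vocab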


@@ -176,3 +176,69 @@ endif
${TRAIN_SRC}.clean${TRAINSIZE}.gz: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz
${ZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
sed 's/\@\@ //g;s/ \@\@//g;s/ \@\-\@ /-/g' | ${GZIP} -c > $@
${TRAIN_TRG}.clean${TRAINSIZE}.gz: ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
${ZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
sed 's/\@\@ //g;s/ \@\@//g;s/ \@\-\@ /-/g' | ${GZIP} -c > $@
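
These two rules strip the subword segmentation again to obtain plain training text; the effect of the sed pipeline on SPM-segmented input, in plain shell form (the makefile doubles the `$` signs):

    # spaces removed, '▁' mapped back to space, edges trimmed
    echo '▁Hel lo ▁wor ld' | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$//'   # -> 'Hello world'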
## train transformer model
${WORKDIR}/${MODEL}.transformer-spm.model${NR}.done: \
${TRAIN_SRC}.clean${TRAINSIZE}.gz \
${TRAIN_TRG}.clean${TRAINSIZE}.gz \
${DEV_SRC} ${DEV_TRG}
mkdir -p ${dir $@}
##--------------------------------------------------------------------
## in case we want to continue training from the latest existing model
## (check lib/config.mk to see how the latest model is found)
##--------------------------------------------------------------------
ifeq (${wildcard ${MODEL_START}},)
ifneq (${MODEL_LATEST},)
ifneq (${MODEL_LATEST_VOCAB},)
cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
##--------------------------------------------------------------------
${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
--model $(@:.done=.npz) \
--type transformer \
--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
--max-length 500 \
--vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
--mini-batch-fit \
-w ${MARIAN_WORKSPACE} \
--maxi-batch ${MARIAN_MAXI_BATCH} \
--early-stopping ${MARIAN_EARLY_STOPPING} \
--valid-freq ${MARIAN_VALID_FREQ} \
--save-freq ${MARIAN_SAVE_FREQ} \
--disp-freq ${MARIAN_DISP_FREQ} \
--valid-sets ${word 3,$^} ${word 4,$^} \
--valid-metrics perplexity \
--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
--beam-size 12 --normalize 1 --allow-unk \
--log $(@:.model${NR}.done=.train${NR}.log) \
--valid-log $(@:.model${NR}.done=.valid${NR}.log) \
--enc-depth 6 --dec-depth 6 \
--transformer-heads 8 \
--transformer-postprocess-emb d \
--transformer-postprocess dan \
--transformer-dropout ${MARIAN_DROPOUT} \
--label-smoothing 0.1 \
--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
--tied-embeddings-all \
--overwrite --keep-best \
--devices ${MARIAN_GPUS} \
--sync-sgd --seed ${SEED} \
--sqlite \
--tempdir ${TMPDIR} \
--exponential-smoothing
touch $@
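
Because the SPM models are passed directly to `--vocabs`, Marian segments and desegments internally and the trained model can be fed raw text at decoding time; a hedged sketch with hypothetical file names following the naming patterns above:

    # hypothetical file names; -m and -v as in standard marian-decoder usage
    marian-decoder -m model.npz \
        -v opus.src.32k-model.spm opus.trg.32k-model.spm \
        < input.raw.txt > output.txt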


@@ -0,0 +1,6 @@
#!/bin/bash
#
# USAGE postprocess.sh < input > output
#
perl -C -pe 's/\p{C}/ /g;'
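
The Perl expression replaces every character in Unicode category `Other` (control and format characters, e.g. zero-width spaces) with a plain space; `-C` enables UTF-8 I/O. A quick check (note that `\p{C}` also matches the newline itself, so the example omits one):

    # requires bash >= 4.2 for the \u escape
    printf 'foo\u200bbar' | ./postprocess.sh   # prints: foo bar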


@@ -1,6 +1,6 @@
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
# USAGE preprocess.sh source-langid target-langid spmodel [noflags] < input > output
#
#
# replace SPMENCODE with your own setup!


@@ -1,6 +1,6 @@
#!/bin/bash
#
# USAGE preprocess.sh langid bpecodes < input > output
# USAGE preprocess.sh langid spmodel < input > output
#
# replace SPMENCODE with your own setup!
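
A minimal sketch of such a setup, assuming the stock SentencePiece CLI is on the PATH (hypothetical; the script presumably passes the spmodel argument from the usage line to this command):

    SPMENCODE=spm_encode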


@@ -0,0 +1,87 @@
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid [noflags] < input > output
#
#
if [ "$3" == "noflags" ]; then
    sed -e 's/，/,/g' \
        -e 's/。 */. /g' \
        -e 's/、/,/g' \
        -e 's/”/"/g' \
        -e 's/“/"/g' \
        -e 's/∶/:/g' \
        -e 's/：/:/g' \
        -e 's/？/\?/g' \
        -e 's/《/"/g' \
        -e 's/》/"/g' \
        -e 's/）/\)/g' \
        -e 's/！/\!/g' \
        -e 's/（/\(/g' \
        -e 's/；/;/g' \
        -e 's/１/"/g' \
        -e 's/」/"/g' \
        -e 's/「/"/g' \
        -e 's/０/0/g' \
        -e 's/３/3/g' \
        -e 's/２/2/g' \
        -e 's/５/5/g' \
        -e 's/６/6/g' \
        -e 's/９/9/g' \
        -e 's/７/7/g' \
        -e 's/８/8/g' \
        -e 's/４/4/g' \
        -e 's/． */. /g' \
        -e 's/～/\~/g' \
        -e "s/’/\'/g" \
        -e 's/…/\.\.\./g' \
        -e 's/━/\-/g' \
        -e 's/〈/\</g' \
        -e 's/〉/\>/g' \
        -e 's/【/\[/g' \
        -e 's/】/\]/g' \
        -e 's/％/\%/g' |
    perl -C -pe 's/\p{C}/ /g;' |
    sed 's/  */ /g;s/^ *//g;s/ *$//g'
else
    sed -e 's/，/,/g' \
        -e 's/。 */. /g' \
        -e 's/、/,/g' \
        -e 's/”/"/g' \
        -e 's/“/"/g' \
        -e 's/∶/:/g' \
        -e 's/：/:/g' \
        -e 's/？/\?/g' \
        -e 's/《/"/g' \
        -e 's/》/"/g' \
        -e 's/）/\)/g' \
        -e 's/！/\!/g' \
        -e 's/（/\(/g' \
        -e 's/；/;/g' \
        -e 's/１/"/g' \
        -e 's/」/"/g' \
        -e 's/「/"/g' \
        -e 's/０/0/g' \
        -e 's/３/3/g' \
        -e 's/２/2/g' \
        -e 's/５/5/g' \
        -e 's/６/6/g' \
        -e 's/９/9/g' \
        -e 's/７/7/g' \
        -e 's/８/8/g' \
        -e 's/４/4/g' \
        -e 's/． */. /g' \
        -e 's/～/\~/g' \
        -e "s/’/\'/g" \
        -e 's/…/\.\.\./g' \
        -e 's/━/\-/g' \
        -e 's/〈/\</g' \
        -e 's/〉/\>/g' \
        -e 's/【/\[/g' \
        -e 's/】/\]/g' \
        -e 's/％/\%/g' |
    perl -C -pe 's/\p{C}/ /g;' |
    sed 's/  */ /g;s/^ *//g;s/ *$//g' |
    sed "s/^/>>$2<< /"
fi
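
Example usage, grounded in the final `sed "s/^/>>$2<< /"` line: without `noflags` the target-language token is prepended, which multilingual models use to select the output language:

    echo '你好？' | ./preprocess.sh zh en           # -> >>en<< 你好?
    echo '你好？' | ./preprocess.sh zh en noflags   # -> 你好?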

scripts/preprocess-txt.sh (new file, 44 lines)

@@ -0,0 +1,44 @@
#!/bin/bash
#
# USAGE preprocess.sh < input > output
#
## simple pre-processing steps adapted from Moses tools
sed -e 's/，/,/g' \
    -e 's/。 */. /g' \
    -e 's/、/,/g' \
    -e 's/”/"/g' \
    -e 's/“/"/g' \
    -e 's/∶/:/g' \
    -e 's/：/:/g' \
    -e 's/？/\?/g' \
    -e 's/《/"/g' \
    -e 's/》/"/g' \
    -e 's/）/\)/g' \
    -e 's/！/\!/g' \
    -e 's/（/\(/g' \
    -e 's/；/;/g' \
    -e 's/１/"/g' \
    -e 's/」/"/g' \
    -e 's/「/"/g' \
    -e 's/０/0/g' \
    -e 's/３/3/g' \
    -e 's/２/2/g' \
    -e 's/５/5/g' \
    -e 's/６/6/g' \
    -e 's/９/9/g' \
    -e 's/７/7/g' \
    -e 's/８/8/g' \
    -e 's/４/4/g' \
    -e 's/． */. /g' \
    -e 's/～/\~/g' \
    -e "s/’/\'/g" \
    -e 's/…/\.\.\./g' \
    -e 's/━/\-/g' \
    -e 's/〈/\</g' \
    -e 's/〉/\>/g' \
    -e 's/【/\[/g' \
    -e 's/】/\]/g' \
    -e 's/％/\%/g' |
perl -C -pe 's/\p{C}/ /g;' |
sed 's/  */ /g;s/^ *//g;s/ *$//g'
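
Example: full-width digits and punctuation are mapped to their ASCII counterparts:

    echo '２０２０？' | ./preprocess-txt.sh   # -> 2020?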