internal SentencePiece models in transformers
commit 666b2b8462 (parent ddafb43d66)
@@ -76,7 +76,6 @@ This command will in the standard setup

* train sentence-piece models (separate for source and target language)
* segment data sets using those sentence-piece models
* apply some additional bitext cleaning (using Moses scripts)
* word-align all training data (used for guided alignment)
* create a vocabulary file for Marian-NMT
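For orientation, the sentence-piece steps correspond roughly to the standard SentencePiece command-line tools, as in the minimal sketch below (the file names here are made up; the actual recipes live in `lib/sentencepiece.mk` and `lib/bpe.mk`):

```bash
# train a source-language model (the setup trains one model per language)
spm_train --input=train.src --model_prefix=opus.src --vocab_size=32000
# segment the training data with that model
spm_encode --model=opus.src.model < train.src > train.src.spm
```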
@@ -165,7 +164,8 @@ Currently, the makefile looks at the local copy of released OPUS data to find av

Most settings can be adjusted by setting the corresponding variables to new values. Common changes are:

* don't run word alignment: set `MODELTYPE=transformer`
* run word alignment and train with guided alignment: set `MODELTYPE=transformer-align`
* use the sentence-piece model internally to define the vocabularies: set `MODELTYPE=transformer-spm`
* change the vocabulary size: set `BPESIZE=<yourvalue>`, for example `BPESIZE=4000` (this value is also used for sentence-piece models)
* set the vocabulary sizes for source and target language independently (`SRCBPESIZE` and `TRGBPESIZE`)
* use BPE instead of sentence-piece (not recommended): set `SUBWORDS=bpe`
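For example, a run that picks the internal SentencePiece setup with a small shared vocabulary could look like the line below (a hypothetical invocation: the `train` goal and any required language variables come from the surrounding Makefile, not from this diff):

```bash
make MODELTYPE=transformer-spm SRCBPESIZE=4000 TRGBPESIZE=4000 train
```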

lib/bpe.mk (12 changed lines)
@@ -15,18 +15,6 @@ bpe-models: ${BPESRCMODEL} ${BPETRGMODEL}
## ---> need to delete the old ones if we want to create new BPE models

# BPESRCMODEL = ${TRAIN_SRC}.bpe${SRCBPESIZE:000=}k-model
# BPETRGMODEL = ${TRAIN_TRG}.bpe${TRGBPESIZE:000=}k-model

## NEW: always use the same name for the BPE models
## --> avoid overwriting validation/test data with new segmentation models
##     if a new data set is used
BPESRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
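## note (editorial): ${SRCBPESIZE:000=} is a make substitution reference that strips
## a trailing "000" from the size, so the default SRCBPESIZE=32000 yields a file
## called opus.src.bpe32k-model under ${WORKDIR}/train (with BPEMODELNAME=opus)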

.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}

## we keep the dependency on LOCAL_TRAIN_SRC
## to make multi-threaded make calls behave properly
## --> otherwise there can be multiple threads writing to the same file!

@@ -252,6 +252,23 @@ BPESIZE ?= 32000
SRCBPESIZE ?= ${BPESIZE}
TRGBPESIZE ?= ${BPESIZE}

BPEMODELNAME ?= opus

.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
# BPESRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
# BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
BPESRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.${SRCBPESIZE:000=}k-model.bpe
BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.${TRGBPESIZE:000=}k-model.bpe

.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
# SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
# SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.${SRCBPESIZE:000=}k-model.spm
SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.${TRGBPESIZE:000=}k-model.spm

VOCABSIZE ?= $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
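## note (editorial): with the default BPESIZE=32000 above this evaluates to
## 32000 + 32000 + 1000 = 65000 entries for the joint vocabulary
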
## for document-level models
@@ -268,15 +285,7 @@ PRE_TRG = ${SUBWORDS}${TRGBPESIZE:000=}k

## default name of the data set (and the model)
##-------------------------------------

ifndef DATASET
DATASET = opus
endif

ifndef BPEMODELNAME
BPEMODELNAME = opus
endif

DATASET ?= opus
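## note (editorial): "DATASET ?= opus" is the replacement for the ifndef/endif
## blocks above; the ?= operator only assigns when the variable is not already set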

## DATADIR = directory where the train/dev/test data are
## WORKDIR = directory used for training

@@ -338,8 +347,24 @@ MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
MODEL_VOCABTYPE = yml
MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}

ifeq (${MODELTYPE},transformer-spm)
  MODEL_VOCABTYPE = spm
  MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
  MODEL_SRCVOCAB = ${SPMSRCMODEL}
  MODEL_TRGVOCAB = ${SPMTRGMODEL}
# MODEL_SRCVOCAB = ${MODEL_VOCAB}
# MODEL_TRGVOCAB = ${MODEL_VOCAB}
else
  MODEL_VOCABTYPE = yml
  MODEL_VOCAB = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
  MODEL_SRCVOCAB = ${MODEL_VOCAB}
  MODEL_TRGVOCAB = ${MODEL_VOCAB}
endif
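
## note (editorial): for MODELTYPE=transformer-spm the model thus gets two separate
## SentencePiece models as source/target vocabularies, while all other model types
## share a single joint .yml vocabulary file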

## latest model with the same pre-processing but any data or modeltype

@@ -151,6 +151,11 @@ ifneq ("$(wildcard ${BPESRCMODEL})","")
PREPROCESS_SRCMODEL = ${BPESRCMODEL}
PREPROCESS_TRGMODEL = ${BPETRGMODEL}
PREPROCESS_DESCRIPTION = normalization + tokenization + BPE (${PRE_SRC},${PRE_TRG})
else ifeq (${MODELTYPE},transformer-spm)
PREPROCESS_TYPE = txt
PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
PREPROCESS_TRGMODEL = ${SPMTRGMODEL}
PREPROCESS_DESCRIPTION = normalization + in-built SentencePiece (${PRE_SRC},${PRE_TRG})
else
PREPROCESS_TYPE = spm
PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
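
## note (editorial): PREPROCESS_TYPE=txt suggests that for transformer-spm the
## released package ships raw-text pre-processing together with the .spm model
## files, since segmentation happens inside Marian rather than in an external
## spm_encode step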
@@ -22,19 +22,9 @@

spm-models: ${SPMSRCMODEL} ${SPMTRGMODEL}

# SPMSRCMODEL = ${TRAIN_SRC}.spm${SRCBPESIZE:000=}k-model
# SPMTRGMODEL = ${TRAIN_TRG}.spm${TRGBPESIZE:000=}k-model

## NEW: always use the same name for the SPM models
## --> avoid overwriting validation/test data with new segmentation models
##     if a new data set is used
SPMSRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
# SPMEXTRA = --split_by_whitespace=false
SPMEXTRA =

.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}

## set to 1 if you want to generate SPM vocab file
GENERATE_SPM_VOC = 0

lib/train.mk (66 changed lines)
@@ -176,3 +176,69 @@ endif


${TRAIN_SRC}.clean${TRAINSIZE}.gz: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz
	${ZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
	sed 's/\@\@ //g;s/ \@\@//g;s/ \@\-\@ /-/g' | ${GZIP} -c > $@

${TRAIN_TRG}.clean${TRAINSIZE}.gz: ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
	${ZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
	sed 's/\@\@ //g;s/ \@\@//g;s/ \@\-\@ /-/g' | ${GZIP} -c > $@
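
## note (editorial): the rules above undo the existing subword segmentation so the
## in-built SentencePiece model sees plain text again; for the spm case, e.g.
##   echo '▁This ▁is ▁a ▁test' | sed 's/ //g;s/▁/ /g'
## prints " This is a test" (the second sed trims the leading space), and the
## \@\@ patterns remove left-over BPE markers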
## train transformer model
${WORKDIR}/${MODEL}.transformer-spm.model${NR}.done: \
		${TRAIN_SRC}.clean${TRAINSIZE}.gz \
		${TRAIN_TRG}.clean${TRAINSIZE}.gz \
		${DEV_SRC} ${DEV_TRG}
	mkdir -p ${dir $@}
##--------------------------------------------------------------------
## in case we want to continue training from the latest existing model
## (check lib/config.mk to see how the latest model is found)
##--------------------------------------------------------------------
ifeq (${wildcard ${MODEL_START}},)
ifneq (${MODEL_LATEST},)
ifneq (${MODEL_LATEST_VOCAB},)
	cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
	cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
##--------------------------------------------------------------------
	${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
		--model $(@:.done=.npz) \
		--type transformer \
		--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
		--max-length 500 \
		--vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		--mini-batch-fit \
		-w ${MARIAN_WORKSPACE} \
		--maxi-batch ${MARIAN_MAXI_BATCH} \
		--early-stopping ${MARIAN_EARLY_STOPPING} \
		--valid-freq ${MARIAN_VALID_FREQ} \
		--save-freq ${MARIAN_SAVE_FREQ} \
		--disp-freq ${MARIAN_DISP_FREQ} \
		--valid-sets ${word 3,$^} ${word 4,$^} \
		--valid-metrics perplexity \
		--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
		--beam-size 12 --normalize 1 --allow-unk \
		--log $(@:.model${NR}.done=.train${NR}.log) \
		--valid-log $(@:.model${NR}.done=.valid${NR}.log) \
		--enc-depth 6 --dec-depth 6 \
		--transformer-heads 8 \
		--transformer-postprocess-emb d \
		--transformer-postprocess dan \
		--transformer-dropout ${MARIAN_DROPOUT} \
		--label-smoothing 0.1 \
		--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
		--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
		--tied-embeddings-all \
		--overwrite --keep-best \
		--devices ${MARIAN_GPUS} \
		--sync-sgd --seed ${SEED} \
		--sqlite \
		--tempdir ${TMPDIR} \
		--exponential-smoothing
	touch $@
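
## note (editorial): because --vocabs receives the *.spm files for this model type,
## Marian segments the raw input on the fly with its in-built SentencePiece support;
## a plausible call would be "make MODELTYPE=transformer-spm train" (goal name assumed,
## not shown in this diff)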

scripts/postprocess-txt.sh (new file, 6 lines)
@@ -0,0 +1,6 @@
#!/bin/bash
#
# USAGE postprocess-txt.sh < input > output
#

perl -C -pe 's/\p{C}/ /g;'
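
# note (editorial): \p{C} matches Unicode "Other" characters (control and format),
# so invisible artefacts such as a zero-width space (U+200B) or a stray BOM (U+FEFF)
# in the decoder output come out as plain spaces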

@@ -1,6 +1,6 @@
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
# USAGE preprocess.sh source-langid target-langid spmodel [noflags] < input > output
#
#
# replace SPMENCODE with your own setup!
@@ -1,6 +1,6 @@
#!/bin/bash
#
# USAGE preprocess.sh langid bpecodes < input > output
# USAGE preprocess.sh langid spmodel < input > output
#
# replace SPMENCODE with your own setup!

scripts/preprocess-txt-multi-target.sh (new file, 87 lines)
@@ -0,0 +1,87 @@
#!/bin/bash
#
# USAGE preprocess-txt-multi-target.sh source-langid target-langid [noflags] < input > output
#
## simple pre-processing steps adapted from Moses tools

if [ "$3" == "noflags" ]; then
  sed -e 's/,/,/g' \
      -e 's/。 */. /g' \
      -e 's/、/,/g' \
      -e 's/”/"/g' \
      -e 's/“/"/g' \
      -e 's/∶/:/g' \
      -e 's/:/:/g' \
      -e 's/?/\?/g' \
      -e 's/《/"/g' \
      -e 's/》/"/g' \
      -e 's/)/\)/g' \
      -e 's/!/\!/g' \
      -e 's/(/\(/g' \
      -e 's/;/;/g' \
      -e 's/1/"/g' \
      -e 's/」/"/g' \
      -e 's/「/"/g' \
      -e 's/0/0/g' \
      -e 's/3/3/g' \
      -e 's/2/2/g' \
      -e 's/5/5/g' \
      -e 's/6/6/g' \
      -e 's/9/9/g' \
      -e 's/7/7/g' \
      -e 's/8/8/g' \
      -e 's/4/4/g' \
      -e 's/. */. /g' \
      -e 's/~/\~/g' \
      -e "s/’/\'/g" \
      -e 's/…/\.\.\./g' \
      -e 's/━/\-/g' \
      -e 's/〈/\</g' \
      -e 's/〉/\>/g' \
      -e 's/【/\[/g' \
      -e 's/】/\]/g' \
      -e 's/%/\%/g' |
  perl -C -pe 's/\p{C}/ /g;' |
  sed 's/  */ /g;s/^ *//g;s/ *$//g'
else
  sed -e 's/,/,/g' \
      -e 's/。 */. /g' \
      -e 's/、/,/g' \
      -e 's/”/"/g' \
      -e 's/“/"/g' \
      -e 's/∶/:/g' \
      -e 's/:/:/g' \
      -e 's/?/\?/g' \
      -e 's/《/"/g' \
      -e 's/》/"/g' \
      -e 's/)/\)/g' \
      -e 's/!/\!/g' \
      -e 's/(/\(/g' \
      -e 's/;/;/g' \
      -e 's/1/"/g' \
      -e 's/」/"/g' \
      -e 's/「/"/g' \
      -e 's/0/0/g' \
      -e 's/3/3/g' \
      -e 's/2/2/g' \
      -e 's/5/5/g' \
      -e 's/6/6/g' \
      -e 's/9/9/g' \
      -e 's/7/7/g' \
      -e 's/8/8/g' \
      -e 's/4/4/g' \
      -e 's/. */. /g' \
      -e 's/~/\~/g' \
      -e "s/’/\'/g" \
      -e 's/…/\.\.\./g' \
      -e 's/━/\-/g' \
      -e 's/〈/\</g' \
      -e 's/〉/\>/g' \
      -e 's/【/\[/g' \
      -e 's/】/\]/g' \
      -e 's/%/\%/g' |
  perl -C -pe 's/\p{C}/ /g;' |
  sed 's/  */ /g;s/^ *//g;s/ *$//g' |
  sed "s/^/>>$2<< /"
fi
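
# note (editorial): without "noflags" the last sed prepends the target-language
# token that multilingual OPUS-MT models use to select the output language, e.g.
#   echo "Hello world!" | ./preprocess-txt-multi-target.sh en de
# prints ">>de<< Hello world!" (with the fullwidth "!" normalized to ASCII)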

scripts/preprocess-txt.sh (new file, 44 lines)
@@ -0,0 +1,44 @@
#!/bin/bash
#
# USAGE preprocess-txt.sh < input > output
#
## simple pre-processing steps adapted from Moses tools

sed -e 's/,/,/g' \
    -e 's/。 */. /g' \
    -e 's/、/,/g' \
    -e 's/”/"/g' \
    -e 's/“/"/g' \
    -e 's/∶/:/g' \
    -e 's/:/:/g' \
    -e 's/?/\?/g' \
    -e 's/《/"/g' \
    -e 's/》/"/g' \
    -e 's/)/\)/g' \
    -e 's/!/\!/g' \
    -e 's/(/\(/g' \
    -e 's/;/;/g' \
    -e 's/1/"/g' \
    -e 's/」/"/g' \
    -e 's/「/"/g' \
    -e 's/0/0/g' \
    -e 's/3/3/g' \
    -e 's/2/2/g' \
    -e 's/5/5/g' \
    -e 's/6/6/g' \
    -e 's/9/9/g' \
    -e 's/7/7/g' \
    -e 's/8/8/g' \
    -e 's/4/4/g' \
    -e 's/. */. /g' \
    -e 's/~/\~/g' \
    -e "s/’/\'/g" \
    -e 's/…/\.\.\./g' \
    -e 's/━/\-/g' \
    -e 's/〈/\</g' \
    -e 's/〉/\>/g' \
    -e 's/【/\[/g' \
    -e 's/】/\]/g' \
    -e 's/%/\%/g' |
perl -C -pe 's/\p{C}/ /g;' |
sed 's/  */ /g;s/^ *//g;s/ *$//g'
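
# note (editorial): a quick sanity check of the normalization, assuming the
# script is executable and on PATH:
#   echo "Hello, world!" | preprocess-txt.sh
# prints "Hello, world!" with the fullwidth comma and exclamation mark mapped
# to their ASCII counterparts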