diff --git a/doc/Data.md b/doc/Data.md
index 9d292052..8e72a996 100644
--- a/doc/Data.md
+++ b/doc/Data.md
@@ -76,7 +76,6 @@ This command will in the standard setup
 
 * train sentence-piece models (separate for source and target language)
 * segment data sets using those sentence-piece models
 * applies some additional bitext cleaning (using Moses scripts)
-* word-align all training data (used for guided alignment)
 * create a vocabulary file for Marian-NMT
 
@@ -165,7 +164,8 @@ Currently, the makefile looks at the local copy of released OPUS data to find av
 
 Most settings can be adjusted by setting corresponding variables to new values. Common changes are:
 
-* don't run word-alignment: set `MODELTYPE=transformer`
+* run word-alignment and train with guided alignment: set `MODELTYPE=transformer-align`
+* use the sentence-piece models internally to define vocabularies: set `MODELTYPE=transformer-spm`
 * change the vocabulary size: set `BPESIZE=` for example BPESIZE=4000 (this is also used for sentence-piece models)
 * vocabulary sizes can also be set for source and target language independently (`SRCBPESIZE` and `TRGBPESIZE`)
 * use BPE instead of sentence-piece (not recommended): set `SUBWORDS=bpe`
diff --git a/lib/bpe.mk b/lib/bpe.mk
index 2bedc2f4..9ed7ac0b 100644
--- a/lib/bpe.mk
+++ b/lib/bpe.mk
@@ -15,18 +15,6 @@ bpe-models: ${BPESRCMODEL} ${BPETRGMODEL}
 ## ---> need to delete the old ones if we want to create new BPE models
 
-# BPESRCMODEL = ${TRAIN_SRC}.bpe${SRCBPESIZE:000=}k-model
-# BPETRGMODEL = ${TRAIN_TRG}.bpe${TRGBPESIZE:000=}k-model
-
-## NEW: always use the same name for the BPE models
-## --> avoid overwriting validation/test data with new segmentation models
-##     if a new data set is used
-BPESRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
-BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
-
-
-.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
-
 ## we keep the dependency on LOCAL_TRAIN_SRC
 ## to make multi-threaded make calls behave properly
 ## --> otherwise there can be multiple threads writing to the same file!
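
For illustration, the two model types documented above would be selected at invocation time roughly as follows. This is a sketch only: the language pair is invented, and the `SRCLANGS`/`TRGLANGS` variables and the `data`/`train` targets are the usual invocation pattern of this makefile, not part of the patch itself.

    # hypothetical example invocation, not part of the patch
    make SRCLANGS=fi TRGLANGS=en MODELTYPE=transformer-align data train
    make SRCLANGS=fi TRGLANGS=en MODELTYPE=transformer-spm   data train
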
diff --git a/lib/config.mk b/lib/config.mk
index 361eb3bd..aa6d362e 100644
--- a/lib/config.mk
+++ b/lib/config.mk
@@ -252,6 +252,23 @@ BPESIZE ?= 32000
 SRCBPESIZE ?= ${BPESIZE}
 TRGBPESIZE ?= ${BPESIZE}
 
+BPEMODELNAME ?= opus
+
+# BPESRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
+# BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
+BPESRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.${SRCBPESIZE:000=}k-model.bpe
+BPETRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.${TRGBPESIZE:000=}k-model.bpe
+
+# SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
+# SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
+SPMSRCMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.src.${SRCBPESIZE:000=}k-model.spm
+SPMTRGMODEL ?= ${WORKDIR}/train/${BPEMODELNAME}.trg.${TRGBPESIZE:000=}k-model.spm
+
+## declare the model files precious after the variables are set
+## (.PRECIOUS prerequisites are expanded when the line is read)
+.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
+.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
+
 VOCABSIZE ?= $$((${SRCBPESIZE} + ${TRGBPESIZE} + 1000))
 
 ## for document-level models
@@ -268,15 +285,7 @@ PRE_TRG = ${SUBWORDS}${TRGBPESIZE:000=}k
 ## default name of the data set (and the model)
 ##-------------------------------------
 
-ifndef DATASET
-  DATASET = opus
-endif
-
-ifndef BPEMODELNAME
-  BPEMODELNAME = opus
-endif
-
-
+DATASET ?= opus
 
 ## DATADIR  = directory where the train/dev/test data are
 ## WORKDIR  = directory used for training
@@ -338,8 +347,24 @@ MODEL_TRAINLOG = ${MODEL}.${MODELTYPE}.train${NR}.log
 
 MODEL_START   = ${WORKDIR}/${MODEL_BASENAME}.npz
 MODEL_FINAL   = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
 MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
-MODEL_VOCABTYPE = yml
-MODEL_VOCAB   = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
+
+ifeq (${MODELTYPE},transformer-spm)
+  MODEL_VOCABTYPE = spm
+  MODEL_VOCAB     = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
+  MODEL_SRCVOCAB  = ${SPMSRCMODEL}
+  MODEL_TRGVOCAB  = ${SPMTRGMODEL}
+# MODEL_SRCVOCAB  = ${MODEL_VOCAB}
+# MODEL_TRGVOCAB  = ${MODEL_VOCAB}
+else
+  MODEL_VOCABTYPE = yml
+  MODEL_VOCAB     = ${WORKDIR}/${MODEL}.vocab.${MODEL_VOCABTYPE}
+  MODEL_SRCVOCAB  = ${MODEL_VOCAB}
+  MODEL_TRGVOCAB  = ${MODEL_VOCAB}
+endif
+
+
+
+
 
 ## latest model with the same pre-processing but any data or modeltype
diff --git a/lib/dist.mk b/lib/dist.mk
index 52b1a10f..5401790c 100644
--- a/lib/dist.mk
+++ b/lib/dist.mk
@@ -151,6 +151,11 @@ ifneq ("$(wildcard ${BPESRCMODEL})","")
   PREPROCESS_SRCMODEL = ${BPESRCMODEL}
   PREPROCESS_TRGMODEL = ${BPETRGMODEL}
   PREPROCESS_DESCRIPTION = normalization + tokenization + BPE (${PRE_SRC},${PRE_TRG})
+else ifeq (${MODELTYPE},transformer-spm)
+  PREPROCESS_TYPE = txt
+  PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
+  PREPROCESS_TRGMODEL = ${SPMTRGMODEL}
+  PREPROCESS_DESCRIPTION = normalization + built-in SentencePiece (${PRE_SRC},${PRE_TRG})
 else
   PREPROCESS_TYPE = spm
   PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
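
Two details of the config.mk changes above are easy to miss. First, the substitution reference `${SRCBPESIZE:000=}k` strips a trailing `000` from the size, so the default `BPESIZE=32000` yields a `32k` component in the model file names. A minimal sketch (the demo makefile path and `all` target are invented for illustration):

    cat > /tmp/subst-demo.mk <<'EOF'
    SRCBPESIZE = 32000
    $(info ${SRCBPESIZE:000=}k)
    all: ; @true
    EOF
    make -f /tmp/subst-demo.mk    # prints "32k"

Second, the renaming moves `.bpe`/`.spm` to the end of the file name: with `MODELTYPE=transformer-spm` the sentence-piece model files themselves become `MODEL_SRCVOCAB`/`MODEL_TRGVOCAB`, and Marian switches to its built-in SentencePiece handling when a vocabulary file has the `.spm` extension, which is presumably the point of the new naming scheme.
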
diff --git a/lib/sentencepiece.mk b/lib/sentencepiece.mk
index 02627324..14aecb69 100644
--- a/lib/sentencepiece.mk
+++ b/lib/sentencepiece.mk
@@ -22,19 +22,9 @@ spm-models: ${SPMSRCMODEL} ${SPMTRGMODEL}
 
 
-# SPMSRCMODEL = ${TRAIN_SRC}.spm${SRCBPESIZE:000=}k-model
-# SPMTRGMODEL = ${TRAIN_TRG}.spm${TRGBPESIZE:000=}k-model
-
-## NEW: always use the same name for the SPM models
-## --> avoid overwriting validation/test data with new segmentation models
-##     if a new data set is used
-SPMSRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
-SPMTRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
 
 # SPMEXTRA = --split_by_whitespace=false
 SPMEXTRA = 
 
-.PRECIOUS: ${SPMSRCMODEL} ${SPMTRGMODEL}
-
 ## set to 1 if you want to generate SPM vocab file
 GENERATE_SPM_VOC = 0
 
diff --git a/lib/train.mk b/lib/train.mk
index 7fb26891..ae4290db 100644
--- a/lib/train.mk
+++ b/lib/train.mk
@@ -176,3 +176,69 @@ endif
 
 
 
+
+## undo the subword segmentation of the pre-processed training data
+## (the transformer-spm rule below trains on raw, de-segmented text)
+${TRAIN_SRC}.clean${TRAINSIZE}.gz: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz
+	${ZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
+	sed 's/\@\@ //g;s/ \@\@//g;s/ \@\-\@ /-/g' | ${GZIP} -c > $@
+
+${TRAIN_TRG}.clean${TRAINSIZE}.gz: ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz
+	${ZCAT} $< | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
+	sed 's/\@\@ //g;s/ \@\@//g;s/ \@\-\@ /-/g' | ${GZIP} -c > $@
+
+
+## train transformer model (Marian applies the sentence-piece models internally)
+${WORKDIR}/${MODEL}.transformer-spm.model${NR}.done: \
+	${TRAIN_SRC}.clean${TRAINSIZE}.gz \
+	${TRAIN_TRG}.clean${TRAINSIZE}.gz \
+	${DEV_SRC} ${DEV_TRG}
+	mkdir -p ${dir $@}
+##--------------------------------------------------------------------
+## in case we want to continue training from the latest existing model
+## (check lib/config.mk to see how the latest model is found)
+##--------------------------------------------------------------------
ifeq (${wildcard ${MODEL_START}},)
ifneq (${MODEL_LATEST},)
ifneq (${MODEL_LATEST_VOCAB},)
+	cp ${MODEL_LATEST_VOCAB} ${MODEL_VOCAB}
+	cp ${MODEL_LATEST} ${MODEL_START}
endif
endif
endif
+##--------------------------------------------------------------------
+	${LOADMODS} && ${MARIAN_TRAIN} ${MARIAN_EXTRA} \
+	--model $(@:.done=.npz) \
+	--type transformer \
+	--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
+	--max-length 500 \
+	--vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
+	--mini-batch-fit \
+	-w ${MARIAN_WORKSPACE} \
+	--maxi-batch ${MARIAN_MAXI_BATCH} \
+	--early-stopping ${MARIAN_EARLY_STOPPING} \
+	--valid-freq ${MARIAN_VALID_FREQ} \
+	--save-freq ${MARIAN_SAVE_FREQ} \
+	--disp-freq ${MARIAN_DISP_FREQ} \
+	--valid-sets ${word 3,$^} ${word 4,$^} \
+	--valid-metrics perplexity \
+	--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
+	--beam-size 12 --normalize 1 --allow-unk \
+	--log $(@:.model${NR}.done=.train${NR}.log) \
+	--valid-log $(@:.model${NR}.done=.valid${NR}.log) \
+	--enc-depth 6 --dec-depth 6 \
+	--transformer-heads 8 \
+	--transformer-postprocess-emb d \
+	--transformer-postprocess dan \
+	--transformer-dropout ${MARIAN_DROPOUT} \
+	--label-smoothing 0.1 \
+	--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
+	--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
+	--tied-embeddings-all \
+	--overwrite --keep-best \
+	--devices ${MARIAN_GPUS} \
+	--sync-sgd --seed ${SEED} \
+	--sqlite \
+	--tempdir ${TMPDIR} \
+	--exponential-smoothing
+	touch $@
diff --git a/scripts/postprocess-txt.sh b/scripts/postprocess-txt.sh
new file mode 100644
index 00000000..cf2cd6b2
--- /dev/null
+++ b/scripts/postprocess-txt.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+#
+# USAGE postprocess.sh < input > output
+#
+
+perl -C -pe 's/\p{C}/ /g;'
diff --git a/scripts/preprocess-spm-multi-target.sh b/scripts/preprocess-spm-multi-target.sh
index 0dba53f4..d593ac4d 100755
--- a/scripts/preprocess-spm-multi-target.sh
+++ b/scripts/preprocess-spm-multi-target.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
+# USAGE preprocess.sh source-langid target-langid spmodel [noflags] < input > output
 #
 #
 # replace SPMENCODE with your own setup!
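
The two `.clean${TRAINSIZE}.gz` rules in lib/train.mk above reverse the subword segmentation: the first sed call drops the spaces between sentence-piece tokens and turns the `▁` markers back into spaces, the second removes BPE's `@@` continuation markers (the doubled `$$` is only make's escaping of `$`). A sketch of both pipelines on invented input strings:

    # SentencePiece: drop inter-token spaces, turn ▁ markers back into spaces
    echo '▁This ▁is ▁an ▁ex ample .' | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$//'
    # -> This is an example.

    # BPE: remove @@ continuation markers and restore escaped hyphens
    echo 'th@@ is is a te@@ st' | sed 's/\@\@ //g;s/ \@\@//g;s/ \@\-\@ /-/g'
    # -> this is a test
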
diff --git a/scripts/preprocess-spm.sh b/scripts/preprocess-spm.sh
index 60cb73b3..82a05d7b 100755
--- a/scripts/preprocess-spm.sh
+++ b/scripts/preprocess-spm.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# USAGE preprocess.sh langid bpecodes < input > output
+# USAGE preprocess.sh langid spmodel < input > output
 #
 # replace SPMENCODE with your own setup!
 
diff --git a/scripts/preprocess-txt-multi-target.sh b/scripts/preprocess-txt-multi-target.sh
new file mode 100644
index 00000000..35ebd422
--- /dev/null
+++ b/scripts/preprocess-txt-multi-target.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+#
+# USAGE preprocess.sh source-langid target-langid [noflags] < input > output
+#
+#
+
+if [ "$3" == "noflags" ]; then
+  sed -e 's/,/,/g' \
+      -e 's/。 */. /g' \
+      -e 's/、/,/g' \
+      -e 's/”/"/g' \
+      -e 's/“/"/g' \
+      -e 's/∶/:/g' \
+      -e 's/:/:/g' \
+      -e 's/?/\?/g' \
+      -e 's/《/"/g' \
+      -e 's/》/"/g' \
+      -e 's/)/\)/g' \
+      -e 's/!/\!/g' \
+      -e 's/(/\(/g' \
+      -e 's/;/;/g' \
+      -e 's/1/"/g' \
+      -e 's/」/"/g' \
+      -e 's/「/"/g' \
+      -e 's/0/0/g' \
+      -e 's/3/3/g' \
+      -e 's/2/2/g' \
+      -e 's/5/5/g' \
+      -e 's/6/6/g' \
+      -e 's/9/9/g' \
+      -e 's/7/7/g' \
+      -e 's/8/8/g' \
+      -e 's/4/4/g' \
+      -e 's/. */. /g' \
+      -e 's/~/\~/g' \
+      -e "s/’/\'/g" \
+      -e 's/…/\.\.\./g' \
+      -e 's/━/\-/g' \
+      -e 's/〈/\</g' \
+      -e 's/〉/\>/g' \
+      -e 's/【/\[/g' \
+      -e 's/】/\]/g' \
+      -e 's/%/\%/g' |
+  perl -C -pe 's/\p{C}/ /g;' |
+  sed 's/  */ /g;s/^ *//g;s/ *$//g'
+else
+  sed -e 's/,/,/g' \
+      -e 's/。 */. /g' \
+      -e 's/、/,/g' \
+      -e 's/”/"/g' \
+      -e 's/“/"/g' \
+      -e 's/∶/:/g' \
+      -e 's/:/:/g' \
+      -e 's/?/\?/g' \
+      -e 's/《/"/g' \
+      -e 's/》/"/g' \
+      -e 's/)/\)/g' \
+      -e 's/!/\!/g' \
+      -e 's/(/\(/g' \
+      -e 's/;/;/g' \
+      -e 's/1/"/g' \
+      -e 's/」/"/g' \
+      -e 's/「/"/g' \
+      -e 's/0/0/g' \
+      -e 's/3/3/g' \
+      -e 's/2/2/g' \
+      -e 's/5/5/g' \
+      -e 's/6/6/g' \
+      -e 's/9/9/g' \
+      -e 's/7/7/g' \
+      -e 's/8/8/g' \
+      -e 's/4/4/g' \
+      -e 's/. */. /g' \
+      -e 's/~/\~/g' \
+      -e "s/’/\'/g" \
+      -e 's/…/\.\.\./g' \
+      -e 's/━/\-/g' \
+      -e 's/〈/\</g' \
+      -e 's/〉/\>/g' \
+      -e 's/【/\[/g' \
+      -e 's/】/\]/g' \
+      -e 's/%/\%/g' |
+  perl -C -pe 's/\p{C}/ /g;' |
+  sed 's/  */ /g;s/^ *//g;s/ *$//g' |
+  sed "s/^/>>$2<< /"
+fi
+
diff --git a/scripts/preprocess-txt.sh b/scripts/preprocess-txt.sh
new file mode 100644
index 00000000..04d50882
--- /dev/null
+++ b/scripts/preprocess-txt.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# USAGE preprocess.sh < input > output
+#
+## simple pre-processing steps adapted from Moses tools
+
+sed -e 's/,/,/g' \
+    -e 's/。 */. /g' \
+    -e 's/、/,/g' \
+    -e 's/”/"/g' \
+    -e 's/“/"/g' \
+    -e 's/∶/:/g' \
+    -e 's/:/:/g' \
+    -e 's/?/\?/g' \
+    -e 's/《/"/g' \
+    -e 's/》/"/g' \
+    -e 's/)/\)/g' \
+    -e 's/!/\!/g' \
+    -e 's/(/\(/g' \
+    -e 's/;/;/g' \
+    -e 's/1/"/g' \
+    -e 's/」/"/g' \
+    -e 's/「/"/g' \
+    -e 's/0/0/g' \
+    -e 's/3/3/g' \
+    -e 's/2/2/g' \
+    -e 's/5/5/g' \
+    -e 's/6/6/g' \
+    -e 's/9/9/g' \
+    -e 's/7/7/g' \
+    -e 's/8/8/g' \
+    -e 's/4/4/g' \
+    -e 's/. */. /g' \
+    -e 's/~/\~/g' \
+    -e "s/’/\'/g" \
+    -e 's/…/\.\.\./g' \
+    -e 's/━/\-/g' \
+    -e 's/〈/\</g' \
+    -e 's/〉/\>/g' \
+    -e 's/【/\[/g' \
+    -e 's/】/\]/g' \
+    -e 's/%/\%/g' |
+perl -C -pe 's/\p{C}/ /g;' |
+sed 's/  */ /g;s/^ *//g;s/ *$//g'
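
The new txt preprocessing scripts reproduce the fullwidth-punctuation normalization of Moses' replace-unicode-punctuation.perl as sed rules (apparently including its historical mapping of fullwidth 1 to a double quote), then replace control characters with spaces and collapse whitespace; the multi-target variant additionally prepends the `>>trg<<` target-language token unless `noflags` is given. Note that `\p{C}` also matches the line-terminating newline itself, so these filters are best applied one sentence per line at a time. A usage sketch with invented input, assuming the scripts are run from the repository root:

    echo '“Hello”, 世界!(2020)' | bash scripts/preprocess-txt.sh
    # -> "Hello", 世界!(2020)

    echo 'Hello!' | bash scripts/preprocess-txt-multi-target.sh en fr
    # -> >>fr<< Hello!
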