From bc54b403cdf3a91ab0c48cce6579758c1aa296d3 Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Tue, 18 Jan 2022 14:41:17 +0200 Subject: [PATCH] student model quantisation finetuning added --- .gitmodules | 3 ++ Makefile | 4 +- NOTES.md | 15 ++++++ lib/config.mk | 11 ++++- lib/data.mk | 21 ++++++-- lib/env.mk | 9 ++++ lib/env/mahti.mk | 1 + lib/tasks.mk | 4 +- lib/test.mk | 1 - lib/train.mk | 77 ++++++++++++++++++++++++++++++ tatoeba/forward-translate/Makefile | 6 +-- tools/browsermt/marian-dev | 1 + 12 files changed, 140 insertions(+), 13 deletions(-) create mode 160000 tools/browsermt/marian-dev diff --git a/.gitmodules b/.gitmodules index e2664968..20c987f8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -31,3 +31,6 @@ [submodule "OPUS-MT-testsets"] path = OPUS-MT-testsets url = https://github.com/Helsinki-NLP/OPUS-MT-testsets.git +[submodule "tools/browsermt/marian-dev"] + path = tools/browsermt/marian-dev + url = https://github.com/browsermt/marian-dev.git diff --git a/Makefile b/Makefile index 216ab86c..9e7f1204 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,9 @@ include lib/tasks.mk include lib/projects.mk .PHONY: all -all: ${WORKDIR}/${MODELCONFIG} +all: + ${MAKE} rawdata + ${MAKE} ${WORKDIR}/${MODELCONFIG} ${MAKE} data ${MAKE} train ${MAKE} eval diff --git a/NOTES.md b/NOTES.md index fdab05c1..8ed2817e 100644 --- a/NOTES.md +++ b/NOTES.md @@ -13,9 +13,19 @@ Need better data filtering: * stronger filters in cleanup script? * idea: compare character diversity between the two languages and use a threshold to filter sentences? (language-specific?) +links and tools: + +* https://github.com/ZJaume/clean +* https://github.com/Helsinki-NLP/OPUS-MT-distillation + # more efficient parallelisation +* https://www.gnu.org/software/parallel/ +* https://www.gnu.org/software/parallel/parallel_tutorial.html +* https://www.gnu.org/software/bash/manual/html_node/GNU-Parallel.html +* multinode training with MarianNMT: https://github.com/marian-nmt/marian/issues/244 + from Bergamot: https://github.com/browsermt/students/blob/master/train-student/alignment/generate-alignment-and-shortlist.sh @@ -25,6 +35,11 @@ test -s $DIR/corpus.spm.$SRC || cat $CORPUS_SRC | pigz -dc | parallel --no-notic test -s $DIR/corpus.spm.$TRG || cat $CORPUS_TRG | pigz -dc | parallel --no-notice --pipe -k -j16 --block 50M "$MARIAN/spm_encode --model $VOCAB" > $DIR/corpus.spm.$TRG ``` +# Benchmarking + +* SOTA-bench forum: https://forum.sotabench.com/ + + # OPUS-MT at huggingface diff --git a/lib/config.mk b/lib/config.mk index 4cb089bc..ea1afc57 100644 --- a/lib/config.mk +++ b/lib/config.mk @@ -18,9 +18,11 @@ MODELTYPES = transformer \ transformer-base-align \ transformer-big \ transformer-big-align \ + transformer-small \ transformer-small-align \ transformer-tiny \ transformer-tiny-align \ + transformer-tiny11 \ transformer-tiny11-align ## default model type @@ -450,8 +452,13 @@ MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz MODEL_DECODER = ${MODEL_FINAL}.decoder.yml +MODEL_BIN = ${WORKDIR}/${MODEL_BASENAME}.intgemm8.bin +MODEL_INTGEMM8TUNED = ${WORKDIR}/${MODEL_BASENAME}.intgemm8tuned.npz +MODEL_BIN_ALPHAS = ${WORKDIR}/${MODEL_BASENAME}.intgemm8.alphas.bin +.PRECIOUS: ${MODEL_FINAL} ${MODEL_BIN} + ## for sentence-piece models: get plain text vocabularies ## for others: extract vocabulary from training data with MarianNMT @@ -671,7 +678,7 @@ ${WORKDIR}/${MODELCONFIG}: fi; \ if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \ echo "# ${LANGPAIRSTR} training data 
bigger than ${LARGEST_TRAINSIZE}" > $@; \ - echo "GPUJOB_HPC_MEM = 8g" >> $@; \ + echo "GPUJOB_HPC_MEM = 16g" >> $@; \ echo "GPUJOB_SUBMIT = -gpu01" >> $@; \ echo "SUBWORD_VOCAB_SIZE = ${SUBWORD_VOCAB_SIZE}" >> $@; \ echo "DEVSIZE = ${DEVSIZE}" >> $@; \ @@ -679,7 +686,7 @@ ${WORKDIR}/${MODELCONFIG}: echo "DEVMINSIZE = ${DEVMINSIZE}" >> $@; \ elif [ $$s -gt ${LARGE_TRAINSIZE} ]; then \ echo "# ${LANGPAIRSTR} training data bigger than ${LARGE_TRAINSIZE}" > $@; \ - echo "GPUJOB_HPC_MEM = 8g" >> $@; \ + echo "GPUJOB_HPC_MEM = 12g" >> $@; \ echo "GPUJOB_SUBMIT = " >> $@; \ echo "MARIAN_VALID_FREQ = 2500" >> $@; \ echo "SUBWORD_VOCAB_SIZE = ${SUBWORD_VOCAB_SIZE}" >> $@; \ diff --git a/lib/data.mk b/lib/data.mk index 0acedf68..2585c833 100644 --- a/lib/data.mk +++ b/lib/data.mk @@ -86,7 +86,7 @@ endif # filtered by reconstruction scores (ce filter) ifneq (${USE_FORWARDTRANS_SELECTED},) FORWARDTRANS_SRC += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}} - FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}} + FORWARDTRANS_TRG += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${TRGEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}} endif # forward-translation data of monolingual data (source-to-target) @@ -118,11 +118,21 @@ endif print-datasets: @echo ${TATOEBA_TRAINSET} @echo ${TRAINSET} + @echo "all data:" @echo ${CLEAN_TRAIN_SRC} + @echo ${CLEAN_TRAIN_TRG} + @echo "back-translation data:" @echo ${BACKTRANS_SRC} + @echo ${BACKTRANS_TRG} + @echo "forward translation data:" @echo ${FORWARDTRANS_SRC} + @echo ${FORWARDTRANS_TRG} + @echo "monolingual forward translation data:" @echo ${FORWARDTRANSMONO_SRC} + @echo ${FORWARDTRANSMONO_TRG} + @echo "pivot-based translation data:" @echo ${PIVOTING_SRC} + @echo ${PIVOTING_TRG} ##------------------------------------------------------------- ## data sets (train/dev/test) @@ -133,7 +143,8 @@ print-datasets: CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${SRCEXT}.gz,${TRAINSET}} \ ${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC} -CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}} +CLEAN_TRAIN_TRG = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${TRGEXT}.gz,${TRAINSET}} \ + ${BACKTRANS_TRG} ${FORWARDTRANS_TRG} ${FORWARDTRANSMONO_TRG} ${PIVOTING_TRG} CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_DEVDATA_TYPE}.${SRCEXT}.gz,${DEVSET}} CLEAN_DEV_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_DEV_SRC}} @@ -235,9 +246,9 @@ clean-data rawdata: .PHONY: clean-data-source clean-data-source: - ${MAKE} ${CLEAN_TEST_SRC} ${CLEAN_TEST_TRG} - ${MAKE} ${CLEAN_TEST_SRC_STATS} ${CLEAN_TEST_TRG_STATS} - ${MAKE} ${DATA_SRC} ${DATA_TRG} + @${MAKE} ${CLEAN_TEST_SRC} ${CLEAN_TEST_TRG} + @${MAKE} ${CLEAN_TEST_SRC_STATS} ${CLEAN_TEST_TRG_STATS} + @${MAKE} ${DATA_SRC} ${DATA_TRG} diff --git a/lib/env.mk b/lib/env.mk index d20c7410..38e61595 100644 --- a/lib/env.mk +++ b/lib/env.mk @@ -128,6 +128,15 @@ MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab TOKENIZER = ${MOSESSCRIPTS}/tokenizer +## browsermt branch of marian-nmt +## https://github.com/browsermt/marian-dev + +BROWSERMT_HOME ?= ${TOOLSDIR}/browsermt +BROWSERMT_TRAIN = ${BROWSERMT_HOME}/marian-dev/build/marian +BROWSERMT_DECODE = ${BROWSERMT_HOME}/marian-dev/build/marian-decoder +BROWSERMT_CONVERT = ${BROWSERMT_HOME}/marian-dev/build/marian-conv + + ## BPE SUBWORD_BPE ?= ${shell which 
subword-nmt 2>/dev/null || echo ${TOOLSDIR}/subword-nmt/subword_nmt/subword_nmt.py} diff --git a/lib/env/mahti.mk b/lib/env/mahti.mk index 06c0f82c..3f7a4ad4 100644 --- a/lib/env/mahti.mk +++ b/lib/env/mahti.mk @@ -14,6 +14,7 @@ OPUSHOME = /projappl/nlpl/data/OPUS MOSESHOME = ${APPLHOME}/install/mosesdecoder MOSESSCRIPTS = ${MOSESHOME}/scripts EFLOMAL_HOME = ${APPLHOME}/install/eflomal/ +BROWSERMT_HOME = ${APPLHOME}/install/browsermt MARIAN_HOME = ${APPLHOME}/install/marian-dev/build/ MARIAN = ${MARIAN_HOME} SPM_HOME = ${MARIAN_HOME} diff --git a/lib/tasks.mk b/lib/tasks.mk index d00045ab..e493e66c 100644 --- a/lib/tasks.mk +++ b/lib/tasks.mk @@ -80,7 +80,9 @@ train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done #------------------------------------------------------------------------ .PHONY: all-job -all-job: ${WORKDIR}/${MODELCONFIG} +all-job: + ${MAKE} rawdata + ${MAKE} ${WORKDIR}/${MODELCONFIG} ${MAKE} data ${MAKE} train-and-eval-job diff --git a/lib/test.mk b/lib/test.mk index ddf42446..1be9e73d 100644 --- a/lib/test.mk +++ b/lib/test.mk @@ -61,7 +61,6 @@ ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}: ${TEST_SRC} grep . $< > $@.input ${LOAD_ENV} && ${MARIAN_DECODER} -i $@.input \ -c ${word 2,$^}.decoder.yml \ - -d ${MARIAN_GPUS} \ ${MARIAN_DECODER_FLAGS} > $@.output ifneq ($(findstring spm,${PRE_TRG}),) sed 's/ //g;s/▁/ /g' < $@.output | sed 's/^ *//;s/ *$$//' > $@ diff --git a/lib/train.mk b/lib/train.mk index bf5dd3bc..9cd35efe 100644 --- a/lib/train.mk +++ b/lib/train.mk @@ -299,3 +299,80 @@ endif --keep-best touch $@ +quantize: ${MODEL_BIN} +quantize-alphas: ${MODEL_BIN_ALPHAS} +intgemm8tuned: ${MODEL_INTGEMM8TUNED} + +# ${MODEL_BIN}: ${MODEL_FINAL} +%.intgemm8.bin: %.npz.best-perplexity.npz + mkdir -p ${dir $@} + ${BROWSERMT_CONVERT} -g intgemm8 -f $< -t $@ + +%.intgemm8.alphas.bin: %.alphas.npz + ${BROWSERMT_CONVERT} --gemm-type intgemm8 -f $< -t $@ + +%.alphas.npz: %.quantmults %.intgemm8tuned.npz + ${BROWSERMT_HOME}/marian-dev/scripts/alphas/extract_stats.py $^ $@ + +## NOTE: need to run this on CPU and with one core only! 
+%.quantmults: %.intgemm8tuned.npz + ${BROWSERMT_DECODE} \ + --beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \ + --skip-cost --cpu-threads 1 \ + --quiet --quiet-translation \ + -m $< --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \ + -i ${DEV_SRC}.${PRE_SRC} -o ${DEV_SRC}.${PRE_SRC}.${TRG} \ + --dump-quantmult --log $@.log 2> $@ + +%.intgemm8tuned.npz: %.npz.best-perplexity.npz + cp $< $@ + ${LOAD_ENV} && ${BROWSERMT_TRAIN} \ + ${MARIAN_TRAINING_PARAMETER} \ + ${MARIAN_EXTRA} \ + ${MARIAN_DATA_STORAGE} \ + --model $@ \ + --devices ${MARIAN_GPUS} -w 8000 --cost-type ce-mean-words \ + --valid-freq 200 --save-freq 200 --disp-freq 100 --disp-first 10 \ + --valid-metrics ce-mean-words \ + --valid-sets ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG} \ + --valid-translation-output ${DEV_SRC}.${PRE_SRC}.${TRG} \ + --early-stopping 20 --overwrite --keep-best --quantize-bits 8 \ + --train-sets ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \ + ${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \ + ${MARIAN_TRAIN_WEIGHTS} \ + --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \ + --log $(@:.npz=.train${NR}.log) \ + --valid-log $(@:.npz=.valid${NR}.log) \ + --tempdir ${TMPDIR} \ + --shuffle ${MARIAN_SHUFFLE} + +# --optimizer-delay 4 +# --mini-batch-fit --mini-batch 1000 --maxi-batch 1000 --sync-sgd +# --learn-rate 0.0003 --lr-report --lr-warmup 16000 --lr-decay-inv-sqrt 32000 \ +# --optimizer-params 0.9 0.98 1e-09 --clip-norm 0 \ +# --valid-metrics ce-mean-words \ +# --quiet-translation --valid-mini-batch 16 --beam-size 1 --normalize 1 \ + + +## need to run this on CPU (?) +test-intgemm8: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.${SRC}.${TRG}.eval \ + ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.alphas.${SRC}.${TRG}.eval + +${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN} + ${BROWSERMT_DECODE} \ + --beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \ + --skip-cost --cpu-threads ${HPC_CORES} \ + --quiet --quiet-translation \ + -m ${MODEL_BIN} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \ + -i $< | \ + sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' > $@ + +${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.alphas.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_ALPHAS} + ${BROWSERMT_DECODE} \ + --beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \ + --skip-cost --cpu-threads ${HPC_CORES} \ + --quiet --quiet-translation \ + -m ${MODEL_BIN_ALPHAS} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \ + -i $< | \ + sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' > $@ + diff --git a/tatoeba/forward-translate/Makefile b/tatoeba/forward-translate/Makefile index a9568491..45f75c31 100644 --- a/tatoeba/forward-translate/Makefile +++ b/tatoeba/forward-translate/Makefile @@ -311,15 +311,15 @@ ${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz ${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz mkdir -p ${dir $@} - cp $< $@ + rsync $< $@ ${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz mkdir -p ${dir $@} - cp $< $@ + rsync $< $@ ${BITEXT_LATEST_README}: ${LANGPAIR}/${MODELNAME}/README.md mkdir -p ${dir $@} - cp $< $@ + rsync $< $@ ## translate diff --git a/tools/browsermt/marian-dev b/tools/browsermt/marian-dev new file mode 160000 index 00000000..9e1bb713 --- /dev/null +++ b/tools/browsermt/marian-dev @@ -0,0 +1 @@ +Subproject commit 9e1bb7131d224ead58f168df89d32fc218a19161
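
Usage note (appended by the editor, not part of the patch): a minimal sketch of how the new quantisation targets defined above in lib/train.mk and lib/config.mk chain together. It assumes the browsermt/marian-dev submodule added by this commit has been built under ${BROWSERMT_HOME} (the build step itself is not covered by the patch) and that the usual language-pair selection variables of this Makefile setup (e.g. SRCLANGS/TRGLANGS) are set the same way as for training the student model.

```sh
# fetch the browsermt fork of marian-dev that provides marian-conv with intgemm support
# (building it under tools/browsermt/marian-dev is assumed, not shown in this patch)
git submodule update --init tools/browsermt/marian-dev

# plain 8-bit conversion of the trained student model with marian-conv -g intgemm8:
# ${MODEL_BASENAME}.npz.best-perplexity.npz -> ${MODEL_BASENAME}.intgemm8.bin
make quantize

# quantisation finetuning (browsermt marian with --quantize-bits 8), then dump the
# quant multipliers with the CPU decoder (one thread, as the NOTE above requires),
# extract alphas with extract_stats.py and convert to
# ${MODEL_BASENAME}.intgemm8.alphas.bin
make quantize-alphas

# translate the test set with both 8-bit binaries on CPU and evaluate
make test-intgemm8
```

The dependency chain means `make quantize-alphas` alone will trigger the intgemm8tuned finetuning run, the quantmult dump and the alpha extraction before the final conversion; `make quantize` only needs the finished perplexity-best checkpoint.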