student model quantisation finetuning added

Joerg Tiedemann 2022-01-18 14:41:17 +02:00
parent df1b4b2942
commit bc54b403cd
12 changed files with 140 additions and 13 deletions

.gitmodules vendored
View File

@@ -31,3 +31,6 @@
[submodule "OPUS-MT-testsets"]
path = OPUS-MT-testsets
url = https://github.com/Helsinki-NLP/OPUS-MT-testsets.git
[submodule "tools/browsermt/marian-dev"]
path = tools/browsermt/marian-dev
url = https://github.com/browsermt/marian-dev.git
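The submodule added above would typically be fetched before anything from the browsermt fork is built; a minimal sketch (not part of the commit itself):

```
git submodule update --init --recursive tools/browsermt/marian-dev
```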

View File

@@ -147,7 +147,9 @@ include lib/tasks.mk
include lib/projects.mk
.PHONY: all
all: ${WORKDIR}/${MODELCONFIG}
all:
${MAKE} rawdata
${MAKE} ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train
${MAKE} eval
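The `all` target now runs each stage in its own sub-make, presumably so that settings derived from the downloaded data (such as the generated ${MODELCONFIG}) are re-read before the later stages. A hypothetical invocation, assuming the usual language-pair variables are set (values are illustrative):

```
make SRCLANGS=en TRGLANGS=fi all
# roughly equivalent to running the stages one after the other:
make SRCLANGS=en TRGLANGS=fi rawdata
make SRCLANGS=en TRGLANGS=fi data
make SRCLANGS=en TRGLANGS=fi train
make SRCLANGS=en TRGLANGS=fi eval
```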

View File

@@ -13,9 +13,19 @@ Need better data filtering:
* stronger filters in cleanup script?
* idea: compare character diversity between the two languages and use a threshold to filter sentences? (language-specific?)
links and tools:
* https://github.com/ZJaume/clean
* https://github.com/Helsinki-NLP/OPUS-MT-distillation
# more efficient parallelisation
* https://www.gnu.org/software/parallel/
* https://www.gnu.org/software/parallel/parallel_tutorial.html
* https://www.gnu.org/software/bash/manual/html_node/GNU-Parallel.html
* multinode training with MarianNMT: https://github.com/marian-nmt/marian/issues/244
from Bergamot:
https://github.com/browsermt/students/blob/master/train-student/alignment/generate-alignment-and-shortlist.sh
@@ -25,6 +35,11 @@ test -s $DIR/corpus.spm.$SRC || cat $CORPUS_SRC | pigz -dc | parallel --no-notic
test -s $DIR/corpus.spm.$TRG || cat $CORPUS_TRG | pigz -dc | parallel --no-notice --pipe -k -j16 --block 50M "$MARIAN/spm_encode --model $VOCAB" > $DIR/corpus.spm.$TRG
```
# Benchmarking
* SOTA-bench forum: https://forum.sotabench.com/
# OPUS-MT at huggingface

View File

@@ -18,9 +18,11 @@ MODELTYPES = transformer \
transformer-base-align \
transformer-big \
transformer-big-align \
transformer-small \
transformer-small-align \
transformer-tiny \
transformer-tiny-align \
transformer-tiny11 \
transformer-tiny11-align
## default model type
@@ -450,8 +452,13 @@ MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml
MODEL_BIN = ${WORKDIR}/${MODEL_BASENAME}.intgemm8.bin
MODEL_INTGEMM8TUNED = ${WORKDIR}/${MODEL_BASENAME}.intgemm8tuned.npz
MODEL_BIN_ALPHAS = ${WORKDIR}/${MODEL_BASENAME}.intgemm8.alphas.bin
.PRECIOUS: ${MODEL_FINAL} ${MODEL_BIN}
## for sentence-piece models: get plain text vocabularies
## for others: extract vocabulary from training data with MarianNMT
@@ -671,7 +678,7 @@ ${WORKDIR}/${MODELCONFIG}:
fi; \
if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than ${LARGEST_TRAINSIZE}" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_HPC_MEM = 16g" >> $@; \
echo "GPUJOB_SUBMIT = -gpu01" >> $@; \
echo "SUBWORD_VOCAB_SIZE = ${SUBWORD_VOCAB_SIZE}" >> $@; \
echo "DEVSIZE = ${DEVSIZE}" >> $@; \
@@ -679,7 +686,7 @@ ${WORKDIR}/${MODELCONFIG}:
echo "DEVMINSIZE = ${DEVMINSIZE}" >> $@; \
elif [ $$s -gt ${LARGE_TRAINSIZE} ]; then \
echo "# ${LANGPAIRSTR} training data bigger than ${LARGE_TRAINSIZE}" > $@; \
echo "GPUJOB_HPC_MEM = 8g" >> $@; \
echo "GPUJOB_HPC_MEM = 12g" >> $@; \
echo "GPUJOB_SUBMIT = " >> $@; \
echo "MARIAN_VALID_FREQ = 2500" >> $@; \
echo "SUBWORD_VOCAB_SIZE = ${SUBWORD_VOCAB_SIZE}" >> $@; \

View File

@@ -86,7 +86,7 @@ endif
# filtered by reconstruction scores (ce filter)
ifneq (${USE_FORWARDTRANS_SELECTED},)
FORWARDTRANS_SRC += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}}
FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}
FORWARDTRANS_TRG += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${TRGEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}}
endif
# forward-translation data of monolingual data (source-to-target)
@@ -118,11 +118,21 @@ endif
print-datasets:
@echo ${TATOEBA_TRAINSET}
@echo ${TRAINSET}
@echo "all data:"
@echo ${CLEAN_TRAIN_SRC}
@echo ${CLEAN_TRAIN_TRG}
@echo "back-translation data:"
@echo ${BACKTRANS_SRC}
@echo ${BACKTRANS_TRG}
@echo "forward translation data:"
@echo ${FORWARDTRANS_SRC}
@echo ${FORWARDTRANS_TRG}
@echo "monolingual forward translation data:"
@echo ${FORWARDTRANSMONO_SRC}
@echo ${FORWARDTRANSMONO_TRG}
@echo "pivot-based translation data:"
@echo ${PIVOTING_SRC}
@echo ${PIVOTING_TRG}
##-------------------------------------------------------------
## data sets (train/dev/test)
@@ -133,7 +143,8 @@ print-datasets:
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${SRCEXT}.gz,${TRAINSET}} \
${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC}
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
CLEAN_TRAIN_TRG = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${TRGEXT}.gz,${TRAINSET}} \
${BACKTRANS_TRG} ${FORWARDTRANS_TRG} ${FORWARDTRANSMONO_TRG} ${PIVOTING_TRG}
CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_DEVDATA_TYPE}.${SRCEXT}.gz,${DEVSET}}
CLEAN_DEV_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_DEV_SRC}}
@@ -235,9 +246,9 @@ clean-data rawdata:
.PHONY: clean-data-source
clean-data-source:
${MAKE} ${CLEAN_TEST_SRC} ${CLEAN_TEST_TRG}
${MAKE} ${CLEAN_TEST_SRC_STATS} ${CLEAN_TEST_TRG_STATS}
${MAKE} ${DATA_SRC} ${DATA_TRG}
@${MAKE} ${CLEAN_TEST_SRC} ${CLEAN_TEST_TRG}
@${MAKE} ${CLEAN_TEST_SRC_STATS} ${CLEAN_TEST_TRG_STATS}
@${MAKE} ${DATA_SRC} ${DATA_TRG}
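Two things change in the data definitions above: `print-datasets` now lists every data category separately, and the target-side lists (`FORWARDTRANS_TRG`, `CLEAN_TRAIN_TRG`) are collected explicitly instead of being derived from the source-side names with `patsubst`, presumably because the ce-filtered files end in `.best${USE_FORWARDTRANS_SELECTED}.gz` and therefore do not match the plain `%.${SRCEXT}.gz` pattern. A toy sketch of the mismatch (file names and values are made up, not part of the build):

```
SRCEXT := en
TRGEXT := fi
FILES  := corpus.en.gz corpus.en.best50.gz
# prints "corpus.fi.gz corpus.en.best50.gz": the ce-filtered file is left untouched
# because it does not end in .en.gz, so the old patsubst could not derive its target side
${info ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FILES}}}
all: ; @true
```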

View File

@@ -128,6 +128,15 @@ MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab
TOKENIZER = ${MOSESSCRIPTS}/tokenizer
## browsermt branch of marian-nmt
## https://github.com/browsermt/marian-dev
BROWSERMT_HOME ?= ${TOOLSDIR}/browsermt
BROWSERMT_TRAIN = ${BROWSERMT_HOME}/marian-dev/build/marian
BROWSERMT_DECODE = ${BROWSERMT_HOME}/marian-dev/build/marian-decoder
BROWSERMT_CONVERT = ${BROWSERMT_HOME}/marian-dev/build/marian-conv
## BPE
SUBWORD_BPE ?= ${shell which subword-nmt 2>/dev/null || echo ${TOOLSDIR}/subword-nmt/subword_nmt/subword_nmt.py}
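The `BROWSERMT_*` paths added above point at binaries under `marian-dev/build` in the browsermt fork; a hypothetical out-of-source build (the cmake flags are illustrative and machine-dependent):

```
cd tools/browsermt/marian-dev
mkdir -p build && cd build
cmake .. -DCMAKE_BUILD_TYPE=Release
make -j8    # builds marian, marian-decoder and marian-conv referenced above
```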

lib/env/mahti.mk vendored
View File

@@ -14,6 +14,7 @@ OPUSHOME = /projappl/nlpl/data/OPUS
MOSESHOME = ${APPLHOME}/install/mosesdecoder
MOSESSCRIPTS = ${MOSESHOME}/scripts
EFLOMAL_HOME = ${APPLHOME}/install/eflomal/
BROWSERMT_HOME = ${APPLHOME}/install/browsermt
MARIAN_HOME = ${APPLHOME}/install/marian-dev/build/
MARIAN = ${MARIAN_HOME}
SPM_HOME = ${MARIAN_HOME}

View File

@@ -80,7 +80,9 @@ train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
#------------------------------------------------------------------------
.PHONY: all-job
all-job: ${WORKDIR}/${MODELCONFIG}
all-job:
${MAKE} rawdata
${MAKE} ${WORKDIR}/${MODELCONFIG}
${MAKE} data
${MAKE} train-and-eval-job

View File

@@ -61,7 +61,6 @@ ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}: ${TEST_SRC}
grep . $< > $@.input
${LOAD_ENV} && ${MARIAN_DECODER} -i $@.input \
-c ${word 2,$^}.decoder.yml \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} > $@.output
ifneq ($(findstring spm,${PRE_TRG}),)
sed 's/ //g;s/▁/ /g' < $@.output | sed 's/^ *//;s/ *$$//' > $@

View File

@@ -299,3 +299,80 @@ endif
--keep-best
touch $@
quantize: ${MODEL_BIN}
quantize-alphas: ${MODEL_BIN_ALPHAS}
intgemm8tuned: ${MODEL_INTGEMM8TUNED}
# ${MODEL_BIN}: ${MODEL_FINAL}
%.intgemm8.bin: %.npz.best-perplexity.npz
mkdir -p ${dir $@}
${BROWSERMT_CONVERT} -g intgemm8 -f $< -t $@
%.intgemm8.alphas.bin: %.alphas.npz
${BROWSERMT_CONVERT} --gemm-type intgemm8 -f $< -t $@
%.alphas.npz: %.quantmults %.intgemm8tuned.npz
${BROWSERMT_HOME}/marian-dev/scripts/alphas/extract_stats.py $^ $@
## NOTE: need to run this on CPU and with one core only!
%.quantmults: %.intgemm8tuned.npz
${BROWSERMT_DECODE} \
--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
--skip-cost --cpu-threads 1 \
--quiet --quiet-translation \
-m $< --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
-i ${DEV_SRC}.${PRE_SRC} -o ${DEV_SRC}.${PRE_SRC}.${TRG} \
--dump-quantmult --log $@.log 2> $@
%.intgemm8tuned.npz: %.npz.best-perplexity.npz
cp $< $@
${LOAD_ENV} && ${BROWSERMT_TRAIN} \
${MARIAN_TRAINING_PARAMETER} \
${MARIAN_EXTRA} \
${MARIAN_DATA_STORAGE} \
--model $@ \
--devices ${MARIAN_GPUS} -w 8000 --cost-type ce-mean-words \
--valid-freq 200 --save-freq 200 --disp-freq 100 --disp-first 10 \
--valid-metrics ce-mean-words \
--valid-sets ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG} \
--valid-translation-output ${DEV_SRC}.${PRE_SRC}.${TRG} \
--early-stopping 20 --overwrite --keep-best --quantize-bits 8 \
--train-sets ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
${MARIAN_TRAIN_WEIGHTS} \
--vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
--log $(@:.npz=.train${NR}.log) \
--valid-log $(@:.npz=.valid${NR}.log) \
--tempdir ${TMPDIR} \
--shuffle ${MARIAN_SHUFFLE}
# --optimizer-delay 4
# --mini-batch-fit --mini-batch 1000 --maxi-batch 1000 --sync-sgd
# --learn-rate 0.0003 --lr-report --lr-warmup 16000 --lr-decay-inv-sqrt 32000 \
# --optimizer-params 0.9 0.98 1e-09 --clip-norm 0 \
# --valid-metrics ce-mean-words \
# --quiet-translation --valid-mini-batch 16 --beam-size 1 --normalize 1 \
## need to run this on CPU (?)
test-intgemm8: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.${SRC}.${TRG}.eval \
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.alphas.${SRC}.${TRG}.eval
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN}
${BROWSERMT_DECODE} \
--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
--skip-cost --cpu-threads ${HPC_CORES} \
--quiet --quiet-translation \
-m ${MODEL_BIN} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
-i $< | \
sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' > $@
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.alphas.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_ALPHAS}
${BROWSERMT_DECODE} \
--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
--skip-cost --cpu-threads ${HPC_CORES} \
--quiet --quiet-translation \
-m ${MODEL_BIN_ALPHAS} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
-i $< | \
sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' > $@
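The new rules chain together as follows: `%.intgemm8tuned.npz` fine-tunes the best-perplexity student model with the browsermt fork of Marian using `--quantize-bits 8`; `%.quantmults` decodes the dev set on a single CPU thread with `--dump-quantmult`; `extract_stats.py` turns those dumps into `%.alphas.npz`; and `marian-conv` finally writes the `intgemm8` binaries with or without alphas. A hypothetical way to drive the chain, assuming the usual language-pair variables (values are illustrative):

```
make SRCLANGS=en TRGLANGS=fi intgemm8tuned    # 8-bit quantisation finetuning of the student model
make SRCLANGS=en TRGLANGS=fi quantize         # plain intgemm8 conversion of the best model
make SRCLANGS=en TRGLANGS=fi quantize-alphas  # conversion with alphas extracted from the quantmult dump
make SRCLANGS=en TRGLANGS=fi test-intgemm8    # evaluate both quantised binaries on the test set
```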

View File

@@ -311,15 +311,15 @@ ${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz
mkdir -p ${dir $@}
cp $< $@
rsync $< $@
${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz
mkdir -p ${dir $@}
cp $< $@
rsync $< $@
${BITEXT_LATEST_README}: ${LANGPAIR}/${MODELNAME}/README.md
mkdir -p ${dir $@}
cp $< $@
rsync $< $@
## translate

@@ -0,0 +1 @@
Subproject commit 9e1bb7131d224ead58f168df89d32fc218a19161