OPUS-MT-train/lib/quantize.mk
2022-03-05 23:46:52 +02:00

260 lines
11 KiB
Makefile

# -*-makefile-*-
#
# create efficient models
# inspired by browsermt
#
# - binary lexical shortlists
# - several variants of quantization
# - finetuned quantization
# - test translations with various quantized models
# Binary lexical shortlist: `make lexical-shortlist` requests the file
# named by MODEL_BIN_SHORTLIST.
.PHONY: lexical-shortlist
lexical-shortlist: $(MODEL_BIN_SHORTLIST)

# The first prerequisite is handed to the converter's --shortlist option
# together with SHORTLIST_NRVOC, SHORTLIST_NRTRANS and a literal 0; the
# result is dumped into the target file.
$(MODEL_BIN_SHORTLIST): $(TRAIN_S2T) $(MODEL_SRCVOCAB) $(MODEL_TRGVOCAB)
	$(BROWSERMT_CONVERT) \
		--shortlist $< $(SHORTLIST_NRVOC) $(SHORTLIST_NRTRANS) 0 \
		--vocabs $(MODEL_SRCVOCAB) $(MODEL_TRGVOCAB) \
		--dump $@
# Convenience targets for the quantized model variants. None of them has a
# recipe of its own; each one only requests the model file named by the
# corresponding variable.
.PHONY: quantize
quantize: $(MODEL_BIN)

.PHONY: quantize-alphas
quantize-alphas: $(MODEL_BIN_ALPHAS)

.PHONY: quantize-tuned
quantize-tuned: $(MODEL_BIN_TUNED)

.PHONY: quantize-tuned-alphas
quantize-tuned-alphas: $(MODEL_BIN_TUNED_ALPHAS)

.PHONY: intgemm8tuned
intgemm8tuned: $(MODEL_INTGEMM8TUNED)
# Convert %.npz.best-perplexity.npz into an intgemm8 binary model.
# Disabled explicit rule header:
# ${MODEL_BIN}: ${MODEL_FINAL}
%.intgemm8.bin: %.npz.best-perplexity.npz
	mkdir -p $(@D)
	$(BROWSERMT_CONVERT) -g intgemm8 -f $< -t $@
# Convert %.alphas.npz into an intgemm8 binary model.
%.intgemm8.alphas.bin: %.alphas.npz
	$(BROWSERMT_CONVERT) --gemm-type intgemm8 -f $< -t $@
# Run extract_stats.py on the quantmults file and the best-perplexity model
# (in that order); the script receives the target as its last argument.
%.alphas.npz: %.quantmults %.npz.best-perplexity.npz
	$(BROWSERMT_HOME)/marian-dev/scripts/alphas/extract_stats.py $< $(word 2,$^) $@
## Decode ${DEV_SRC}.${PRE_SRC} with --dump-quantmult and keep the decoder's
## stderr as the %.quantmults file.
## NOTE: need to run this on CPU and with one core only!
##
## stderr is collected in $@.tmp and renamed to the target only after the
## decoder has exited successfully. Redirecting straight into the target
## would leave a partial file behind on failure, and make would then treat
## that file as up to date. On failure $@.tmp is kept for inspection.
## The translations are a by-product written to ${DEV_SRC}.${PRE_SRC}.${TRG}.
%.quantmults: %.npz.best-perplexity.npz
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--skip-cost --cpu-threads 1 \
		--quiet --quiet-translation \
		-m $< --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i ${DEV_SRC}.${PRE_SRC} -o ${DEV_SRC}.${PRE_SRC}.${TRG} \
		--dump-quantmult --log $@.log 2> $@.tmp
	mv -f $@.tmp $@
## additional finetuning for intgemm8
# Convert %.intgemm8tuned.npz into an intgemm8 binary model.
%.intgemm8tuned.bin: %.intgemm8tuned.npz
	mkdir -p $(@D)
	$(BROWSERMT_CONVERT) -g intgemm8 -f $< -t $@
# Convert %.finetune-alphas.npz into an intgemm8 binary model.
%.intgemm8tuned.alphas.bin: %.finetune-alphas.npz
	$(BROWSERMT_CONVERT) --gemm-type intgemm8 -f $< -t $@
# Run extract_stats.py on the finetune-quantmults file and the fine-tuned
# model (in that order); the script receives the target as its last argument.
%.finetune-alphas.npz: %.finetune-quantmults %.intgemm8tuned.npz
	$(BROWSERMT_HOME)/marian-dev/scripts/alphas/extract_stats.py $< $(word 2,$^) $@
## Decode ${DEV_SRC}.${PRE_SRC} with the fine-tuned model and
## --dump-quantmult, keeping the decoder's stderr as the
## %.finetune-quantmults file.
## NOTE: need to run this on CPU and with one core only!
##
## stderr is collected in $@.tmp and renamed to the target only after the
## decoder has exited successfully. Redirecting straight into the target
## would leave a partial file behind on failure, and make would then treat
## that file as up to date. On failure $@.tmp is kept for inspection.
## The translations are a by-product written to ${DEV_SRC}.${PRE_SRC}.${TRG}.
%.finetune-quantmults: %.intgemm8tuned.npz
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--skip-cost --cpu-threads 1 \
		--quiet --quiet-translation \
		-m $< --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i ${DEV_SRC}.${PRE_SRC} -o ${DEV_SRC}.${PRE_SRC}.${TRG} \
		--dump-quantmult --log $@.log 2> $@.tmp
	mv -f $@.tmp $@
## Fine-tune with --quantize-bits 8.
## The best-perplexity model is first copied onto the target, and the trainer
## is then started with --model pointing at that copy (presumably so that
## training continues from the copied weights -- confirm).
## NOTE(review): the target exists as soon as the cp has run, so a failed or
## interrupted training run leaves a file that make may regard as up to date;
## check whether .DELETE_ON_ERROR (or similar) is set elsewhere.
%.intgemm8tuned.npz: %.npz.best-perplexity.npz
	cp $< $@
	$(LOAD_ENV) && $(BROWSERMT_TRAIN) \
		$(MARIAN_TRAINING_PARAMETER) \
		$(MARIAN_EXTRA) \
		$(MARIAN_DATA_STORAGE) \
		--model $@ \
		--devices $(MARIAN_GPUS) -w 8000 --cost-type ce-mean-words \
		--valid-freq 200 --save-freq 200 --disp-freq 100 --disp-first 10 \
		--valid-metrics ce-mean-words \
		--valid-sets $(DEV_SRC).$(PRE_SRC) $(DEV_TRG).$(PRE_TRG) \
		--valid-translation-output $(DEV_SRC).$(PRE_SRC).$(TRG) \
		--early-stopping 20 --overwrite --keep-best --quantize-bits 8 \
		--train-sets $(TRAIN_SRC).clean.$(PRE_SRC)$(TRAINSIZE).gz \
		$(TRAIN_TRG).clean.$(PRE_TRG)$(TRAINSIZE).gz \
		$(MARIAN_TRAIN_WEIGHTS) \
		--vocabs $(MODEL_SRCVOCAB) $(MODEL_TRGVOCAB) \
		--log $(patsubst %.npz,%.train$(NR).log,$@) \
		--valid-log $(patsubst %.npz,%.valid$(NR).log,$@) \
		--tempdir $(TMPDIR) \
		--shuffle $(MARIAN_SHUFFLE)

## Training options that are currently disabled. The trailing backslashes
## were removed because a make comment ending in a backslash continues onto
## the following line.
# --optimizer-delay 4
# --mini-batch-fit --mini-batch 1000 --maxi-batch 1000 --sync-sgd
# --learn-rate 0.0003 --lr-report --lr-warmup 16000 --lr-decay-inv-sqrt 32000
# --optimizer-params 0.9 0.98 1e-09 --clip-norm 0
# --valid-metrics ce-mean-words
# --quiet-translation --valid-mini-batch 16 --beam-size 1 --normalize 1
## test quantized student models
## need to run this on CPU!!
##
## Every target in this section is an alias for one or more .eval files and
## never a file itself, so all of them are declared phony (a stray file
## called e.g. "test-intgemm8shift" must not make the alias look up to date).
.PHONY: test-intgemm8-all test-intgemm8-all-shortlist \
        test-intgemm8-alltuned test-intgemm8-alltuned-shortlist \
        test-intgemm8 test-intgemm8shift test-intgemm8alpha \
        test-intgemm8-shortlist test-intgemm8shift-shortlist test-intgemm8alpha-shortlist \
        test-intgemm8tuned test-intgemm8tunedshift test-intgemm8tunedalpha \
        test-intgemm8tuned-shortlist test-intgemm8tunedshift-shortlist test-intgemm8tunedalpha-shortlist

## aggregate targets: run all three decoder modes for one model family
test-intgemm8-all: test-intgemm8 test-intgemm8shift test-intgemm8alpha
test-intgemm8-all-shortlist: test-intgemm8-shortlist test-intgemm8shift-shortlist test-intgemm8alpha-shortlist
test-intgemm8-alltuned: test-intgemm8tuned test-intgemm8tunedshift test-intgemm8tunedalpha
test-intgemm8-alltuned-shortlist: test-intgemm8tuned-shortlist test-intgemm8tunedshift-shortlist test-intgemm8tunedalpha-shortlist

## one alias per evaluation file
test-intgemm8: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.${SRC}.${TRG}.eval
test-intgemm8shift: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8shift.${SRC}.${TRG}.eval
test-intgemm8alpha: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8shiftAlphas.${SRC}.${TRG}.eval
test-intgemm8-shortlist: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.shortlist.${SRC}.${TRG}.eval
test-intgemm8shift-shortlist: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8shift.shortlist.${SRC}.${TRG}.eval
test-intgemm8alpha-shortlist: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8shiftAlphas.shortlist.${SRC}.${TRG}.eval
test-intgemm8tuned: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tuned.${SRC}.${TRG}.eval
test-intgemm8tunedshift: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tunedshift.${SRC}.${TRG}.eval
test-intgemm8tunedalpha: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tunedshiftAlphas.${SRC}.${TRG}.eval
test-intgemm8tuned-shortlist: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tuned.shortlist.${SRC}.${TRG}.eval
test-intgemm8tunedshift-shortlist: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tunedshift.shortlist.${SRC}.${TRG}.eval
test-intgemm8tunedalpha-shortlist: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tunedshiftAlphas.shortlist.${SRC}.${TRG}.eval
## Translate the test set with ${MODEL_BIN} and the decoder flag --int8.
## The decoder writes to $@.raw; sed then post-processes that file into the
## target (delete all spaces, turn every ▁ into a space, trim leading and
## trailing spaces). The two steps are separate recipe lines so that a
## failing decoder stops make before the target is created. In a
## "decoder | sed > $@" pipeline only sed's exit status is seen (unless
## pipefail is enabled) and a crashed decoder yields a truncated target.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8 --skip-cost --cpu-threads ${HPC_CORES} \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## Translate the test set with ${MODEL_BIN} and the decoder flag --int8shift.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8shift.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8shift --skip-cost --cpu-threads ${HPC_CORES} \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## Translate the test set with ${MODEL_BIN_ALPHAS} and the decoder flag
## --int8shiftAlpha.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8shiftAlphas.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_ALPHAS}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8shiftAlpha --skip-cost --cpu-threads ${HPC_CORES} \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN_ALPHAS} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## with shortlists
## Translate the test set with ${MODEL_BIN}, the decoder flag --int8 and the
## binary shortlist ${MODEL_BIN_SHORTLIST}.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.shortlist.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN} ${MODEL_BIN_SHORTLIST}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8 --skip-cost --cpu-threads ${HPC_CORES} \
		--shortlist ${MODEL_BIN_SHORTLIST} false \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## Translate the test set with ${MODEL_BIN}, the decoder flag --int8shift and
## the binary shortlist ${MODEL_BIN_SHORTLIST}.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8shift.shortlist.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN} ${MODEL_BIN_SHORTLIST}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8shift --skip-cost --cpu-threads ${HPC_CORES} \
		--shortlist ${MODEL_BIN_SHORTLIST} false \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## Translate the test set with ${MODEL_BIN_ALPHAS}, the decoder flag
## --int8shiftAlpha and the binary shortlist ${MODEL_BIN_SHORTLIST}.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8shiftAlphas.shortlist.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_ALPHAS} ${MODEL_BIN_SHORTLIST}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8shiftAlpha --skip-cost --cpu-threads ${HPC_CORES} \
		--shortlist ${MODEL_BIN_SHORTLIST} false \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN_ALPHAS} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## finetuned quantization
## Translate the test set with ${MODEL_BIN_TUNED} and the decoder flag --int8.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tuned.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_TUNED}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8 --skip-cost --cpu-threads ${HPC_CORES} \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN_TUNED} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## Translate the test set with ${MODEL_BIN_TUNED} and the decoder flag
## --int8shift.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tunedshift.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_TUNED}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8shift --skip-cost --cpu-threads ${HPC_CORES} \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN_TUNED} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## Translate the test set with ${MODEL_BIN_TUNED_ALPHAS} and the decoder flag
## --int8shiftAlpha.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tunedshiftAlphas.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_TUNED_ALPHAS}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8shiftAlpha --skip-cost --cpu-threads ${HPC_CORES} \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN_TUNED_ALPHAS} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## with shortlists
## Translate the test set with ${MODEL_BIN_TUNED}, the decoder flag --int8
## and the binary shortlist ${MODEL_BIN_SHORTLIST}.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tuned.shortlist.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_TUNED} ${MODEL_BIN_SHORTLIST}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8 --skip-cost --cpu-threads ${HPC_CORES} \
		--shortlist ${MODEL_BIN_SHORTLIST} false \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN_TUNED} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## Translate the test set with ${MODEL_BIN_TUNED}, the decoder flag
## --int8shift and the binary shortlist ${MODEL_BIN_SHORTLIST}.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tunedshift.shortlist.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_TUNED} ${MODEL_BIN_SHORTLIST}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8shift --skip-cost --cpu-threads ${HPC_CORES} \
		--shortlist ${MODEL_BIN_SHORTLIST} false \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN_TUNED} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw
## Translate the test set with ${MODEL_BIN_TUNED_ALPHAS}, the decoder flag
## --int8shiftAlpha and the binary shortlist ${MODEL_BIN_SHORTLIST}.
## The decoder writes to $@.raw and sed post-processes that file into the
## target as a separate recipe line, so a decoder failure stops make instead
## of being hidden behind sed's exit status in a pipeline.
${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8tunedshiftAlphas.shortlist.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_TUNED_ALPHAS} ${MODEL_BIN_SHORTLIST}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--int8shiftAlpha --skip-cost --cpu-threads ${HPC_CORES} \
		--shortlist ${MODEL_BIN_SHORTLIST} false \
		--quiet --quiet-translation --log $@.log \
		-m ${MODEL_BIN_TUNED_ALPHAS} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< -o $@.raw
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' < $@.raw > $@
	rm -f $@.raw