Mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git (synced 2024-11-30 12:32:24 +03:00)
Student model quantisation fine-tuning added
commit bc54b403cd (parent df1b4b2942)
.gitmodules (vendored) | 3
@@ -31,3 +31,6 @@
[submodule "OPUS-MT-testsets"]
	path = OPUS-MT-testsets
	url = https://github.com/Helsinki-NLP/OPUS-MT-testsets.git
[submodule "tools/browsermt/marian-dev"]
	path = tools/browsermt/marian-dev
	url = https://github.com/browsermt/marian-dev.git
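The new submodule provides the browsermt fork of marian-dev whose converter, decoder and trainer are referenced by the quantisation rules below. A minimal sketch for fetching and building it; the build directory matches the BROWSERMT_* paths defined further down, while the cmake options are assumptions and not part of this commit:

```sh
# fetch the newly added submodule
git submodule update --init tools/browsermt/marian-dev

# build it so that tools/browsermt/marian-dev/build/marian-conv etc. exist
mkdir -p tools/browsermt/marian-dev/build
cd tools/browsermt/marian-dev/build
cmake ..
make -j4
```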
Makefile | 4
@@ -147,7 +147,9 @@ include lib/tasks.mk
include lib/projects.mk

.PHONY: all
all: ${WORKDIR}/${MODELCONFIG}
all:
	${MAKE} rawdata
	${MAKE} ${WORKDIR}/${MODELCONFIG}
	${MAKE} data
	${MAKE} train
	${MAKE} eval
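The `all` target now runs the pipeline in explicit stages (raw data, model config, preprocessed data, training, evaluation) instead of depending only on the model configuration file. A hedged usage sketch; SRCLANGS and TRGLANGS are the language-pair variables used elsewhere in this repository, and the values are only examples:

```sh
# run the whole pipeline stage by stage for one language pair
make all SRCLANGS=en TRGLANGS=de
```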
NOTES.md | 15
@@ -13,9 +13,19 @@ Need better data filtering:
* stronger filters in cleanup script?
* idea: compare character diversity between the two languages and use a threshold to filter sentence pairs (language-specific?); see the sketch below

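A rough sketch of that character-diversity idea; the file names and the 0.5 to 2.0 threshold are invented for illustration and are not taken from this repository:

```sh
# keep sentence pairs whose distinct-character density is similar on both sides
paste <(pigz -dc corpus.$SRC.gz) <(pigz -dc corpus.$TRG.gz) |
awk -F'\t' '
function diversity(s,    i, c, n, seen) {
    if (length(s) == 0) return 0
    n = 0
    for (i = 1; i <= length(s); i++) {
        c = substr(s, i, 1)
        if (!(c in seen)) { seen[c] = 1; n++ }
    }
    return n / length(s)
}
{
    ds = diversity($1); dt = diversity($2)
    r = (dt > 0) ? ds / dt : 0
    if (r > 0.5 && r < 2.0) print    # keep pairs with similar diversity
}' > corpus.filtered.tsv
```
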
links and tools:

* https://github.com/ZJaume/clean
* https://github.com/Helsinki-NLP/OPUS-MT-distillation


# more efficient parallelisation

* https://www.gnu.org/software/parallel/
* https://www.gnu.org/software/parallel/parallel_tutorial.html
* https://www.gnu.org/software/bash/manual/html_node/GNU-Parallel.html
* multinode training with MarianNMT: https://github.com/marian-nmt/marian/issues/244

from Bergamot:
https://github.com/browsermt/students/blob/master/train-student/alignment/generate-alignment-and-shortlist.sh
@@ -25,6 +35,11 @@ test -s $DIR/corpus.spm.$SRC || cat $CORPUS_SRC | pigz -dc | parallel --no-notic
test -s $DIR/corpus.spm.$TRG || cat $CORPUS_TRG | pigz -dc | parallel --no-notice --pipe -k -j16 --block 50M "$MARIAN/spm_encode --model $VOCAB" > $DIR/corpus.spm.$TRG
```

# Benchmarking

* SOTA-bench forum: https://forum.sotabench.com/


# OPUS-MT at huggingface
@@ -18,9 +18,11 @@ MODELTYPES = transformer \
	transformer-base-align \
	transformer-big \
	transformer-big-align \
	transformer-small \
	transformer-small-align \
	transformer-tiny \
	transformer-tiny-align \
	transformer-tiny11 \
	transformer-tiny11-align

## default model type
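The model-type list gains the tiny11 student architectures used for distilled models. A hedged sketch of selecting one, assuming the MODELTYPE variable picks an entry from this list; the language pair is only an example:

```sh
# train a tiny11 student model instead of the default model type
make train MODELTYPE=transformer-tiny11 SRCLANGS=en TRGLANGS=de
```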
@@ -450,8 +452,13 @@ MODEL_START = ${WORKDIR}/${MODEL_BASENAME}.npz
MODEL_FINAL = ${WORKDIR}/${MODEL_BASENAME}.npz.best-perplexity.npz
MODEL_DECODER = ${MODEL_FINAL}.decoder.yml

MODEL_BIN = ${WORKDIR}/${MODEL_BASENAME}.intgemm8.bin
MODEL_INTGEMM8TUNED = ${WORKDIR}/${MODEL_BASENAME}.intgemm8tuned.npz
MODEL_BIN_ALPHAS = ${WORKDIR}/${MODEL_BASENAME}.intgemm8.alphas.bin


.PRECIOUS: ${MODEL_FINAL} ${MODEL_BIN}


## for sentence-piece models: get plain text vocabularies
## for others: extract vocabulary from training data with MarianNMT
@@ -671,7 +678,7 @@ ${WORKDIR}/${MODELCONFIG}:
	fi; \
	if [ $$s -gt ${LARGEST_TRAINSIZE} ]; then \
	  echo "# ${LANGPAIRSTR} training data bigger than ${LARGEST_TRAINSIZE}" > $@; \
	  echo "GPUJOB_HPC_MEM = 8g" >> $@; \
	  echo "GPUJOB_HPC_MEM = 16g" >> $@; \
	  echo "GPUJOB_SUBMIT = -gpu01" >> $@; \
	  echo "SUBWORD_VOCAB_SIZE = ${SUBWORD_VOCAB_SIZE}" >> $@; \
	  echo "DEVSIZE = ${DEVSIZE}" >> $@; \
@@ -679,7 +686,7 @@ ${WORKDIR}/${MODELCONFIG}:
	  echo "DEVMINSIZE = ${DEVMINSIZE}" >> $@; \
	elif [ $$s -gt ${LARGE_TRAINSIZE} ]; then \
	  echo "# ${LANGPAIRSTR} training data bigger than ${LARGE_TRAINSIZE}" > $@; \
	  echo "GPUJOB_HPC_MEM = 8g" >> $@; \
	  echo "GPUJOB_HPC_MEM = 12g" >> $@; \
	  echo "GPUJOB_SUBMIT = " >> $@; \
	  echo "MARIAN_VALID_FREQ = 2500" >> $@; \
	  echo "SUBWORD_VOCAB_SIZE = ${SUBWORD_VOCAB_SIZE}" >> $@; \
lib/data.mk | 21
@@ -86,7 +86,7 @@ endif
# filtered by reconstruction scores (ce filter)
ifneq (${USE_FORWARDTRANS_SELECTED},)
FORWARDTRANS_SRC += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${SRCEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}}
FORWARDTRANS_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${FORWARDTRANS_SRC}}
FORWARDTRANS_TRG += ${sort ${wildcard ${FORWARDTRANS_HOME}/${SRC}-${TRG}/latest/*.${TRGEXT}.best${USE_FORWARDTRANS_SELECTED}.gz}}
endif

# forward-translation data of monolingual data (source-to-target)
@@ -118,11 +118,21 @@ endif
print-datasets:
	@echo ${TATOEBA_TRAINSET}
	@echo ${TRAINSET}
	@echo "all data:"
	@echo ${CLEAN_TRAIN_SRC}
	@echo ${CLEAN_TRAIN_TRG}
	@echo "back-translation data:"
	@echo ${BACKTRANS_SRC}
	@echo ${BACKTRANS_TRG}
	@echo "forward translation data:"
	@echo ${FORWARDTRANS_SRC}
	@echo ${FORWARDTRANS_TRG}
	@echo "monolingual forward translation data:"
	@echo ${FORWARDTRANSMONO_SRC}
	@echo ${FORWARDTRANSMONO_TRG}
	@echo "pivot-based translation data:"
	@echo ${PIVOTING_SRC}
	@echo ${PIVOTING_TRG}
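A quick way to inspect which corpora the extended target would report for a configuration; a hedged sketch with an example language pair:

```sh
# list training, back-translation, forward-translation and pivoting data files
make print-datasets SRCLANGS=en TRGLANGS=fi
```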

##-------------------------------------------------------------
## data sets (train/dev/test)
@@ -133,7 +143,8 @@ print-datasets:

CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${SRCEXT}.gz,${TRAINSET}} \
	${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC}
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
CLEAN_TRAIN_TRG = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${TRGEXT}.gz,${TRAINSET}} \
	${BACKTRANS_TRG} ${FORWARDTRANS_TRG} ${FORWARDTRANSMONO_TRG} ${PIVOTING_TRG}

CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_DEVDATA_TYPE}.${SRCEXT}.gz,${DEVSET}}
CLEAN_DEV_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_DEV_SRC}}
@@ -235,9 +246,9 @@ clean-data rawdata:

.PHONY: clean-data-source
clean-data-source:
	${MAKE} ${CLEAN_TEST_SRC} ${CLEAN_TEST_TRG}
	${MAKE} ${CLEAN_TEST_SRC_STATS} ${CLEAN_TEST_TRG_STATS}
	${MAKE} ${DATA_SRC} ${DATA_TRG}
	@${MAKE} ${CLEAN_TEST_SRC} ${CLEAN_TEST_TRG}
	@${MAKE} ${CLEAN_TEST_SRC_STATS} ${CLEAN_TEST_TRG_STATS}
	@${MAKE} ${DATA_SRC} ${DATA_TRG}
|
@ -128,6 +128,15 @@ MARIAN_VOCAB = ${MARIAN_HOME}marian-vocab
|
||||
|
||||
TOKENIZER = ${MOSESSCRIPTS}/tokenizer
|
||||
|
||||
## browsermt branch of marian-nmt
|
||||
## https://github.com/browsermt/marian-dev
|
||||
|
||||
BROWSERMT_HOME ?= ${TOOLSDIR}/browsermt
|
||||
BROWSERMT_TRAIN = ${BROWSERMT_HOME}/marian-dev/build/marian
|
||||
BROWSERMT_DECODE = ${BROWSERMT_HOME}/marian-dev/build/marian-decoder
|
||||
BROWSERMT_CONVERT = ${BROWSERMT_HOME}/marian-dev/build/marian-conv
|
||||
|
||||
|
||||
|
||||
## BPE
|
||||
SUBWORD_BPE ?= ${shell which subword-nmt 2>/dev/null || echo ${TOOLSDIR}/subword-nmt/subword_nmt/subword_nmt.py}
|
||||
|
lib/env/mahti.mk (vendored) | 1
@@ -14,6 +14,7 @@ OPUSHOME = /projappl/nlpl/data/OPUS
MOSESHOME = ${APPLHOME}/install/mosesdecoder
MOSESSCRIPTS = ${MOSESHOME}/scripts
EFLOMAL_HOME = ${APPLHOME}/install/eflomal/
BROWSERMT_HOME = ${APPLHOME}/install/browsermt
MARIAN_HOME = ${APPLHOME}/install/marian-dev/build/
MARIAN = ${MARIAN_HOME}
SPM_HOME = ${MARIAN_HOME}
@@ -80,7 +80,9 @@ train-and-start-bt-jobs: ${WORKDIR}/${MODEL}.${MODELTYPE}.model${NR}.done
#------------------------------------------------------------------------

.PHONY: all-job
all-job: ${WORKDIR}/${MODELCONFIG}
all-job:
	${MAKE} rawdata
	${MAKE} ${WORKDIR}/${MODELCONFIG}
	${MAKE} data
	${MAKE} train-and-eval-job
@@ -61,7 +61,6 @@ ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.${SRC}.${TRG}: ${TEST_SRC}
	grep . $< > $@.input
	${LOAD_ENV} && ${MARIAN_DECODER} -i $@.input \
		-c ${word 2,$^}.decoder.yml \
		-d ${MARIAN_GPUS} \
		${MARIAN_DECODER_FLAGS} > $@.output
ifneq ($(findstring spm,${PRE_TRG}),)
	sed 's/ //g;s/▁/ /g' < $@.output | sed 's/^ *//;s/ *$$//' > $@
lib/train.mk | 77
@@ -299,3 +299,80 @@ endif
		--keep-best
	touch $@


quantize: ${MODEL_BIN}
quantize-alphas: ${MODEL_BIN_ALPHAS}
intgemm8tuned: ${MODEL_INTGEMM8TUNED}

# ${MODEL_BIN}: ${MODEL_FINAL}
%.intgemm8.bin: %.npz.best-perplexity.npz
	mkdir -p ${dir $@}
	${BROWSERMT_CONVERT} -g intgemm8 -f $< -t $@

%.intgemm8.alphas.bin: %.alphas.npz
	${BROWSERMT_CONVERT} --gemm-type intgemm8 -f $< -t $@

%.alphas.npz: %.quantmults %.intgemm8tuned.npz
	${BROWSERMT_HOME}/marian-dev/scripts/alphas/extract_stats.py $^ $@

## NOTE: need to run this on CPU and with one core only!
%.quantmults: %.intgemm8tuned.npz
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--skip-cost --cpu-threads 1 \
		--quiet --quiet-translation \
		-m $< --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i ${DEV_SRC}.${PRE_SRC} -o ${DEV_SRC}.${PRE_SRC}.${TRG} \
		--dump-quantmult --log $@.log 2> $@

%.intgemm8tuned.npz: %.npz.best-perplexity.npz
	cp $< $@
	${LOAD_ENV} && ${BROWSERMT_TRAIN} \
		${MARIAN_TRAINING_PARAMETER} \
		${MARIAN_EXTRA} \
		${MARIAN_DATA_STORAGE} \
		--model $@ \
		--devices ${MARIAN_GPUS} -w 8000 --cost-type ce-mean-words \
		--valid-freq 200 --save-freq 200 --disp-freq 100 --disp-first 10 \
		--valid-metrics ce-mean-words \
		--valid-sets ${DEV_SRC}.${PRE_SRC} ${DEV_TRG}.${PRE_TRG} \
		--valid-translation-output ${DEV_SRC}.${PRE_SRC}.${TRG} \
		--early-stopping 20 --overwrite --keep-best --quantize-bits 8 \
		--train-sets ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
			${TRAIN_TRG}.clean.${PRE_TRG}${TRAINSIZE}.gz \
		${MARIAN_TRAIN_WEIGHTS} \
		--vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		--log $(@:.npz=.train${NR}.log) \
		--valid-log $(@:.npz=.valid${NR}.log) \
		--tempdir ${TMPDIR} \
		--shuffle ${MARIAN_SHUFFLE}

# --optimizer-delay 4
# --mini-batch-fit --mini-batch 1000 --maxi-batch 1000 --sync-sgd
# --learn-rate 0.0003 --lr-report --lr-warmup 16000 --lr-decay-inv-sqrt 32000 \
# --optimizer-params 0.9 0.98 1e-09 --clip-norm 0 \
# --valid-metrics ce-mean-words \
# --quiet-translation --valid-mini-batch 16 --beam-size 1 --normalize 1 \


## need to run this on CPU (?)
test-intgemm8: ${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.${SRC}.${TRG}.eval \
	${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.alphas.${SRC}.${TRG}.eval

${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--skip-cost --cpu-threads ${HPC_CORES} \
		--quiet --quiet-translation \
		-m ${MODEL_BIN} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< | \
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' > $@

${WORKDIR}/${TESTSET_NAME}.${MODEL}${NR}.${MODELTYPE}.intgemm8.alphas.${SRC}.${TRG}: ${TEST_SRC}.${PRE_SRC} ${MODEL_BIN_ALPHAS}
	${BROWSERMT_DECODE} \
		--beam-size 1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src -w 128 \
		--skip-cost --cpu-threads ${HPC_CORES} \
		--quiet --quiet-translation \
		-m ${MODEL_BIN_ALPHAS} --vocabs ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB} \
		-i $< | \
	sed 's/ //g;s/▁/ /g;s/^ *//;s/ *$$//' > $@
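Taken together, these rules implement the student quantisation workflow: `quantize` converts the best-perplexity model directly into an 8-bit intgemm binary, while `quantize-alphas` first fine-tunes the model with `--quantize-bits 8`, dumps quantisation multipliers on CPU, extracts the alpha statistics and then converts; `test-intgemm8` decodes and evaluates the test set with both binary models. A hedged invocation sketch; the language pair and model type are only examples and exact variable values depend on the local setup:

```sh
# convert the best-perplexity student model directly to an 8-bit intgemm binary
make quantize SRCLANGS=en TRGLANGS=de MODELTYPE=transformer-tiny11

# quantisation fine-tuning plus alpha extraction (the quantmult dump has to run
# on CPU with a single thread, see the NOTE above), then binary conversion
make quantize-alphas SRCLANGS=en TRGLANGS=de MODELTYPE=transformer-tiny11

# decode and evaluate the test set with both binary models on CPU
make test-intgemm8 SRCLANGS=en TRGLANGS=de MODELTYPE=transformer-tiny11
```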
@@ -311,15 +311,15 @@ ${BITEXT_BASE}.${SRC}.%.gz: ${BITEXT_BASE}.${SRC}.spm.%.gz

${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${SRC}.gz: ${BITEXT_BASE}.${SRC}.%.gz
	mkdir -p ${dir $@}
	cp $< $@
	rsync $< $@

${OUTPUT_DIR}/latest/Tatoeba-train.%.${LANGPAIR}.${TRG}.gz: ${BITEXT_BASE}.${TRG}.%.gz
	mkdir -p ${dir $@}
	cp $< $@
	rsync $< $@

${BITEXT_LATEST_README}: ${LANGPAIR}/${MODELNAME}/README.md
	mkdir -p ${dir $@}
	cp $< $@
	rsync $< $@


## translate
tools/browsermt/marian-dev (submodule) | 1
@@ -0,0 +1 @@
Subproject commit 9e1bb7131d224ead58f168df89d32fc218a19161