finetuning and backtranslation

Joerg Tiedemann 2020-01-12 01:10:53 +02:00
parent fe16a0c4dd
commit e2ed3d85d1
33 changed files with 309 additions and 21 deletions


@ -882,6 +882,9 @@ endif
## make symbolic links to spm-models
## (previously we had data-specific models but now we want to re-use existing ones)
fix-spm-models:
	cd work-spm; \
	for l in ${ALL_LANG_PAIRS}; do \


@ -11,35 +11,51 @@ include ../Makefile.slurm
SRC = af
TRG = en

-## maximum input length (number sentence piece segments)
-MAX_LENGTH = 250
+## various sources are available
+## can be general wikipedia, wikinews, wikibooks, ...
+WIKISOURCE = wiki
+
+## maximum input length (number sentence piece segments)
+## maximum number of sentences to be translated (top N lines)
+MAX_LENGTH = 100
+MAX_SENTENCES = 1000000

LANGPAIR = ${SRC}-${TRG}
MODELHOME = ../models/${LANGPAIR}
-MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*.zip}}}
+MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
+MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}

LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
	module load nlpl-udpipe nlpl-opus &&

-WIKILANGS = ${sort $(patsubst >%wiki-,%,${shell grep -o '>..wiki-' index.html})} \
-	${sort $(patsubst >%wiki-,%,${shell grep -o '>...wiki-' index.html})}
+WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
+	${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})}

LANGID = ${SRC}
-WIKI_TXT = wiki.${LANGID}.gz
-WIKI_SRC = wiki.${LANGPAIR}.${SRC}.gz
-WIKI_PRE = wiki.${LANGPAIR}.${SRC}.spm.gz
-WIKI_TRG = wiki.${LANGPAIR}.${TRG}.gz
+WIKI_TXT = ${WIKISOURCE}.${LANGID}.gz
+WIKI_SRC = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
+WIKI_PRE = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
+WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
+
+## don't delete translated text if the process crashes
+.PRECIOUS: ${WIKI_TRG}

## find wiki downloads
-WIKI_JSON = $(shell grep -o '${LANGID}wiki-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
+WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
+
+## we don't need to keep the json file
+.INTERMEDIATE: ${WIKI_JSON}

## find UDPipe model
+ifndef UDPIPE_MODELS
+  UDPIPE_MODELS = /projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
+endif
+
LANGNAME = ${shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \
	cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
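
For intuition, the updated MODELZIP pattern and the new MODELNAME variable are meant to pick up the most recently released model package for the language pair; with a hypothetical model directory like the one below (file names are examples, not part of this commit), the sort/lastword combination resolves as follows:

	# ls ../models/af-en
	#   opus-2019-12-04.zip  opus-2020-01-08.zip
	# => MODELZIP  = ../models/af-en/opus-2020-01-08.zip
	# => MODELNAME = opus-2020-01-08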
@ -51,11 +67,11 @@ all: index.html
all-wikis: index.html
	for l in ${WIKILANGS}; do \
-	  ${MAKE} LANGID=$$l wiki-txt; \
+	  ${MAKE} LANGID=$$l extract-text; \
	done

-wiki-txt: ${WIKI_TXT}
+extract-text: ${WIKI_TXT}

prepare-model: ${LANGPAIR}/decoder.yml
prepare-data: ${WIKI_PRE}
translate: ${WIKI_SRC} ${WIKI_TRG}
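
A back-translation run with the renamed targets might then look like the following sketch (variable values are examples and have to match a model that actually exists under ../models):

	# download the dump index and extract monolingual text
	make SRC=af TRG=en WIKISOURCE=wiki extract-text
	# unpack the latest model and pre-process the extracted text
	make SRC=af TRG=en prepare-model prepare-data
	# translate it into synthetic parallel data
	make SRC=af TRG=en translate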
@ -66,8 +82,10 @@ print-names:
	echo ${WIKI_JSON}

+## fetch the latest model
+## ---> TODO: should we fetch from ObjectStorage instead?
+
-${LANGPAIR}/decoder.yml:
+${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
	mkdir -p ${dir $@}
	cp ${MODELZIP} ${dir $@}
@ -75,45 +93,66 @@ ifneq (${MODELZIP},)
endif

-%.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz
+## pre-process data
+## ---> TODO: does that work for multilingual data that need prefix?
+
+${LANGPAIR}/%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz
ifneq (${MODELZIP},)
-	${MAKE} ${LANGPAIR}/decoder.yml
+	mkdir -p ${dir $@}
+	${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
	zcat $< |\
-	${LANGPAIR}/preprocess.sh af ${LANGPAIR}/source.spm |\
-	perl -pe 'print if (split(/\s+/)>${MAX_LENGTH});' |\
+	${LANGPAIR}/${MODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm |\
+	perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
+	head -${MAX_SENTENCES} |\
	gzip -c > $@
endif

+## merge SentencePiece segments in the source text
+## (Why? because we filter out some data from the original wiki text, see above)
+
${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
+	mkdir -p ${dir $@}
	zcat $< |\
	sed 's/ //g;s/▁/ /g' | \
	sed 's/^ *//;s/ *$$//' |\
	gzip -c > $@
endif

+## translate
+
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
ifneq (${MODELZIP},)
-	${MAKE} ${LANGPAIR}/decoder.yml
-	${LOADMODS} && cd ${LANGPAIR} && ${MARIAN}/marian-decoder \
+	mkdir -p ${dir $@}
+	${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
+	${LOADMODS} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN}/marian-decoder \
		-i ${PWD}/$< \
		-c decoder.yml \
		-d ${MARIAN_GPUS} \
		${MARIAN_DECODER_FLAGS} |\
	sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
	gzip -c > ${PWD}/$@
+ifneq (${LANGPAIR},)
+ifneq (${MODELNAME},)
+	rm -fr ${LANGPAIR}/${MODELNAME}
+endif
+endif
endif

## index of all downloadable files

index.html:
-	wget -O $@ https://dumps.wikimedia.org/other/cirrussearch/current
+	wget -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current

## wiki in json format

${WIKI_JSON}:
-	wget https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}
+	wget -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}

+## check whether there is a UDPipe model
+## backoff to moses tools
+
ifneq (${UDPIPE_MODEL},)
  SENTSPLITTER = udpipe --input=horizontal --tokenize \
	${UDPIPE_MODELS}/${UDPIPE_MODEL} |\


@ -1,6 +1,15 @@
# Translate data as synthetic training data

Use Wiki data:

* json processor: https://stedolan.github.io/jq/
* wiki JSON dumps: https://dumps.wikimedia.org/other/cirrussearch/current/

NOTE: this only works for SentencePiece models

## TODO

* download base models from ObjectStorage
* make it work with multilingual models (need to adjust preprocess-scripts for those models)
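
As a rough sketch of the extraction step (not part of this commit): the cirrussearch dumps are, roughly speaking, line-based JSON in which content records carry the article body in a "text" field, so jq can pull out plain text; the dump file name below is made up for illustration:

	zcat afwiki-20200101-cirrussearch-content.json.gz |
	  jq -r '.text | select(. != null)' |
	  gzip -c > wiki.af.gz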

finetune/Makefile (new file)

@ -0,0 +1,225 @@
#
# fine-tune an existing model
#
# make news-tune-data ...... create tuning data from newstest sets
# make all ................. tune and eval
#
#
# NOTE: this only works for SentencePiece models
#
# TODO
# - download base models from ObjectStorage
# - make it work with multilingual models
#   --> need to adjust preprocess-scripts for those models
#

include ../Makefile.env
include ../Makefile.config
include ../Makefile.slurm
SRC = en
TRG = de
LANGPAIR = ${SRC}-${TRG}
MODEL = news
TRAIN_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/train/*.${SRC}.gz)}
DEV_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/dev/*.${SRC}.gz)}
TEST_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/test/*.${SRC}.gz)}
TRAIN_TRG = ${patsubst %.${SRC},%.${TRG},${TRAIN_SRC}}
DEV_TRG = ${patsubst %.${SRC},%.${TRG},${DEV_SRC}}
TEST_TRG = ${patsubst %.${SRC},%.${TRG},${TEST_SRC}}
BASEMODELHOME = ../models/${LANGPAIR}
BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}}
BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}}
TUNED_MODEL = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.transformer.model
TUNED_MODEL_VOCAB = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.vocab.yml
MARIAN_WORKSPACE = 5000
MARIAN_VALID_FREQ = 100
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_EARLY_STOPPING = 5
.PHONY: all
all: ${TEST_SRC}.${TRG}.compare

.PHONY: news-enfi
news-enfi:
	${MAKE} SRC=en TRG=fi MODEL=news \
		TRAIN_SRC=en-fi/news/train/newstest2015-2018.en \
		TRAIN_TRG=en-fi/news/train/newstest2015-2018.fi \
		DEV_SRC=en-fi/news/dev/newsdev2015-enfi.en \
		DEV_TRG=en-fi/news/dev/newsdev2015-enfi.fi \
		TEST_SRC=en-fi/news/test/newstest2019-enfi.en \
		TEST_TRG=en-fi/news/test/newstest2019-enfi.fi \
	all
.PHONY: goethe-fide
goethe-fide:
	${MAKE} SRC=fi TRG=de MODEL=goethe \
		TRAIN_SRC=fi-de/goethe/train/goethe-institute-train.fi \
		TRAIN_TRG=fi-de/goethe/train/goethe-institute-train.de \
		DEV_SRC=fi-de/goethe/dev/goethe-institute-dev1.fi \
		DEV_TRG=fi-de/goethe/dev/goethe-institute-dev1.de \
		TEST_SRC=fi-de/goethe/test/goethe-institute-test1.fi \
		TEST_TRG=fi-de/goethe/test/goethe-institute-test1.de \
	all
## make news tuning data from testsets
TESTSETS_HOME = ../testsets/${LANGPAIR}
NEWS_ALLSETS_SRC = ${sort ${wildcard ${TESTSETS_HOME}/news*.${SRC}.gz}}
NEWS_ALLSETS_TRG = ${sort ${wildcard ${TESTSETS_HOME}/news*.${TRG}.gz}}
NEWS_DEVSET_SRC = ${firstword ${NEWS_ALLSETS_SRC}}
NEWS_DEVSET_TRG = ${firstword ${NEWS_ALLSETS_TRG}}
NEWS_TESTSET_SRC = ${lastword ${NEWS_ALLSETS_SRC}}
NEWS_TESTSET_TRG = ${lastword ${NEWS_ALLSETS_TRG}}
NEWS_TRAINSET_SRC = ${filter-out ${NEWS_DEVSET_SRC} ${NEWS_TESTSET_SRC},${NEWS_ALLSETS_SRC}}
NEWS_TRAINSET_TRG = ${filter-out ${NEWS_DEVSET_TRG} ${NEWS_TESTSET_TRG},${NEWS_ALLSETS_TRG}}
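
To make the split concrete: assuming ${TESTSETS_HOME} contained newstest2015 through newstest2019 for the language pair (a hypothetical listing), the sort / firstword / lastword / filter-out logic above yields:

	# dev   : newstest2015.*  (first set after sorting)
	# test  : newstest2019.*  (last set)
	# train : newstest2016.* newstest2017.* newstest2018.*  (everything in between)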
.PHONY: news-tune-data
news-tune-data:
ifneq (${words ${NEWS_ALLSETS_SRC}},0)
ifneq (${words ${NEWS_ALLSETS_SRC}},1)
ifneq (${words ${NEWS_ALLSETS_SRC}},2)
	mkdir -p ${LANGPAIR}/news/train
	mkdir -p ${LANGPAIR}/news/dev
	mkdir -p ${LANGPAIR}/news/test
	cp ${NEWS_TESTSET_SRC} ${LANGPAIR}/news/test/
	cp ${NEWS_TESTSET_TRG} ${LANGPAIR}/news/test/
	cp ${NEWS_DEVSET_SRC} ${LANGPAIR}/news/dev/
	cp ${NEWS_DEVSET_TRG} ${LANGPAIR}/news/dev/
	zcat ${NEWS_TRAINSET_SRC} | gzip -c > ${LANGPAIR}/news/train/news.${SRC}.gz
	zcat ${NEWS_TRAINSET_TRG} | gzip -c > ${LANGPAIR}/news/train/news.${TRG}.gz
endif
endif
endif
.PHONY: data
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz

.INTERMEDIATE: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml

${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
	mkdir -p ${dir $@}
	cp ${BASEMODELZIP} ${dir $@}
	cd ${dir $@} && unzip -u *.zip

${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
	zcat $< |\
	${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
	gzip -c > $@

${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
	zcat $< |\
	${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
	gzip -c > $@
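
For intuition about what the .pre.gz files contain: preprocess.sh applies the base model's SentencePiece segmentation, so a plain sentence becomes space-separated subword pieces marked with ▁ (the segmentation below is invented for illustration), and the sed commands used after decoding ('s/ //g;s/▁/ /g') reverse exactly this step:

	input : Das ist ein Beispielsatz.
	output: ▁Das ▁ist ▁ein ▁Beispiel satz .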
.PHONY: tune
tune: ${TUNED_MODEL}.done

## train transformer model

${TUNED_MODEL}.npz.best-perplexity.npz: ${TUNED_MODEL}.done

${TUNED_MODEL}.done: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz \
		${LANGPAIR}/${BASEMODELNAME}/decoder.yml
	mkdir -p ${dir $@}
	if [ ! -e ${@:done=npz} ]; then \
	  cp ${LANGPAIR}/${BASEMODELNAME}/*.npz ${@:done=npz}; \
	  cp ${LANGPAIR}/${BASEMODELNAME}/*.vocab.yml ${TUNED_MODEL_VOCAB}; \
	fi
	${LOADMODS} && ${MARIAN}/marian ${MARIAN_EXTRA} \
		--model $(@:.done=.npz) \
		--type transformer \
		--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
		--max-length 500 \
		--vocabs ${TUNED_MODEL_VOCAB} ${TUNED_MODEL_VOCAB} \
		--mini-batch-fit \
		-w ${MARIAN_WORKSPACE} \
		--maxi-batch ${MARIAN_MAXI_BATCH} \
		--early-stopping ${MARIAN_EARLY_STOPPING} \
		--valid-freq ${MARIAN_VALID_FREQ} \
		--save-freq ${MARIAN_SAVE_FREQ} \
		--disp-freq ${MARIAN_DISP_FREQ} \
		--valid-sets ${word 3,$^} ${word 4,$^} \
		--valid-metrics perplexity \
		--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
		--beam-size 12 --normalize 1 \
		--log $(@:.model.done=.train.log) --valid-log $(@:.model.done=.valid.log) \
		--enc-depth 6 --dec-depth 6 \
		--transformer-heads 8 \
		--transformer-postprocess-emb d \
		--transformer-postprocess dan \
		--transformer-dropout ${MARIAN_DROPOUT} \
		--label-smoothing 0.1 \
		--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
		--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
		--tied-embeddings-all \
		--overwrite --keep-best \
		--devices ${MARIAN_GPUS} \
		--sync-sgd --seed ${SEED} \
		--sqlite \
		--tempdir ${TMPDIR} \
		--exponential-smoothing
	touch $@
.PHONY: translate
translate: ${TEST_SRC}.${TRG}.gz

## translate test set

${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
	mkdir -p ${dir $@}
	${LOADMODS} && ${MARIAN}/marian-decoder -i $< \
		-c ${word 2,$^}.decoder.yml \
		-d ${MARIAN_GPUS} \
		${MARIAN_DECODER_FLAGS} |\
	sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
	gzip -c > $@
.PHONY: eval
eval: ${TEST_SRC}.${TRG}.eval

${TEST_SRC}.${TRG}.eval: ${TEST_SRC}.${TRG}.gz ${TEST_TRG}.gz
	zcat ${TEST_TRG}.gz > $@.ref
	zcat $< | sacrebleu $@.ref > $@
	zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
	rm -f $@.ref
.PHONY: compare
compare: ${TEST_SRC}.${TRG}.compare

${TEST_SRC}.${TRG}.compare: ${TEST_SRC}.${TRG}.eval
	zcat ${TEST_SRC}.gz > $@.1
	zcat ${TEST_TRG}.gz > $@.2
	zcat ${<:.eval=.gz} > $@.3
	paste -d "\n" $@.1 $@.2 $@.3 |\
	sed -e "s/&apos;/'/g" \
	    -e 's/&quot;/"/g' \
	    -e 's/&lt;/</g' \
	    -e 's/&gt;/>/g' \
	    -e 's/&amp;/&/g' |\
	sed 'n;n;G;' > $@
	rm -f $@.1 $@.2 $@.3
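
For reference, the resulting .compare file interleaves input, reference and system output as triples separated by an empty line (with HTML entities unescaped), roughly:

	<source sentence>
	<reference translation>
	<model translation>

	<next source sentence>
	...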

finetune/README.md (new file)

@ -0,0 +1,12 @@
# Model fine-tuning

Scripts for fine-tuning transformer models with small amounts of in-domain data.

* NOTE: this only works for SentencePiece models

## TODO

* download base models from ObjectStorage
* make it work with multilingual models (need to adjust preprocess-scripts for those models)
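
A typical fine-tuning run with these scripts might look like the sketch below (SRC/TRG/MODEL values are examples); note that news-tune-data and the actual tuning are separate make invocations, since the train/dev/test file lists are picked up by wildcards when make starts:

	# build tuning data from the newstest sets in ../testsets
	make SRC=en TRG=fi MODEL=news news-tune-data
	# fine-tune the latest base model, translate and evaluate the test set
	make SRC=en TRG=fi MODEL=news all
	# or use one of the predefined recipes
	make news-enfi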
