finetuning and backtranslation

Joerg Tiedemann 2020-01-12 01:10:53 +02:00
parent fe16a0c4dd
commit e2ed3d85d1
33 changed files with 309 additions and 21 deletions


@ -882,6 +882,9 @@ endif
## make symbolic links to spm-models
## (previously we had data-specific models but now we want to re-use existing ones)
fix-spm-models:
cd work-spm; \
for l in ${ALL_LANG_PAIRS}; do \


@ -11,35 +11,51 @@ include ../Makefile.slurm
SRC = af
TRG = en
## maximum input length (number of SentencePiece segments)
MAX_LENGTH = 250
## various sources are available
## can be general wikipedia, wikinews, wikibooks, ...
WIKISOURCE = wiki
## maximum input length (number of SentencePiece segments)
## maximum number of sentences to be translated (top N lines)
MAX_LENGTH = 100
MAX_SENTENCES = 1000000
LANGPAIR = ${SRC}-${TRG}
MODELHOME = ../models/${LANGPAIR}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*.zip}}}
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
module load nlpl-udpipe nlpl-opus &&
WIKILANGS = ${sort $(patsubst >%wiki-,%,${shell grep -o '>..wiki-' index.html})} \
${sort $(patsubst >%wiki-,%,${shell grep -o '>...wiki-' index.html})}
WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})}
LANGID = ${SRC}
WIKI_TXT = wiki.${LANGID}.gz
WIKI_SRC = wiki.${LANGPAIR}.${SRC}.gz
WIKI_PRE = wiki.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = wiki.${LANGPAIR}.${TRG}.gz
WIKI_TXT = ${WIKISOURCE}.${LANGID}.gz
WIKI_SRC = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
WIKI_PRE = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
## don't delete translated text if the process crashes
.PRECIOUS: ${WIKI_TRG}
## find wiki downloads
WIKI_JSON = $(shell grep -o '${LANGID}wiki-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
## we don't need to keep the json file
.INTERMEDIATE: ${WIKI_JSON}
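
The WIKI_JSON pattern simply picks the current cirrussearch content dump for the chosen language and wiki source out of index.html. A standalone sketch of the same lookup for LANGID=af and WIKISOURCE=wiki (the date in the matched file name is illustrative):

    grep -o 'afwiki-[0-9]*-cirrussearch-content.json.gz' index.html | head -1
    # -> afwiki-20200106-cirrussearch-content.json.gz (for example)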
## find UDPipe model
ifndef UDPIPE_MODELS
UDPIPE_MODELS=/projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
endif
LANGNAME = ${shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \
cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
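
The UDPipe model is resolved via the language name: opus-iso639 is assumed to print a semicolon-separated list of language names for the ISO code, of which the first is lowercased and hyphenated and then matched against the model file names. A rough sketch for LANGID=af (command output and model file name are illustrative):

    opus-iso639 -e af | cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'   # -> afrikaans
    find $UDPIPE_MODELS/ -name "afrikaans*.udpipe" | head -1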
@ -51,11 +67,11 @@ all: index.html
all-wikis: index.html
for l in ${WIKILANGS}; do \
${MAKE} LANGID=$$l wiki-txt; \
${MAKE} LANGID=$$l extract-text; \
done
wiki-txt: ${WIKI_TXT}
extract-text: ${WIKI_TXT}
prepare-model: ${LANGPAIR}/decoder.yml
prepare-data: ${WIKI_PRE}
translate: ${WIKI_SRC} ${WIKI_TRG}
@ -66,8 +82,10 @@ print-names:
echo ${WIKI_JSON}
## fetch the latest model
## ---> TODO: should we fetch from ObjectStorage instead?
${LANGPAIR}/decoder.yml:
${LANGPAIR}/${MODELNAME}/decoder.yml:
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
cp ${MODELZIP} ${dir $@}
@ -75,45 +93,66 @@ ifneq (${MODELZIP},)
endif
%.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz
## pre-process data
## ---> TODO: does that work for multilingual data that need a prefix?
${LANGPAIR}/%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz
ifneq (${MODELZIP},)
${MAKE} ${LANGPAIR}/decoder.yml
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
zcat $< |\
${LANGPAIR}/preprocess.sh af ${LANGPAIR}/source.spm |\
perl -pe 'print if (split(/\s+/)>${MAX_LENGTH});' |\
${LANGPAIR}/${MODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm |\
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
head -${MAX_SENTENCES} |\
gzip -c > $@
endif
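
The recipe above SentencePiece-encodes the monolingual text with the model's own source.spm, drops sentences with more than MAX_LENGTH segments, and keeps at most MAX_SENTENCES lines. A standalone sketch of the filtering part, using the default limits and hypothetical file names:

    zcat wiki.af.spm.gz |
      perl -ne 'print if (split(/\s+/) <= 100);' |
      head -1000000 | gzip -c > wiki.af.filtered.spm.gz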
## merge SentencePiece segments in the source text
## (Why? because we filter out some data from the original wiki text, see above)
${WIKI_SRC}: ${WIKI_PRE}
ifneq (${MODELZIP},)
mkdir -p ${dir $@}
zcat $< |\
sed 's/ //g;s/▁/ /g' | \
sed 's/^ *//;s/ *$$//' |\
gzip -c > $@
endif
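
The two sed calls undo the SentencePiece segmentation: delete all spaces, turn each '▁' marker back into a space, and trim leading/trailing blanks (the doubled $$ in the recipe is only Make escaping for a literal shell $). A quick illustration with made-up input:

    echo '▁Hello ▁wor ld !' | sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$//'
    # -> Hello world!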
## translate
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
ifneq (${MODELZIP},)
${MAKE} ${LANGPAIR}/decoder.yml
${LOADMODS} && cd ${LANGPAIR} && ${MARIAN}/marian-decoder \
mkdir -p ${dir $@}
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
${LOADMODS} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN}/marian-decoder \
-i ${PWD}/$< \
-c decoder.yml \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > ${PWD}/$@
ifneq (${LANGPAIR},)
ifneq (${MODELNAME},)
rm -fr ${LANGPAIR}/${MODELNAME}
endif
endif
endif
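
With the defaults above (SRC=af, TRG=en, WIKISOURCE=wiki) the whole backtranslation chain can be driven from the top-level targets; a sketch, assuming the download/extraction rules further down in the Makefile can build the monolingual wiki text:

    make extract-text    # fetch and extract the wiki dump for the source language
    make prepare-data    # SentencePiece-encode and length-filter the text
    make translate       # decode with the latest released model and restore plain text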
## index of all downloadable files
index.html:
wget -O $@ https://dumps.wikimedia.org/other/cirrussearch/current
wget -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current
## wiki in json format
${WIKI_JSON}:
wget https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}
wget -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}
## check whether there is a UDPipe model
## back off to the Moses tools
ifneq (${UDPIPE_MODEL},)
SENTSPLITTER = udpipe --input=horizontal --tokenize \
${UDPIPE_MODELS}/${UDPIPE_MODEL} |\


@ -1,6 +1,15 @@
# Translate data to create synthetic training data
Use Wiki data:
* json processor: https://stedolan.github.io/jq/
* wiki JSON dumps: https://dumps.wikimedia.org/other/cirrussearch/current/
NOTE: this only works for SentencePiece models
## TODO
* download base models from ObjectStorage
* make it work with multilingual models (need to adjust preprocess-scripts for those models)

finetune/Makefile (new file, 225 lines)

@ -0,0 +1,225 @@
#
# fine-tune an existing model
#
# make news-tune-data ...... create tuning data from newstest sets
# make all ................. tune and eval
#
#
# NOTE: this only works for SentencePiece models
#
# TODO
# - download base models from ObjectStorage
# - make it work with multilingual models
# --> need to adjust preprocess-scripts for those models
#
include ../Makefile.env
include ../Makefile.config
include ../Makefile.slurm
SRC = en
TRG = de
LANGPAIR = ${SRC}-${TRG}
MODEL = news
TRAIN_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/train/*.${SRC}.gz)}
DEV_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/dev/*.${SRC}.gz)}
TEST_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/test/*.${SRC}.gz)}
TRAIN_TRG = ${patsubst %.${SRC},%.${TRG},${TRAIN_SRC}}
DEV_TRG = ${patsubst %.${SRC},%.${TRG},${DEV_SRC}}
TEST_TRG = ${patsubst %.${SRC},%.${TRG},${TEST_SRC}}
BASEMODELHOME = ../models/${LANGPAIR}
BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}}
BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}}
TUNED_MODEL = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.transformer.model
TUNED_MODEL_VOCAB = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.vocab.yml
MARIAN_WORKSPACE = 5000
MARIAN_VALID_FREQ = 100
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
MARIAN_EARLY_STOPPING = 5
.PHONY: all
all: ${TEST_SRC}.${TRG}.compare
.PHONY: news-enfi
news-enfi:
${MAKE} SRC=en TRG=fi MODEL=news \
TRAIN_SRC=en-fi/news/train/newstest2015-2018.en \
TRAIN_TRG=en-fi/news/train/newstest2015-2018.fi \
DEV_SRC=en-fi/news/dev/newsdev2015-enfi.en \
DEV_TRG=en-fi/news/dev/newsdev2015-enfi.fi \
TEST_SRC=en-fi/news/test/newstest2019-enfi.en \
TEST_TRG=en-fi/news/test/newstest2019-enfi.fi \
all
.PHONY: goethe-fide
goethe-fide:
${MAKE} SRC=fi TRG=de MODEL=goethe \
TRAIN_SRC=fi-de/goethe/train/goethe-institute-train.fi \
TRAIN_TRG=fi-de/goethe/train/goethe-institute-train.de \
DEV_SRC=fi-de/goethe/dev/goethe-institute-dev1.fi \
DEV_TRG=fi-de/goethe/dev/goethe-institute-dev1.de \
TEST_SRC=fi-de/goethe/test/goethe-institute-test1.fi \
TEST_TRG=fi-de/goethe/test/goethe-institute-test1.de \
all
## make news tuning data from testsets
TESTSETS_HOME = ../testsets/${LANGPAIR}
NEWS_ALLSETS_SRC = ${sort ${wildcard ${TESTSETS_HOME}/news*.${SRC}.gz}}
NEWS_ALLSETS_TRG = ${sort ${wildcard ${TESTSETS_HOME}/news*.${TRG}.gz}}
NEWS_DEVSET_SRC = ${firstword ${NEWS_ALLSETS_SRC}}
NEWS_DEVSET_TRG = ${firstword ${NEWS_ALLSETS_TRG}}
NEWS_TESTSET_SRC = ${lastword ${NEWS_ALLSETS_SRC}}
NEWS_TESTSET_TRG = ${lastword ${NEWS_ALLSETS_TRG}}
NEWS_TRAINSET_SRC = ${filter-out ${NEWS_DEVSET_SRC} ${NEWS_TESTSET_SRC},${NEWS_ALLSETS_SRC}}
NEWS_TRAINSET_TRG = ${filter-out ${NEWS_DEVSET_TRG} ${NEWS_TESTSET_TRG},${NEWS_ALLSETS_TRG}}
.PHONY: news-tune-data
news-tune-data:
ifneq (${words ${NEWS_ALLSETS_SRC}},0)
ifneq (${words ${NEWS_ALLSETS_SRC}},1)
ifneq (${words ${NEWS_ALLSETS_SRC}},2)
mkdir -p ${LANGPAIR}/news/train
mkdir -p ${LANGPAIR}/news/dev
mkdir -p ${LANGPAIR}/news/test
cp ${NEWS_TESTSET_SRC} ${LANGPAIR}/news/test/
cp ${NEWS_TESTSET_TRG} ${LANGPAIR}/news/test/
cp ${NEWS_DEVSET_SRC} ${LANGPAIR}/news/dev/
cp ${NEWS_DEVSET_TRG} ${LANGPAIR}/news/dev/
zcat ${NEWS_TRAINSET_SRC} | gzip -c > ${LANGPAIR}/news/train/news.${SRC}.gz
zcat ${NEWS_TRAINSET_TRG} | gzip -c > ${LANGPAIR}/news/train/news.${TRG}.gz
endif
endif
endif
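
news-tune-data splits whatever newstest sets exist under ../testsets/${LANGPAIR}: the first set (in sort order) becomes the dev set, the last one the test set, and everything in between is concatenated into the training data; nothing is created unless at least three sets are found. A typical invocation (sketch):

    make SRC=en TRG=fi news-tune-data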
.PHONY: data
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
.INTERMEDIATE: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
mkdir -p ${dir $@}
cp ${BASEMODELZIP} ${dir $@}
cd ${dir $@} && unzip -u *.zip
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
zcat $< |\
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
gzip -c > $@
${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
zcat $< |\
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
gzip -c > $@
.PHONY: tune
tune: ${TUNED_MODEL}.done
## train transformer model
${TUNED_MODEL}.npz.best-perplexity.npz: ${TUNED_MODEL}.done
${TUNED_MODEL}.done: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz \
${LANGPAIR}/${BASEMODELNAME}/decoder.yml
mkdir -p ${dir $@}
if [ ! -e ${@:done=npz} ]; then \
cp ${LANGPAIR}/${BASEMODELNAME}/*.npz ${@:done=npz}; \
cp ${LANGPAIR}/${BASEMODELNAME}/*.vocab.yml ${TUNED_MODEL_VOCAB}; \
fi
${LOADMODS} && ${MARIAN}/marian ${MARIAN_EXTRA} \
--model $(@:.done=.npz) \
--type transformer \
--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
--max-length 500 \
--vocabs ${TUNED_MODEL_VOCAB} ${TUNED_MODEL_VOCAB} \
--mini-batch-fit \
-w ${MARIAN_WORKSPACE} \
--maxi-batch ${MARIAN_MAXI_BATCH} \
--early-stopping ${MARIAN_EARLY_STOPPING} \
--valid-freq ${MARIAN_VALID_FREQ} \
--save-freq ${MARIAN_SAVE_FREQ} \
--disp-freq ${MARIAN_DISP_FREQ} \
--valid-sets ${word 3,$^} ${word 4,$^} \
--valid-metrics perplexity \
--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
--beam-size 12 --normalize 1 \
--log $(@:.model.done=.train.log) --valid-log $(@:.model.done=.valid.log) \
--enc-depth 6 --dec-depth 6 \
--transformer-heads 8 \
--transformer-postprocess-emb d \
--transformer-postprocess dan \
--transformer-dropout ${MARIAN_DROPOUT} \
--label-smoothing 0.1 \
--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
--tied-embeddings-all \
--overwrite --keep-best \
--devices ${MARIAN_GPUS} \
--sync-sgd --seed ${SEED} \
--sqlite \
--tempdir ${TMPDIR} \
--exponential-smoothing
touch $@
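
The rule above seeds the fine-tuned model with the base model's weights and vocabulary and then continues training with Marian on the in-domain data only. Assuming the train/dev/test files are in place (for example via news-tune-data), a run could look like this (sketch):

    make news-enfi                            # predefined en-fi news fine-tuning + evaluation
    make SRC=en TRG=de MODEL=news data tune   # or drive the generic targets directly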
.PHONY: translate
translate: ${TEST_SRC}.${TRG}.gz
## translate test set
${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
mkdir -p ${dir $@}
${LOADMODS} && ${MARIAN}/marian-decoder -i $< \
-c ${word 2,$^}.decoder.yml \
-d ${MARIAN_GPUS} \
${MARIAN_DECODER_FLAGS} |\
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
gzip -c > $@
.PHONY: eval
eval: ${TEST_SRC}.${TRG}.eval
${TEST_SRC}.${TRG}.eval: ${TEST_SRC}.${TRG}.gz ${TEST_TRG}.gz
zcat ${TEST_TRG}.gz > $@.ref
zcat $< | sacrebleu $@.ref > $@
zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
rm -f $@.ref
.PHONY: compare
compare: ${TEST_SRC}.${TRG}.compare
${TEST_SRC}.${TRG}.compare: ${TEST_SRC}.${TRG}.eval
zcat ${TEST_SRC}.gz > $@.1
zcat ${TEST_TRG}.gz > $@.2
zcat ${<:.eval=.gz} > $@.3
paste -d "\n" $@.1 $@.2 $@.3 |\
sed -e "s/&apos;/'/g" \
-e 's/&quot;/"/g' \
-e 's/&lt;/</g' \
-e 's/&gt;/>/g' \
-e 's/&amp;/&/g' |\
sed 'n;n;G;' > $@
rm -f $@.1 $@.2 $@.3
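
The compare file interleaves source, reference, and system output: paste -d "\n" takes one line from each of the three temporary files in turn, and sed 'n;n;G' appends an empty line after every third line so that each test sentence forms its own block. A tiny illustration with hypothetical files:

    printf 'src1\nsrc2\n' > a; printf 'ref1\nref2\n' > b; printf 'hyp1\nhyp2\n' > c
    paste -d "\n" a b c | sed 'n;n;G'
    # -> src1 / ref1 / hyp1 / (blank) / src2 / ref2 / hyp2 / (blank)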

finetune/README.md (new file, 12 lines)

@ -0,0 +1,12 @@
# Model fine-tuning
Scripts for fine-tuning transformer models on small amounts of in-domain data.
* NOTE: this only works for SentencePiece models
## TODO
* download base models from ObjectStorage
* make it work with multilingual models (need to adjust preprocess-scripts for those models)

Binary files not shown (21 files).