diff --git a/Makefile.data b/Makefile.data index 8211d416..b6d854b6 100644 --- a/Makefile.data +++ b/Makefile.data @@ -882,6 +882,9 @@ endif +## make symbolic links to spm-models +## (previously we had data-specific models but now we want to re-use existing ones) + fix-spm-models: cd work-spm; \ for l in ${ALL_LANG_PAIRS}; do \ diff --git a/backtranslate/Makefile b/backtranslate/Makefile index 4c58dd93..787f67ef 100644 --- a/backtranslate/Makefile +++ b/backtranslate/Makefile @@ -11,35 +11,51 @@ include ../Makefile.slurm SRC = af TRG = en -## maximum input length (number sentence piece segments) -MAX_LENGTH = 250 +## various sources are available +## can be general wikipedia, wikinews, wikibooks, ... +WIKISOURCE = wiki +## maximum input length (number sentence piece segments) +## maximum number of sentences to be translated (top N lines) +MAX_LENGTH = 100 +MAX_SENTENCES = 1000000 LANGPAIR = ${SRC}-${TRG} MODELHOME = ../models/${LANGPAIR} -MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*.zip}}} +MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}} +MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \ module load nlpl-udpipe nlpl-opus && -WIKILANGS = ${sort $(patsubst >%wiki-,%,${shell grep -o '>..wiki-' index.html})} \ - ${sort $(patsubst >%wiki-,%,${shell grep -o '>...wiki-' index.html})} +WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \ + ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})} LANGID = ${SRC} -WIKI_TXT = wiki.${LANGID}.gz -WIKI_SRC = wiki.${LANGPAIR}.${SRC}.gz -WIKI_PRE = wiki.${LANGPAIR}.${SRC}.spm.gz -WIKI_TRG = wiki.${LANGPAIR}.${TRG}.gz +WIKI_TXT = ${WIKISOURCE}.${LANGID}.gz +WIKI_SRC = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${SRC}.gz +WIKI_PRE = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz +WIKI_TRG = 
${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${TRG}.gz + +## don't delete translated text if the process crashes +.PRECIOUS: ${WIKI_TRG} ## find wiki downloads -WIKI_JSON = $(shell grep -o '${LANGID}wiki-[0-9]*-cirrussearch-content.json.gz' index.html | head -1) +WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1) + +## we don't need to keep the json file +.INTERMEDIATE: ${WIKI_JSON} + ## find UDPipe model +ifndef UDPIPE_MODELS + UDPIPE_MODELS=/projappl/nlpl/software/modules/udpipe/1.2.1-devel/models +endif LANGNAME = ${shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \ cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'} UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)} @@ -51,11 +67,11 @@ all: index.html all-wikis: index.html for l in ${WIKILANGS}; do \ - ${MAKE} LANGID=$$l wiki-txt; \ + ${MAKE} LANGID=$$l extract-text; \ done -wiki-txt: ${WIKI_TXT} +extract-text: ${WIKI_TXT} prepare-model: ${LANGPAIR}/decoder.yml prepare-data: ${WIKI_PRE} translate: ${WIKI_SRC} ${WIKI_TRG} @@ -66,8 +82,10 @@ print-names: echo ${WIKI_JSON} +## fetch the latest model +## ---> TODO: should we fetch from ObjectStorage instead? -${LANGPAIR}/decoder.yml: +${LANGPAIR}/${MODELNAME}/decoder.yml: ifneq (${MODELZIP},) mkdir -p ${dir $@} cp ${MODELZIP} ${dir $@} @@ -75,45 +93,66 @@ ifneq (${MODELZIP},) endif -%.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz +## pre-process data +## ---> TODO: does that work for multilingual data that need prefix? 
+ +${LANGPAIR}/%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz ifneq (${MODELZIP},) - ${MAKE} ${LANGPAIR}/decoder.yml + mkdir -p ${dir $@} + ${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml zcat $< |\ - ${LANGPAIR}/preprocess.sh af ${LANGPAIR}/source.spm |\ - perl -pe 'print if (split(/\s+/)>${MAX_LENGTH});' |\ + ${LANGPAIR}/${MODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm |\ + perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\ + head -${MAX_SENTENCES} |\ gzip -c > $@ endif + +## merge SentencePiece segments in the source text +## (Why? because we filter out some data from the original wiki text, see above) + ${WIKI_SRC}: ${WIKI_PRE} ifneq (${MODELZIP},) + mkdir -p ${dir $@} zcat $< |\ sed 's/ //g;s/▁/ /g' | \ sed 's/^ *//;s/ *$$//' |\ gzip -c > $@ endif + +## translate + %.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz ifneq (${MODELZIP},) - ${MAKE} ${LANGPAIR}/decoder.yml - ${LOADMODS} && cd ${LANGPAIR} && ${MARIAN}/marian-decoder \ + mkdir -p ${dir $@} + ${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml + ${LOADMODS} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN}/marian-decoder \ -i ${PWD}/$< \ -c decoder.yml \ -d ${MARIAN_GPUS} \ ${MARIAN_DECODER_FLAGS} |\ sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\ gzip -c > ${PWD}/$@ +ifneq (${LANGPAIR},) +ifneq (${MODELNAME},) + rm -fr ${LANGPAIR}/${MODELNAME} +endif +endif endif ## index of all downloadable files index.html: - wget -O $@ https://dumps.wikimedia.org/other/cirrussearch/current + wget -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current ## wiki in json format ${WIKI_JSON}: - wget https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON} + wget -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON} +## check whether there is a UDPipe model +## backoff to moses tools ifneq (${UDPIPE_MODEL},) SENTSPLITTER = udpipe --input=horizontal --tokenize \ ${UDPIPE_MODELS}/${UDPIPE_MODEL} |\ diff --git a/backtranslate/README.md 
b/backtranslate/README.md index 8267eba4..86591413 100644 --- a/backtranslate/README.md +++ b/backtranslate/README.md @@ -1,6 +1,15 @@ +# Translate data as synthetic training data + +Use Wiki data: * json processor: https://stedolan.github.io/jq/ * wiki JSON dumps: https://dumps.wikimedia.org/other/cirrussearch/current/ +NOTE: this only works for SentencePiece models + +## TODO + +* download base models from ObjectStorage +* make it work with multilingual models (need to adjust preprocess-scripts for those models) diff --git a/finetune/Makefile b/finetune/Makefile new file mode 100644 index 00000000..18e2ad9a --- /dev/null +++ b/finetune/Makefile @@ -0,0 +1,225 @@ +# +# fine-tune an existing model +# +# make news-tune-data ...... create tuning data from newstest sets +# make all ................. tune and eval +# +# +# NOTE: this only works for SentencePiece models +# +# TODO +# - download base models from ObjectStorage +# - make it work with multilingual models +# --> need to adjust preprocess-scripts for those models +# + +include ../Makefile.env +include ../Makefile.config +include ../Makefile.slurm + + +SRC = en +TRG = de +LANGPAIR = ${SRC}-${TRG} +MODEL = news + +TRAIN_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/train/*.${SRC}.gz)} +DEV_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/dev/*.${SRC}.gz)} +TEST_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/test/*.${SRC}.gz)} + +TRAIN_TRG = ${patsubst %.${SRC},%.${TRG},${TRAIN_SRC}} +DEV_TRG = ${patsubst %.${SRC},%.${TRG},${DEV_SRC}} +TEST_TRG = ${patsubst %.${SRC},%.${TRG},${TEST_SRC}} + + +BASEMODELHOME = ../models/${LANGPAIR} +BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}} +BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}} + + +TUNED_MODEL = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.transformer.model +TUNED_MODEL_VOCAB = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.vocab.yml + + +MARIAN_WORKSPACE = 5000 +MARIAN_VALID_FREQ = 
100 +MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ} +MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ} +MARIAN_EARLY_STOPPING = 5 + + + + +.PHONY: all +all: ${TEST_SRC}.${TRG}.compare + + +.PHONY: news-enfi +news-enfi: + ${MAKE} SRC=en TRG=fi MODEL=news \ + TRAIN_SRC=en-fi/news/train/newstest2015-2018.en \ + TRAIN_TRG=en-fi/news/train/newstest2015-2018.fi \ + DEV_SRC=en-fi/news/dev/newsdev2015-enfi.en \ + DEV_TRG=en-fi/news/dev/newsdev2015-enfi.fi \ + TEST_SRC=en-fi/news/test/newstest2019-enfi.en \ + TEST_TRG=en-fi/news/test/newstest2019-enfi.fi \ + all + +.PHONY: goethe-fide +goethe-fide: + ${MAKE} SRC=fi TRG=de MODEL=goethe \ + TRAIN_SRC=fi-de/goethe/train/goethe-institute-train.fi \ + TRAIN_TRG=fi-de/goethe/train/goethe-institute-train.de \ + DEV_SRC=fi-de/goethe/dev/goethe-institute-dev1.fi \ + DEV_TRG=fi-de/goethe/dev/goethe-institute-dev1.de \ + TEST_SRC=fi-de/goethe/test/goethe-institute-test1.fi \ + TEST_TRG=fi-de/goethe/test/goethe-institute-test1.de \ + all + + + +## make news tuning data from testsets + +TESTSETS_HOME = ../testsets/${LANGPAIR} +NEWS_ALLSETS_SRC = ${sort ${wildcard ${TESTSETS_HOME}/news*.${SRC}.gz}} +NEWS_ALLSETS_TRG = ${sort ${wildcard ${TESTSETS_HOME}/news*.${TRG}.gz}} +NEWS_DEVSET_SRC = ${firstword ${NEWS_ALLSETS_SRC}} +NEWS_DEVSET_TRG = ${firstword ${NEWS_ALLSETS_TRG}} +NEWS_TESTSET_SRC = ${lastword ${NEWS_ALLSETS_SRC}} +NEWS_TESTSET_TRG = ${lastword ${NEWS_ALLSETS_TRG}} +NEWS_TRAINSET_SRC = ${filter-out ${NEWS_DEVSET_SRC} ${NEWS_TESTSET_SRC},${NEWS_ALLSETS_SRC}} +NEWS_TRAINSET_TRG = ${filter-out ${NEWS_DEVSET_TRG} ${NEWS_TESTSET_TRG},${NEWS_ALLSETS_TRG}} + +.PHONY: news-tune-data +news-tune-data: +ifneq (${words ${NEWS_ALLSETS_SRC}},0) +ifneq (${words ${NEWS_ALLSETS_SRC}},1) +ifneq (${words ${NEWS_ALLSETS_SRC}},2) + mkdir -p ${LANGPAIR}/news/train + mkdir -p ${LANGPAIR}/news/dev + mkdir -p ${LANGPAIR}/news/test + cp ${NEWS_TESTSET_SRC} ${LANGPAIR}/news/test/ + cp ${NEWS_TESTSET_TRG} ${LANGPAIR}/news/test/ + cp ${NEWS_DEVSET_SRC} ${LANGPAIR}/news/dev/ 
+ cp ${NEWS_DEVSET_TRG} ${LANGPAIR}/news/dev/ + zcat ${NEWS_TRAINSET_SRC} | gzip -c > ${LANGPAIR}/news/train/news.${SRC}.gz + zcat ${NEWS_TRAINSET_TRG} | gzip -c > ${LANGPAIR}/news/train/news.${TRG}.gz +endif +endif +endif + + + +.PHONY: data
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz + +.INTERMEDIATE: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml +${LANGPAIR}/${BASEMODELNAME}/decoder.yml: + mkdir -p ${dir $@} + cp ${BASEMODELZIP} ${dir $@} + cd ${dir $@} && unzip -u *.zip + +${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml + zcat $< |\ + ${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\ + gzip -c > $@ + +${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml + zcat $< |\ + ${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${TRG} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\ + gzip -c > $@ + + + + +.PHONY: tune +tune: ${TUNED_MODEL}.done + +## train transformer model +${TUNED_MODEL}.npz.best-perplexity.npz: ${TUNED_MODEL}.done + +${TUNED_MODEL}.done: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz \ + ${LANGPAIR}/${BASEMODELNAME}/decoder.yml + mkdir -p ${dir $@} + if [ ! 
-e ${@:done=npz} ]; then \ + cp ${LANGPAIR}/${BASEMODELNAME}/*.npz ${@:done=npz}; \ + cp ${LANGPAIR}/${BASEMODELNAME}/*.vocab.yml ${TUNED_MODEL_VOCAB}; \ + fi + ${LOADMODS} && ${MARIAN}/marian ${MARIAN_EXTRA} \ + --model $(@:.done=.npz) \ + --type transformer \ + --train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \ + --max-length 500 \ + --vocabs ${TUNED_MODEL_VOCAB} ${TUNED_MODEL_VOCAB} \ + --mini-batch-fit \ + -w ${MARIAN_WORKSPACE} \ + --maxi-batch ${MARIAN_MAXI_BATCH} \ + --early-stopping ${MARIAN_EARLY_STOPPING} \ + --valid-freq ${MARIAN_VALID_FREQ} \ + --save-freq ${MARIAN_SAVE_FREQ} \ + --disp-freq ${MARIAN_DISP_FREQ} \ + --valid-sets ${word 3,$^} ${word 4,$^} \ + --valid-metrics perplexity \ + --valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \ + --beam-size 12 --normalize 1 \ + --log $(@:.model.done=.train.log) --valid-log $(@:.model.done=.valid.log) \ + --enc-depth 6 --dec-depth 6 \ + --transformer-heads 8 \ + --transformer-postprocess-emb d \ + --transformer-postprocess dan \ + --transformer-dropout ${MARIAN_DROPOUT} \ + --label-smoothing 0.1 \ + --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ + --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ + --tied-embeddings-all \ + --overwrite --keep-best \ + --devices ${MARIAN_GPUS} \ + --sync-sgd --seed ${SEED} \ + --sqlite \ + --tempdir ${TMPDIR} \ + --exponential-smoothing + touch $@ + + + +.PHONY: translate +translate: ${TEST_SRC}.${TRG}.gz + +## translate test set +${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz + mkdir -p ${dir $@} + ${LOADMODS} && ${MARIAN}/marian-decoder -i $< \ + -c ${word 2,$^}.decoder.yml \ + -d ${MARIAN_GPUS} \ + ${MARIAN_DECODER_FLAGS} |\ + sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\ + gzip -c > $@ + + + +.PHONY: eval +eval: ${TEST_SRC}.${TRG}.eval + +${TEST_SRC}.${TRG}.eval: ${TEST_SRC}.${TRG}.gz ${TEST_TRG}.gz + zcat ${TEST_TRG} > $@.ref + zcat $< | sacrebleu $@.ref > $@ + zcat $< | sacrebleu --metrics=chrf 
--width=3 $@.ref >> $@ + rm -f $@.ref + + + +.PHONY: compare +compare: ${TEST_SRC}.${TRG}.compare + +${TEST_SRC}.${TRG}.compare: ${TEST_SRC}.${TRG}.eval + zcat ${TEST_SRC}.gz > $@.1 + zcat ${TEST_TRG}.gz > $@.2 + zcat ${<:.eval=.gz} > $@.3 + paste -d "\n" $@.1 $@.2 $@.3 |\ + sed -e "s/&apos;/'/g" \ + -e 's/&quot;/"/g' \ + -e 's/&lt;//g' \ + -e 's/&amp;/&/g' |\ + sed 'n;n;G;' > $@ + rm -f $@.1 $@.2 $@.3 diff --git a/finetune/README.md b/finetune/README.md new file mode 100644 index 00000000..f2d6bd16 --- /dev/null +++ b/finetune/README.md @@ -0,0 +1,12 @@ + +# Model fine-tuning + +Scripts for fine-tuning transformer models using some small in-domain data. + +* NOTE: this only works for SentencePiece models + + +## TODO + +* download base models from ObjectStorage +* make it work with multilingual models (need to adjust preprocess-scripts for those models) diff --git a/finetune/de-fi/goethe/dev/goethe-institute-dev1.de.gz b/finetune/de-fi/goethe/dev/goethe-institute-dev1.de.gz new file mode 100644 index 00000000..f9bc8d33 Binary files /dev/null and b/finetune/de-fi/goethe/dev/goethe-institute-dev1.de.gz differ diff --git a/finetune/de-fi/goethe/dev/goethe-institute-dev1.fi.gz b/finetune/de-fi/goethe/dev/goethe-institute-dev1.fi.gz new file mode 100644 index 00000000..30102706 Binary files /dev/null and b/finetune/de-fi/goethe/dev/goethe-institute-dev1.fi.gz differ diff --git a/finetune/de-fi/goethe/dev/goethe-institute-dev2.de.gz b/finetune/de-fi/goethe/dev/goethe-institute-dev2.de.gz new file mode 100644 index 00000000..2cddf202 Binary files /dev/null and b/finetune/de-fi/goethe/dev/goethe-institute-dev2.de.gz differ diff --git a/finetune/de-fi/goethe/dev/goethe-institute-dev2.fi.gz b/finetune/de-fi/goethe/dev/goethe-institute-dev2.fi.gz new file mode 100644 index 00000000..5364b0bb Binary files /dev/null and b/finetune/de-fi/goethe/dev/goethe-institute-dev2.fi.gz differ diff --git a/finetune/de-fi/goethe/test/goethe-institute-test1.de.fi.gz 
b/finetune/de-fi/goethe/test/goethe-institute-test1.de.fi.gz new file mode 100644 index 00000000..aa490749 Binary files /dev/null and b/finetune/de-fi/goethe/test/goethe-institute-test1.de.fi.gz differ diff --git a/finetune/de-fi/goethe/test/goethe-institute-test1.de.gz b/finetune/de-fi/goethe/test/goethe-institute-test1.de.gz new file mode 100644 index 00000000..c9ff2075 Binary files /dev/null and b/finetune/de-fi/goethe/test/goethe-institute-test1.de.gz differ diff --git a/finetune/de-fi/goethe/test/goethe-institute-test1.fi.gz b/finetune/de-fi/goethe/test/goethe-institute-test1.fi.gz new file mode 100644 index 00000000..512bcae1 Binary files /dev/null and b/finetune/de-fi/goethe/test/goethe-institute-test1.fi.gz differ diff --git a/finetune/de-fi/goethe/test/goethe-institute-test2.de.gz b/finetune/de-fi/goethe/test/goethe-institute-test2.de.gz new file mode 100644 index 00000000..118f0b09 Binary files /dev/null and b/finetune/de-fi/goethe/test/goethe-institute-test2.de.gz differ diff --git a/finetune/de-fi/goethe/test/goethe-institute-test2.fi.gz b/finetune/de-fi/goethe/test/goethe-institute-test2.fi.gz new file mode 100644 index 00000000..b8ee36bc Binary files /dev/null and b/finetune/de-fi/goethe/test/goethe-institute-test2.fi.gz differ diff --git a/finetune/de-fi/goethe/train/goethe-institute-train.de.gz b/finetune/de-fi/goethe/train/goethe-institute-train.de.gz new file mode 100644 index 00000000..462d8056 Binary files /dev/null and b/finetune/de-fi/goethe/train/goethe-institute-train.de.gz differ diff --git a/finetune/de-fi/goethe/train/goethe-institute-train.fi.gz b/finetune/de-fi/goethe/train/goethe-institute-train.fi.gz new file mode 100644 index 00000000..117dce27 Binary files /dev/null and b/finetune/de-fi/goethe/train/goethe-institute-train.fi.gz differ diff --git a/finetune/en-fi/news/dev/newsdev2015-enfi.en.gz b/finetune/en-fi/news/dev/newsdev2015-enfi.en.gz new file mode 100644 index 00000000..c1bfb91c Binary files /dev/null and 
b/finetune/en-fi/news/dev/newsdev2015-enfi.en.gz differ diff --git a/finetune/en-fi/news/dev/newsdev2015-enfi.fi.gz b/finetune/en-fi/news/dev/newsdev2015-enfi.fi.gz new file mode 100644 index 00000000..bbe6d30e Binary files /dev/null and b/finetune/en-fi/news/dev/newsdev2015-enfi.fi.gz differ diff --git a/finetune/en-fi/news/test/newstest2019-enfi.en.fi.gz b/finetune/en-fi/news/test/newstest2019-enfi.en.fi.gz new file mode 100644 index 00000000..b45b9e37 Binary files /dev/null and b/finetune/en-fi/news/test/newstest2019-enfi.en.fi.gz differ diff --git a/finetune/en-fi/news/test/newstest2019-enfi.en.gz b/finetune/en-fi/news/test/newstest2019-enfi.en.gz new file mode 100644 index 00000000..0fd245e5 Binary files /dev/null and b/finetune/en-fi/news/test/newstest2019-enfi.en.gz differ diff --git a/finetune/en-fi/news/test/newstest2019-enfi.fi.gz b/finetune/en-fi/news/test/newstest2019-enfi.fi.gz new file mode 100644 index 00000000..f447a6cb Binary files /dev/null and b/finetune/en-fi/news/test/newstest2019-enfi.fi.gz differ diff --git a/finetune/en-fi/news/train/newstest2015-2018.en.gz b/finetune/en-fi/news/train/newstest2015-2018.en.gz new file mode 100644 index 00000000..fa644fb2 Binary files /dev/null and b/finetune/en-fi/news/train/newstest2015-2018.en.gz differ diff --git a/finetune/en-fi/news/train/newstest2015-2018.fi.gz b/finetune/en-fi/news/train/newstest2015-2018.fi.gz new file mode 100644 index 00000000..3ad7adf8 Binary files /dev/null and b/finetune/en-fi/news/train/newstest2015-2018.fi.gz differ diff --git a/finetune/fi-en/news/dev/newsdev2015-enfi.en.gz b/finetune/fi-en/news/dev/newsdev2015-enfi.en.gz new file mode 100644 index 00000000..c1bfb91c Binary files /dev/null and b/finetune/fi-en/news/dev/newsdev2015-enfi.en.gz differ diff --git a/finetune/fi-en/news/dev/newsdev2015-enfi.fi.gz b/finetune/fi-en/news/dev/newsdev2015-enfi.fi.gz new file mode 100644 index 00000000..bbe6d30e Binary files /dev/null and b/finetune/fi-en/news/dev/newsdev2015-enfi.fi.gz 
differ diff --git a/finetune/fi-en/news/test/newstest2019-enfi.en.gz b/finetune/fi-en/news/test/newstest2019-enfi.en.gz new file mode 100644 index 00000000..0fd245e5 Binary files /dev/null and b/finetune/fi-en/news/test/newstest2019-enfi.en.gz differ diff --git a/finetune/fi-en/news/test/newstest2019-enfi.fi.gz b/finetune/fi-en/news/test/newstest2019-enfi.fi.gz new file mode 100644 index 00000000..f447a6cb Binary files /dev/null and b/finetune/fi-en/news/test/newstest2019-enfi.fi.gz differ diff --git a/finetune/fi-en/news/train/newstest2015-2018.en.gz b/finetune/fi-en/news/train/newstest2015-2018.en.gz new file mode 100644 index 00000000..fa644fb2 Binary files /dev/null and b/finetune/fi-en/news/train/newstest2015-2018.en.gz differ diff --git a/finetune/fi-en/news/train/newstest2015-2018.fi.gz b/finetune/fi-en/news/train/newstest2015-2018.fi.gz new file mode 100644 index 00000000..3ad7adf8 Binary files /dev/null and b/finetune/fi-en/news/train/newstest2015-2018.fi.gz differ diff --git a/testsets/de-fi/goethe-institute-test1.de.gz b/testsets/de-fi/goethe-institute-test1.de.gz new file mode 100644 index 00000000..c9ff2075 Binary files /dev/null and b/testsets/de-fi/goethe-institute-test1.de.gz differ diff --git a/testsets/de-fi/goethe-institute-test1.fi.gz b/testsets/de-fi/goethe-institute-test1.fi.gz new file mode 100644 index 00000000..512bcae1 Binary files /dev/null and b/testsets/de-fi/goethe-institute-test1.fi.gz differ diff --git a/testsets/de-fi/goethe-institute-test2.de.gz b/testsets/de-fi/goethe-institute-test2.de.gz new file mode 100644 index 00000000..118f0b09 Binary files /dev/null and b/testsets/de-fi/goethe-institute-test2.de.gz differ diff --git a/testsets/de-fi/goethe-institute-test2.fi.gz b/testsets/de-fi/goethe-institute-test2.fi.gz new file mode 100644 index 00000000..b8ee36bc Binary files /dev/null and b/testsets/de-fi/goethe-institute-test2.fi.gz differ