mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
finetuning and backtranslation
This commit is contained in:
parent
fe16a0c4dd
commit
e2ed3d85d1
@ -882,6 +882,9 @@ endif
|
||||
|
||||
|
||||
|
||||
## make symbolic links to spm-models
|
||||
## (previously we had data-specific models but now we want to re-use existing ones)
|
||||
|
||||
fix-spm-models:
|
||||
cd work-spm; \
|
||||
for l in ${ALL_LANG_PAIRS}; do \
|
||||
|
@ -11,35 +11,51 @@ include ../Makefile.slurm
|
||||
SRC = af
|
||||
TRG = en
|
||||
|
||||
## maximum input length (number sentence piece segments)
|
||||
MAX_LENGTH = 250
|
||||
## various sources are available
|
||||
## can be general wikipedia, wikinews, wikibooks, ...
|
||||
WIKISOURCE = wiki
|
||||
|
||||
## maximum input length (number sentence piece segments)
|
||||
## maximum number of sentences to be translated (top N lines)
|
||||
MAX_LENGTH = 100
|
||||
MAX_SENTENCES = 1000000
|
||||
|
||||
LANGPAIR = ${SRC}-${TRG}
|
||||
|
||||
|
||||
MODELHOME = ../models/${LANGPAIR}
|
||||
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*.zip}}}
|
||||
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||
|
||||
|
||||
LOAD_MODULES = module use -a /projappl/nlpl/software/modules/etc/ && \
|
||||
module load nlpl-udpipe nlpl-opus &&
|
||||
|
||||
WIKILANGS = ${sort $(patsubst >%wiki-,%,${shell grep -o '>..wiki-' index.html})} \
|
||||
${sort $(patsubst >%wiki-,%,${shell grep -o '>...wiki-' index.html})}
|
||||
WIKILANGS = ${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>..${WIKISOURCE}-' index.html})} \
|
||||
${sort $(patsubst >%${WIKISOURCE}-,%,${shell grep -o '>...${WIKISOURCE}-' index.html})}
|
||||
|
||||
|
||||
LANGID = ${SRC}
|
||||
WIKI_TXT = wiki.${LANGID}.gz
|
||||
WIKI_SRC = wiki.${LANGPAIR}.${SRC}.gz
|
||||
WIKI_PRE = wiki.${LANGPAIR}.${SRC}.spm.gz
|
||||
WIKI_TRG = wiki.${LANGPAIR}.${TRG}.gz
|
||||
WIKI_TXT = ${WIKISOURCE}.${LANGID}.gz
|
||||
WIKI_SRC = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${SRC}.gz
|
||||
WIKI_PRE = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz
|
||||
WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
|
||||
|
||||
## don't delete translated text if the process crashes
|
||||
.PRECIOUS: ${WIKI_TRG}
|
||||
|
||||
|
||||
## find wiki downloads
|
||||
WIKI_JSON = $(shell grep -o '${LANGID}wiki-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
|
||||
WIKI_JSON = $(shell grep -o '${LANGID}${WIKISOURCE}-[0-9]*-cirrussearch-content.json.gz' index.html | head -1)
|
||||
|
||||
## we don't need to keep the json file
|
||||
.INTERMEDIATE: ${WIKI_JSON}
|
||||
|
||||
|
||||
## find UDPipe model
|
||||
ifndef UDPIPE_MODELS
|
||||
UDPIPE_MODELS=/projappl/nlpl/software/modules/udpipe/1.2.1-devel/models
|
||||
endif
|
||||
LANGNAME = ${shell ${LOAD_MODULES} opus-iso639 -e ${LANGID} | \
|
||||
cut -f1 -d';' | tr ' ' '-' | tr '[:upper:]' '[:lower:]'}
|
||||
UDPIPE_MODEL = ${notdir $(shell ${LOAD_MODULES} find ${UDPIPE_MODELS}/ -name "${LANGNAME}*.udpipe" | head -1)}
|
||||
@ -51,11 +67,11 @@ all: index.html
|
||||
|
||||
all-wikis: index.html
|
||||
for l in ${WIKILANGS}; do \
|
||||
${MAKE} LANGID=$$l wiki-txt; \
|
||||
${MAKE} LANGID=$$l extract-text; \
|
||||
done
|
||||
|
||||
|
||||
wiki-txt: ${WIKI_TXT}
|
||||
extract-text: ${WIKI_TXT}
|
||||
prepare-model: ${LANGPAIR}/decoder.yml
|
||||
prepare-data: ${WIKI_PRE}
|
||||
translate: ${WIKI_SRC} ${WIKI_TRG}
|
||||
@ -66,8 +82,10 @@ print-names:
|
||||
echo ${WIKI_JSON}
|
||||
|
||||
|
||||
## fetch the latest model
|
||||
## ---> TODO: should we fetch from ObjectStorage instead?
|
||||
|
||||
${LANGPAIR}/decoder.yml:
|
||||
${LANGPAIR}/${MODELNAME}/decoder.yml:
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
cp ${MODELZIP} ${dir $@}
|
||||
@ -75,45 +93,66 @@ ifneq (${MODELZIP},)
|
||||
endif
|
||||
|
||||
|
||||
%.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz
|
||||
## pre-process data
|
||||
## ---> TODO: does that work for multilingual data that need prefix?
|
||||
|
||||
${LANGPAIR}/%_${MODELNAME}.${LANGPAIR}.${SRC}.spm.gz: %.${SRC}.gz
|
||||
ifneq (${MODELZIP},)
|
||||
${MAKE} ${LANGPAIR}/decoder.yml
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
zcat $< |\
|
||||
${LANGPAIR}/preprocess.sh af ${LANGPAIR}/source.spm |\
|
||||
perl -pe 'print if (split(/\s+/)>${MAX_LENGTH});' |\
|
||||
${LANGPAIR}/${MODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm |\
|
||||
perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
|
||||
head -${MAX_SENTENCES} |\
|
||||
gzip -c > $@
|
||||
endif
|
||||
|
||||
|
||||
## merge SentencePiece segments in the source text
|
||||
## (Why? because we filter out some data from the original wiki text, see above)
|
||||
|
||||
${WIKI_SRC}: ${WIKI_PRE}
|
||||
ifneq (${MODELZIP},)
|
||||
mkdir -p ${dir $@}
|
||||
zcat $< |\
|
||||
sed 's/ //g;s/▁/ /g' | \
|
||||
sed 's/^ *//;s/ *$$//' |\
|
||||
gzip -c > $@
|
||||
endif
|
||||
|
||||
|
||||
## translate
|
||||
|
||||
%.${LANGPAIR}.${TRG}.gz: %.${LANGPAIR}.${SRC}.spm.gz
|
||||
ifneq (${MODELZIP},)
|
||||
${MAKE} ${LANGPAIR}/decoder.yml
|
||||
${LOADMODS} && cd ${LANGPAIR} && ${MARIAN}/marian-decoder \
|
||||
mkdir -p ${dir $@}
|
||||
${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
|
||||
${LOADMODS} && cd ${LANGPAIR}/${MODELNAME} && ${MARIAN}/marian-decoder \
|
||||
-i ${PWD}/$< \
|
||||
-c decoder.yml \
|
||||
-d ${MARIAN_GPUS} \
|
||||
${MARIAN_DECODER_FLAGS} |\
|
||||
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
|
||||
gzip -c > ${PWD}/$@
|
||||
ifneq (${LANGPAIR},)
|
||||
ifneq (${MODELNAME},)
|
||||
rm -fr ${LANGPAIR}/${MODELNAME}
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
## index of all downloadable files
|
||||
index.html:
|
||||
wget -O $@ https://dumps.wikimedia.org/other/cirrussearch/current
|
||||
wget -nv -O $@ https://dumps.wikimedia.org/other/cirrussearch/current
|
||||
|
||||
## wiki in json format
|
||||
${WIKI_JSON}:
|
||||
wget https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}
|
||||
wget -nv https://dumps.wikimedia.org/other/cirrussearch/current/${WIKI_JSON}
|
||||
|
||||
|
||||
## check whether there is a UDPipe model
|
||||
## backoff to moses tools
|
||||
ifneq (${UDPIPE_MODEL},)
|
||||
SENTSPLITTER = udpipe --input=horizontal --tokenize \
|
||||
${UDPIPE_MODELS}/${UDPIPE_MODEL} |\
|
||||
|
@ -1,6 +1,15 @@
|
||||
|
||||
# Translate data as synthetic training data
|
||||
|
||||
Use Wiki data:
|
||||
|
||||
* json processor: https://stedolan.github.io/jq/
|
||||
* wiki JSON dumps: https://dumps.wikimedia.org/other/cirrussearch/current/
|
||||
|
||||
NOTE: this only works for SentencePiece models
|
||||
|
||||
|
||||
## TODO
|
||||
|
||||
* download base models from ObjectStorage
|
||||
* make it work with multilingual models (need to adjust preprocess-scripts for those models)
|
||||
|
225
finetune/Makefile
Normal file
225
finetune/Makefile
Normal file
@ -0,0 +1,225 @@
|
||||
#
|
||||
# fine-tune an existing model
|
||||
#
|
||||
# make news-tune-data ...... create tuning data from newstest sets
|
||||
# make all ................. tune and eval
|
||||
#
|
||||
#
|
||||
# NOTE: this only works for SentencePiece models
|
||||
#
|
||||
# TODO
|
||||
# - download base models from ObjectStorage
|
||||
# - make it work with multilingual models
|
||||
# --> need to adjust preprocess-scripts for those models
|
||||
#
|
||||
|
||||
include ../Makefile.env
|
||||
include ../Makefile.config
|
||||
include ../Makefile.slurm
|
||||
|
||||
|
||||
SRC = en
|
||||
TRG = de
|
||||
LANGPAIR = ${SRC}-${TRG}
|
||||
MODEL = news
|
||||
|
||||
TRAIN_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/train/*.${SRC}.gz)}
|
||||
DEV_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/dev/*.${SRC}.gz)}
|
||||
TEST_SRC = ${patsubst %.gz,%,$(wildcard ${LANGPAIR}/${MODEL}/test/*.${SRC}.gz)}
|
||||
|
||||
TRAIN_TRG = ${patsubst %.${SRC},%.${TRG},${TRAIN_SRC}}
|
||||
DEV_TRG = ${patsubst %.${SRC},%.${TRG},${DEV_SRC}}
|
||||
TEST_TRG = ${patsubst %.${SRC},%.${TRG},${TEST_SRC}}
|
||||
|
||||
|
||||
BASEMODELHOME = ../models/${LANGPAIR}
|
||||
BASEMODELZIP = ${lastword ${sort ${wildcard ${BASEMODELHOME}/*-20*.zip}}}
|
||||
BASEMODELNAME = ${patsubst %.zip,%,${notdir ${BASEMODELZIP}}}
|
||||
|
||||
|
||||
TUNED_MODEL = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.transformer.model
|
||||
TUNED_MODEL_VOCAB = ${LANGPAIR}/${MODEL}/model/${BASEMODELNAME}_${MODEL}.vocab.yml
|
||||
|
||||
|
||||
MARIAN_WORKSPACE = 5000
|
||||
MARIAN_VALID_FREQ = 100
|
||||
MARIAN_SAVE_FREQ = ${MARIAN_VALID_FREQ}
|
||||
MARIAN_DISP_FREQ = ${MARIAN_VALID_FREQ}
|
||||
MARIAN_EARLY_STOPPING = 5
|
||||
|
||||
|
||||
|
||||
|
||||
.PHONY: all
|
||||
all: ${TEST_SRC}.${TRG}.compare
|
||||
|
||||
|
||||
.PHONY: news-enfi
|
||||
news-enfi:
|
||||
${MAKE} SRC=en TRG=fi MODEL=news \
|
||||
TRAIN_SRC=en-fi/news/train/newstest2015-2018.en \
|
||||
TRAIN_TRG=en-fi/news/train/newstest2015-2018.fi \
|
||||
DEV_SRC=en-fi/news/dev/newsdev2015-enfi.en \
|
||||
DEV_TRG=en-fi/news/dev/newsdev2015-enfi.fi \
|
||||
TEST_SRC=en-fi/news/test/newstest2019-enfi.en \
|
||||
TEST_TRG=en-fi/news/test/newstest2019-enfi.fi \
|
||||
all
|
||||
|
||||
## fine-tune the fi-de base model on the Goethe Institute data:
## re-invokes this Makefile with explicit train/dev/test files and builds `all`
##
## goethe-fide is the name that was declared phony; goethe-ende is the name
## the rule historically carried. Both are kept so that either invocation
## works and both are treated as commands rather than files.
##
## NOTE(review): the data files added with this commit are listed under
## de-fi/goethe/, while the paths below point to fi-de/goethe/ -- confirm
## that the fi-de directory exists before running this target.

.PHONY: goethe-fide goethe-ende
goethe-fide goethe-ende:
	${MAKE} SRC=fi TRG=de MODEL=goethe \
		TRAIN_SRC=fi-de/goethe/train/goethe-institute-train.fi \
		TRAIN_TRG=fi-de/goethe/train/goethe-institute-train.de \
		DEV_SRC=fi-de/goethe/dev/goethe-institute-dev1.fi \
		DEV_TRG=fi-de/goethe/dev/goethe-institute-dev1.de \
		TEST_SRC=fi-de/goethe/test/goethe-institute-test1.fi \
		TEST_TRG=fi-de/goethe/test/goethe-institute-test1.de \
	all
|
||||
|
||||
|
||||
|
||||
## make news tuning data from testsets
|
||||
|
||||
TESTSETS_HOME = ../testsets/${LANGPAIR}
|
||||
NEWS_ALLSETS_SRC = ${sort ${wildcard ${TESTSETS_HOME}/news*.${SRC}.gz}}
|
||||
NEWS_ALLSETS_TRG = ${sort ${wildcard ${TESTSETS_HOME}/news*.${TRG}.gz}}
|
||||
NEWS_DEVSET_SRC = ${firstword ${NEWS_ALLSETS_SRC}}
|
||||
NEWS_DEVSET_TRG = ${firstword ${NEWS_ALLSETS_TRG}}
|
||||
NEWS_TESTSET_SRC = ${lastword ${NEWS_ALLSETS_SRC}}
|
||||
NEWS_TESTSET_TRG = ${lastword ${NEWS_ALLSETS_TRG}}
|
||||
NEWS_TRAINSET_SRC = ${filter-out ${NEWS_DEVSET_SRC} ${NEWS_TESTSET_SRC},${NEWS_ALLSETS_SRC}}
|
||||
NEWS_TRAINSET_TRG = ${filter-out ${NEWS_DEVSET_TRG} ${NEWS_TESTSET_TRG},${NEWS_ALLSETS_TRG}}
|
||||
|
||||
.PHONY: news-tune-data
|
||||
news-tune-data:
|
||||
ifneq (${words ${NEWS_ALLSETS_SRC}},0)
|
||||
ifneq (${words ${NEWS_ALLSETS_SRC}},1)
|
||||
ifneq (${words ${NEWS_ALLSETS_SRC}},2)
|
||||
mkdir -p ${LANGPAIR}/news/train
|
||||
mkdir -p ${LANGPAIR}/news/dev
|
||||
mkdir -p ${LANGPAIR}/news/test
|
||||
cp ${NEWS_TESTSET_SRC} ${LANGPAIR}/news/test/
|
||||
cp ${NEWS_TESTSET_TRG} ${LANGPAIR}/news/test/
|
||||
cp ${NEWS_DEVSET_SRC} ${LANGPAIR}/news/dev/
|
||||
cp ${NEWS_DEVSET_TRG} ${LANGPAIR}/news/dev/
|
||||
zcat ${NEWS_TRAINSET_SRC} | gzip -c > ${LANGPAIR}/news/train/news.${SRC}.gz
|
||||
zcat ${NEWS_TRAINSET_TRG} | gzip -c > ${LANGPAIR}/news/train/news.${TRG}.gz
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
.PHONY: data
|
||||
data: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz
|
||||
|
||||
.INTERMEDIATE: ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||
${LANGPAIR}/${BASEMODELNAME}/decoder.yml:
|
||||
mkdir -p ${dir $@}
|
||||
cp ${BASEMODELZIP} ${dir $@}
|
||||
cd ${dir $@} && unzip -u *.zip
|
||||
|
||||
${TRAIN_SRC}.pre.gz ${DEV_SRC}.pre.gz ${TEST_SRC}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||
zcat $< |\
|
||||
${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${SRC} ${LANGPAIR}/${BASEMODELNAME}/source.spm |\
|
||||
gzip -c > $@
|
||||
|
||||
## pre-process the target side of the training and validation data
## with the target SentencePiece model of the base model
##
## static pattern rule: <file>.pre.gz is made from <file>.gz; the unpacked
## base model (decoder.yml) is required because it provides preprocess.sh
## and target.spm
##
## the language argument is ${TRG} because these files contain target
## language text (the source-side rule passes ${SRC} in the same position)
## NOTE(review): assumes the first argument of preprocess.sh is the language
## ID of the text that is piped in -- confirm against the packaged script

${TRAIN_TRG}.pre.gz ${DEV_TRG}.pre.gz: %.pre.gz: %.gz ${LANGPAIR}/${BASEMODELNAME}/decoder.yml
	zcat $< |\
	${LANGPAIR}/${BASEMODELNAME}/preprocess.sh ${TRG} ${LANGPAIR}/${BASEMODELNAME}/target.spm |\
	gzip -c > $@
|
||||
|
||||
|
||||
|
||||
|
||||
.PHONY: tune
|
||||
tune: ${TUNED_MODEL}.done
|
||||
|
||||
## train transformer model
|
||||
${TUNED_MODEL}.npz.best-perplexity.npz: ${TUNED_MODEL}.done
|
||||
|
||||
${TUNED_MODEL}.done: ${TRAIN_SRC}.pre.gz ${TRAIN_TRG}.pre.gz ${DEV_SRC}.pre.gz ${DEV_TRG}.pre.gz \
|
||||
${LANGPAIR}/${BASEMODELNAME}/decoder.yml
|
||||
mkdir -p ${dir $@}
|
||||
if [ ! -e ${@:done=npz} ]; then \
|
||||
cp ${LANGPAIR}/${BASEMODELNAME}/*.npz ${@:done=npz}; \
|
||||
cp ${LANGPAIR}/${BASEMODELNAME}/*.vocab.yml ${TUNED_MODEL_VOCAB}; \
|
||||
fi
|
||||
${LOADMODS} && ${MARIAN}/marian ${MARIAN_EXTRA} \
|
||||
--model $(@:.done=.npz) \
|
||||
--type transformer \
|
||||
--train-sets ${word 1,$^} ${word 2,$^} ${MARIAN_TRAIN_WEIGHTS} \
|
||||
--max-length 500 \
|
||||
--vocabs ${TUNED_MODEL_VOCAB} ${TUNED_MODEL_VOCAB} \
|
||||
--mini-batch-fit \
|
||||
-w ${MARIAN_WORKSPACE} \
|
||||
--maxi-batch ${MARIAN_MAXI_BATCH} \
|
||||
--early-stopping ${MARIAN_EARLY_STOPPING} \
|
||||
--valid-freq ${MARIAN_VALID_FREQ} \
|
||||
--save-freq ${MARIAN_SAVE_FREQ} \
|
||||
--disp-freq ${MARIAN_DISP_FREQ} \
|
||||
--valid-sets ${word 3,$^} ${word 4,$^} \
|
||||
--valid-metrics perplexity \
|
||||
--valid-mini-batch ${MARIAN_VALID_MINI_BATCH} \
|
||||
--beam-size 12 --normalize 1 \
|
||||
--log $(@:.model.done=.train.log) --valid-log $(@:.model.done=.valid.log) \
|
||||
--enc-depth 6 --dec-depth 6 \
|
||||
--transformer-heads 8 \
|
||||
--transformer-postprocess-emb d \
|
||||
--transformer-postprocess dan \
|
||||
--transformer-dropout ${MARIAN_DROPOUT} \
|
||||
--label-smoothing 0.1 \
|
||||
--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
|
||||
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
|
||||
--tied-embeddings-all \
|
||||
--overwrite --keep-best \
|
||||
--devices ${MARIAN_GPUS} \
|
||||
--sync-sgd --seed ${SEED} \
|
||||
--sqlite \
|
||||
--tempdir ${TMPDIR} \
|
||||
--exponential-smoothing
|
||||
touch $@
|
||||
|
||||
|
||||
|
||||
.PHONY: translate
|
||||
translate: ${TEST_SRC}.${TRG}.gz
|
||||
|
||||
## translate test set
|
||||
${TEST_SRC}.${TRG}.gz: ${TEST_SRC}.pre.gz ${TUNED_MODEL}.npz.best-perplexity.npz
|
||||
mkdir -p ${dir $@}
|
||||
${LOADMODS} && ${MARIAN}/marian-decoder -i $< \
|
||||
-c ${word 2,$^}.decoder.yml \
|
||||
-d ${MARIAN_GPUS} \
|
||||
${MARIAN_DECODER_FLAGS} |\
|
||||
sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' |\
|
||||
gzip -c > $@
|
||||
|
||||
|
||||
|
||||
.PHONY: eval
|
||||
eval: ${TEST_SRC}.${TRG}.eval
|
||||
|
||||
## evaluate the translated test set against the reference translation
##   $<            translated test set (gzipped)
##   ${word 2,$^}  reference translation (gzipped); this is the declared
##                 prerequisite ${TEST_TRG}.gz, so the file that is read is
##                 exactly the file make checked for
## the reference is unpacked to a temporary file $@.ref, BLEU is written to
## $@, chrF is appended to the same file, and the temporary file is removed

${TEST_SRC}.${TRG}.eval: ${TEST_SRC}.${TRG}.gz ${TEST_TRG}.gz
	zcat ${word 2,$^} > $@.ref
	zcat $< | sacrebleu $@.ref > $@
	zcat $< | sacrebleu --metrics=chrf --width=3 $@.ref >> $@
	rm -f $@.ref
|
||||
|
||||
|
||||
|
||||
.PHONY: compare
|
||||
compare: ${TEST_SRC}.${TRG}.compare
|
||||
|
||||
## create a file for manual inspection that lists, for each test sentence,
## the source, the reference and the system translation on three
## consecutive lines, followed by an empty line
##   $@.1  source text        (from ${TEST_SRC}.gz)
##   $@.2  reference          (from ${TEST_TRG}.gz)
##   $@.3  system translation (the .gz file that belongs to the .eval prerequisite)
## `paste -d "\n"` interleaves the three files line by line;
## the first sed turns the XML character entities back into plain characters
## (&amp; is handled last so that already unescaped text is not decoded twice);
## `sed 'n;n;G;'` appends an empty line after every third line

${TEST_SRC}.${TRG}.compare: ${TEST_SRC}.${TRG}.eval
	zcat ${TEST_SRC}.gz > $@.1
	zcat ${TEST_TRG}.gz > $@.2
	zcat ${<:.eval=.gz} > $@.3
	paste -d "\n" $@.1 $@.2 $@.3 |\
	sed 	-e "s/&apos;/'/g" \
		-e 's/&quot;/"/g' \
		-e 's/&lt;/</g' \
		-e 's/&gt;/>/g' \
		-e 's/&amp;/&/g' |\
	sed 'n;n;G;' > $@
	rm -f $@.1 $@.2 $@.3
|
12
finetune/README.md
Normal file
12
finetune/README.md
Normal file
@ -0,0 +1,12 @@
|
||||
|
||||
# Model fine-tuning
|
||||
|
||||
Scripts for fine-tuning transformer models using some small in-domain data.
|
||||
|
||||
* NOTE: this only works for SentencePiece models
|
||||
|
||||
|
||||
## TODO
|
||||
|
||||
* download base models from ObjectStorage
|
||||
* make it work with multilingual models (need to adjust preprocess-scripts for those models)
|
BIN
finetune/de-fi/goethe/dev/goethe-institute-dev1.de.gz
Normal file
BIN
finetune/de-fi/goethe/dev/goethe-institute-dev1.de.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/dev/goethe-institute-dev1.fi.gz
Normal file
BIN
finetune/de-fi/goethe/dev/goethe-institute-dev1.fi.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/dev/goethe-institute-dev2.de.gz
Normal file
BIN
finetune/de-fi/goethe/dev/goethe-institute-dev2.de.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/dev/goethe-institute-dev2.fi.gz
Normal file
BIN
finetune/de-fi/goethe/dev/goethe-institute-dev2.fi.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/test/goethe-institute-test1.de.fi.gz
Normal file
BIN
finetune/de-fi/goethe/test/goethe-institute-test1.de.fi.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/test/goethe-institute-test1.de.gz
Normal file
BIN
finetune/de-fi/goethe/test/goethe-institute-test1.de.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/test/goethe-institute-test1.fi.gz
Normal file
BIN
finetune/de-fi/goethe/test/goethe-institute-test1.fi.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/test/goethe-institute-test2.de.gz
Normal file
BIN
finetune/de-fi/goethe/test/goethe-institute-test2.de.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/test/goethe-institute-test2.fi.gz
Normal file
BIN
finetune/de-fi/goethe/test/goethe-institute-test2.fi.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/train/goethe-institute-train.de.gz
Normal file
BIN
finetune/de-fi/goethe/train/goethe-institute-train.de.gz
Normal file
Binary file not shown.
BIN
finetune/de-fi/goethe/train/goethe-institute-train.fi.gz
Normal file
BIN
finetune/de-fi/goethe/train/goethe-institute-train.fi.gz
Normal file
Binary file not shown.
BIN
finetune/en-fi/news/dev/newsdev2015-enfi.en.gz
Normal file
BIN
finetune/en-fi/news/dev/newsdev2015-enfi.en.gz
Normal file
Binary file not shown.
BIN
finetune/en-fi/news/dev/newsdev2015-enfi.fi.gz
Normal file
BIN
finetune/en-fi/news/dev/newsdev2015-enfi.fi.gz
Normal file
Binary file not shown.
BIN
finetune/en-fi/news/test/newstest2019-enfi.en.fi.gz
Normal file
BIN
finetune/en-fi/news/test/newstest2019-enfi.en.fi.gz
Normal file
Binary file not shown.
BIN
finetune/en-fi/news/test/newstest2019-enfi.en.gz
Normal file
BIN
finetune/en-fi/news/test/newstest2019-enfi.en.gz
Normal file
Binary file not shown.
BIN
finetune/en-fi/news/test/newstest2019-enfi.fi.gz
Normal file
BIN
finetune/en-fi/news/test/newstest2019-enfi.fi.gz
Normal file
Binary file not shown.
BIN
finetune/en-fi/news/train/newstest2015-2018.en.gz
Normal file
BIN
finetune/en-fi/news/train/newstest2015-2018.en.gz
Normal file
Binary file not shown.
BIN
finetune/en-fi/news/train/newstest2015-2018.fi.gz
Normal file
BIN
finetune/en-fi/news/train/newstest2015-2018.fi.gz
Normal file
Binary file not shown.
BIN
finetune/fi-en/news/dev/newsdev2015-enfi.en.gz
Normal file
BIN
finetune/fi-en/news/dev/newsdev2015-enfi.en.gz
Normal file
Binary file not shown.
BIN
finetune/fi-en/news/dev/newsdev2015-enfi.fi.gz
Normal file
BIN
finetune/fi-en/news/dev/newsdev2015-enfi.fi.gz
Normal file
Binary file not shown.
BIN
finetune/fi-en/news/test/newstest2019-enfi.en.gz
Normal file
BIN
finetune/fi-en/news/test/newstest2019-enfi.en.gz
Normal file
Binary file not shown.
BIN
finetune/fi-en/news/test/newstest2019-enfi.fi.gz
Normal file
BIN
finetune/fi-en/news/test/newstest2019-enfi.fi.gz
Normal file
Binary file not shown.
BIN
finetune/fi-en/news/train/newstest2015-2018.en.gz
Normal file
BIN
finetune/fi-en/news/train/newstest2015-2018.en.gz
Normal file
Binary file not shown.
BIN
finetune/fi-en/news/train/newstest2015-2018.fi.gz
Normal file
BIN
finetune/fi-en/news/train/newstest2015-2018.fi.gz
Normal file
Binary file not shown.
BIN
testsets/de-fi/goethe-institute-test1.de.gz
Normal file
BIN
testsets/de-fi/goethe-institute-test1.de.gz
Normal file
Binary file not shown.
BIN
testsets/de-fi/goethe-institute-test1.fi.gz
Normal file
BIN
testsets/de-fi/goethe-institute-test1.fi.gz
Normal file
Binary file not shown.
BIN
testsets/de-fi/goethe-institute-test2.de.gz
Normal file
BIN
testsets/de-fi/goethe-institute-test2.de.gz
Normal file
Binary file not shown.
BIN
testsets/de-fi/goethe-institute-test2.fi.gz
Normal file
BIN
testsets/de-fi/goethe-institute-test2.fi.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user