mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2025-01-07 18:26:52 +03:00
61 lines
1.9 KiB
Makefile
61 lines
1.9 KiB
Makefile
# -*-makefile-*-
|
|
|
|
|
|
## URL of the zip archive (hosted on zenodo.org) that holds the
## document-level MT benchmark data
DOCLEVEL_BENCHMARK_DATA := https://zenodo.org/record/3525366/files/doclevel-MT-benchmark-discomt2019.zip
|
|
|
|
|
|
## use the doclevel benchmark data sets
##
## `make <target>-ost` first makes sure the OST data files exist
## (ost-datasets) and then re-invokes make for <target> (the goal name
## with its "-ost" suffix stripped) with SRCLANGS=en, TRGLANGS=de and
## the ost-train / ost-dev / ost-test sets selected.
%-ost:
	$(MAKE) ost-datasets
	$(MAKE) SRCLANGS=en TRGLANGS=de \
		TRAINSET=ost-train DEVSET=ost-dev TESTSET=ost-test \
		DEVSIZE=100000 TESTSIZE=100000 HELDOUTSIZE=0 \
		$(patsubst %-ost,%,$@)
|
|
|
|
|
|
|
|
|
|
## Aggregate target: requests all six OST data files
## (train, dev and test; de and en side of each).
## It has no recipe of its own and never creates a file called
## "ost-datasets", so it is declared phony.
.PHONY: ost-datasets
ost-datasets:	${DATADIR}/${PRE}/ost-train.de-en.clean.de.gz \
		${DATADIR}/${PRE}/ost-train.de-en.clean.en.gz \
		${DATADIR}/${PRE}/ost-dev.de-en.clean.de.gz \
		${DATADIR}/${PRE}/ost-dev.de-en.clean.en.gz \
		${DATADIR}/${PRE}/ost-test.de-en.clean.de.gz \
		${DATADIR}/${PRE}/ost-test.de-en.clean.en.gz
|
|
|
|
|
|
## Treat the unpacked benchmark as an intermediate target: if it is
## missing, that alone does not force the data files that depend on it
## to be rebuilt (and the data to be downloaded again).
## NOTE(review): the rules in this file use this target as a directory
## (they read files below it); confirm whether make's automatic removal
## of intermediates actually cleans up a directory.
.INTERMEDIATE: ${WORKHOME}/doclevel-MT-benchmark
|
|
|
|
## download the doc-level data set
##
## Fetches the zip archive named by DOCLEVEL_BENCHMARK_DATA, unpacks it
## into the directory that contains the target and removes the archive.
## Assumes the archive unpacks to a top-level entry named like the
## target (doclevel-MT-benchmark) -- TODO confirm.
${WORKHOME}/doclevel-MT-benchmark:
	mkdir -p ${dir $@}
# the variable has to be expanded with ${...}; the URL is quoted because
# it contains '?', which the shell would otherwise treat as a glob
	wget -O $@.zip '${DOCLEVEL_BENCHMARK_DATA}?download=1'
	unzip -d ${dir $@} $@.zip
	rm -f $@.zip
|
|
|
|
## ost-train, de side: detokenize train/ost.tok.de from the unpacked
## benchmark (detokenizer run with -l de) and store it gzip-compressed
$(DATADIR)/$(PRE)/ost-train.de-en.clean.de.gz: $(WORKHOME)/doclevel-MT-benchmark
	mkdir -p $(dir $@)
	$(TOKENIZER)/detokenizer.perl -l de < $</train/ost.tok.de \
	| gzip -c > $@
|
|
|
|
## ost-train, en side: detokenize train/ost.tok.en from the unpacked
## benchmark (detokenizer run with -l en) and store it gzip-compressed
$(DATADIR)/$(PRE)/ost-train.de-en.clean.en.gz: $(WORKHOME)/doclevel-MT-benchmark
	mkdir -p $(dir $@)
	$(TOKENIZER)/detokenizer.perl -l en < $</train/ost.tok.en \
	| gzip -c > $@
|
|
|
|
## ost-dev, de side: detokenize dev/ost.tok.de from the unpacked
## benchmark (detokenizer run with -l de) and store it gzip-compressed
$(DATADIR)/$(PRE)/ost-dev.de-en.clean.de.gz: $(WORKHOME)/doclevel-MT-benchmark
	mkdir -p $(dir $@)
	$(TOKENIZER)/detokenizer.perl -l de < $</dev/ost.tok.de \
	| gzip -c > $@
|
|
|
|
## ost-dev, en side: detokenize dev/ost.tok.en from the unpacked
## benchmark (detokenizer run with -l en) and store it gzip-compressed
$(DATADIR)/$(PRE)/ost-dev.de-en.clean.en.gz: $(WORKHOME)/doclevel-MT-benchmark
	mkdir -p $(dir $@)
	$(TOKENIZER)/detokenizer.perl -l en < $</dev/ost.tok.en \
	| gzip -c > $@
|
|
|
|
## ost-test, de side: detokenize test/ost.tok.de from the unpacked
## benchmark (detokenizer run with -l de) and store it gzip-compressed
$(DATADIR)/$(PRE)/ost-test.de-en.clean.de.gz: $(WORKHOME)/doclevel-MT-benchmark
	mkdir -p $(dir $@)
	$(TOKENIZER)/detokenizer.perl -l de < $</test/ost.tok.de \
	| gzip -c > $@
|
|
|
|
## ost-test, en side: detokenize test/ost.tok.en from the unpacked
## benchmark (detokenizer run with -l en) and store it gzip-compressed
$(DATADIR)/$(PRE)/ost-test.de-en.clean.en.gz: $(WORKHOME)/doclevel-MT-benchmark
	mkdir -p $(dir $@)
	$(TOKENIZER)/detokenizer.perl -l en < $</test/ost.tok.en \
	| gzip -c > $@
|
|
|
|
|