OPUS-MT-train/Makefile.doclevel
2020-01-10 16:45:42 +02:00

61 lines
1.9 KiB
Makefile

# -*-makefile-*-
## location of the zipped document-level MT benchmark (DiscoMT 2019 release on zenodo);
## a plain literal, so it is fixed once with a simple assignment
DOCLEVEL_BENCHMARK_DATA := https://zenodo.org/record/3525366/files/doclevel-MT-benchmark-discomt2019.zip
## use the doclevel benchmark data sets
##
## `make <goal>-ost` runs two sub-makes:
##   1. build all ost data files (target ost-datasets)
##   2. run <goal> (the target name without its -ost suffix) with the
##      language pair fixed to en/de, the ost-train/ost-dev/ost-test sets
##      selected, and the dev/test/heldout sizes overridden on the command line
%-ost:
	$(MAKE) ost-datasets
	$(MAKE) SRCLANGS=en TRGLANGS=de \
		TRAINSET=ost-train DEVSET=ost-dev TESTSET=ost-test \
		DEVSIZE=100000 TESTSIZE=100000 HELDOUTSIZE=0 \
		$(patsubst %-ost,%,$@)
## Aggregate target: all six ost data files (train/dev/test, each with a
## German and an English side). It has no recipe and never names a real
## file, so it is declared phony; a stray file called `ost-datasets` can
## then never be mistaken for this target.
.PHONY: ost-datasets
ost-datasets: \
		${DATADIR}/${PRE}/ost-train.de-en.clean.de.gz \
		${DATADIR}/${PRE}/ost-train.de-en.clean.en.gz \
		${DATADIR}/${PRE}/ost-dev.de-en.clean.de.gz \
		${DATADIR}/${PRE}/ost-dev.de-en.clean.en.gz \
		${DATADIR}/${PRE}/ost-test.de-en.clean.de.gz \
		${DATADIR}/${PRE}/ost-test.de-en.clean.en.gz
## The unpacked benchmark is only a stepping stone towards the ost-*.gz data
## files, so it is declared intermediate: make does not require it to exist
## once the files derived from it are in place.
.INTERMEDIATE: ${WORKHOME}/doclevel-MT-benchmark

## download the doc-level data set
##
## The archive is fetched to <target>.zip, unpacked into the directory that
## contains the target and removed again afterwards.
## NOTE(review): this assumes the zip has a top-level doclevel-MT-benchmark/
## directory, so that unpacking creates the target itself -- confirm.
${WORKHOME}/doclevel-MT-benchmark:
	mkdir -p ${dir $@}
	wget -O $@.zip '${DOCLEVEL_BENCHMARK_DATA}?download=1'
	unzip -d ${dir $@} $@.zip
	rm -f $@.zip
## Detokenize the benchmark files and store them gzip-compressed.
##
## One explicit rule with six independent targets
## (ost-{train,dev,test}.de-en.clean.{de,en}.gz); make runs the recipe
## separately for each of them. The data split and the language are read
## from the target's file name, e.g. for ost-train.de-en.clean.de.gz:
##   $(subst ., ,$(@F))  ->  ost-train de-en clean de gz
##   word 1 without its `ost-` prefix = split (train), word 4 = language (de)
## so the input file is <benchmark>/train/ost.tok.de.
$(foreach set,train dev test,$(foreach side,de en,\
    ${DATADIR}/${PRE}/ost-$(set).de-en.clean.$(side).gz)): \
    ${WORKHOME}/doclevel-MT-benchmark
	mkdir -p $(@D)
	$(TOKENIZER)/detokenizer.perl -l $(word 4,$(subst ., ,$(@F))) \
		< $</$(patsubst ost-%,%,$(firstword $(subst ., ,$(@F))))/ost.tok.$(word 4,$(subst ., ,$(@F))) \
		| gzip -c > $@