mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
simplification evaluation with BLEU
This commit is contained in:
parent
3f57e4f873
commit
d13a9461f0
8
Makefile
8
Makefile
@ -185,9 +185,9 @@ eval-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.
|
|||||||
## and all tokenized test sets that can be found in that directory
|
## and all tokenized test sets that can be found in that directory
|
||||||
TESTSET_HOME = ${PWD}/testsets
|
TESTSET_HOME = ${PWD}/testsets
|
||||||
TESTSET_DIR = ${TESTSET_HOME}/${SRC}-${TRG}
|
TESTSET_DIR = ${TESTSET_HOME}/${SRC}-${TRG}
|
||||||
TESTSETS = $(sort $(patsubst ${TESTSET_DIR}/%.${SRC}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.gz}))
|
TESTSETS = $(sort $(patsubst ${TESTSET_DIR}/%.${SRCEXT}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRCEXT}.gz}))
|
||||||
TESTSETS_PRESRC = $(patsubst %,${TESTSET_DIR}/%.${SRC}.${PRE}.gz,${TESTSETS})
|
TESTSETS_PRESRC = $(patsubst %,${TESTSET_DIR}/%.${SRCEXT}.${PRE}.gz,${TESTSETS})
|
||||||
TESTSETS_PRETRG = $(patsubst %,${TESTSET_DIR}/%.${TRG}.${PRE}.gz,${TESTSETS})
|
TESTSETS_PRETRG = $(patsubst %,${TESTSET_DIR}/%.${TRGEXT}.${PRE}.gz,${TESTSETS})
|
||||||
|
|
||||||
# TESTSETS_PRESRC = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})})
|
# TESTSETS_PRESRC = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})})
|
||||||
# TESTSETS_PRETRG = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${TRG}.gz})})
|
# TESTSETS_PRETRG = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${TRG}.gz})})
|
||||||
@ -204,7 +204,7 @@ eval-heldout:
|
|||||||
${MAKE} TESTSET_HOME=${HELDOUT_DIR} eval-testsets
|
${MAKE} TESTSET_HOME=${HELDOUT_DIR} eval-testsets
|
||||||
|
|
||||||
%-testsets-langpair: ${TESTSETS_PRESRC} ${TESTSETS_PRETRG}
|
%-testsets-langpair: ${TESTSETS_PRESRC} ${TESTSETS_PRETRG}
|
||||||
@echo "testsets: ${TESTSET_DIR}/*.${SRC}.gz"
|
@echo "testsets: ${TESTSET_DIR}/*.${SRCEXT}.gz"
|
||||||
for t in ${TESTSETS}; do \
|
for t in ${TESTSETS}; do \
|
||||||
${MAKE} TESTSET=$$t ${@:-testsets-langpair=}; \
|
${MAKE} TESTSET=$$t ${@:-testsets-langpair=}; \
|
||||||
done
|
done
|
||||||
|
@ -184,6 +184,10 @@ ifndef DATASET
|
|||||||
DATASET = opus
|
DATASET = opus
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifndef BPEMODELNAME
|
||||||
|
BPEMODELNAME = opus
|
||||||
|
endif
|
||||||
|
|
||||||
##-------------------------------------
|
##-------------------------------------
|
||||||
## OLD OLD OLD
|
## OLD OLD OLD
|
||||||
## name of the data set (and the model)
|
## name of the data set (and the model)
|
||||||
|
@ -99,6 +99,9 @@ endif
|
|||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
ifndef OLDMODELTYPE
|
ifndef OLDMODELTYPE
|
||||||
OLDMODELTYPE=transformer-align
|
OLDMODELTYPE=transformer-align
|
||||||
endif
|
endif
|
||||||
@ -108,6 +111,8 @@ ifndef NEWMODELTYPE
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
## TODO: this does not seem to work as the config does not match
|
||||||
|
## (optimiser cannot continue to run ....)
|
||||||
## move model files to a new name
|
## move model files to a new name
|
||||||
## (useful if using as starting point for another model type,
|
## (useful if using as starting point for another model type,
|
||||||
## for example, continue training without guided alignment)
|
## for example, continue training without guided alignment)
|
||||||
@ -557,9 +562,9 @@ ${TEST_SRC}: ${DEV_SRC}
|
|||||||
ifneq (${TESTSET},${DEVSET})
|
ifneq (${TESTSET},${DEVSET})
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
rm -f ${TEST_SRC} ${TEST_TRG}
|
rm -f ${TEST_SRC} ${TEST_TRG}
|
||||||
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz ]; then \
|
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz ]; then \
|
||||||
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz \
|
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz \
|
||||||
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRG}.${PRE}.gz \
|
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRGEXT}.${PRE}.gz \
|
||||||
add-to-test-data; \
|
add-to-test-data; \
|
||||||
else \
|
else \
|
||||||
for s in ${SRCLANGS}; do \
|
for s in ${SRCLANGS}; do \
|
||||||
@ -576,9 +581,9 @@ ifneq (${TESTSET},${DEVSET})
|
|||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz ]; then \
|
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz ]; then \
|
||||||
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz \
|
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz \
|
||||||
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRG}.${PRE}.gz \
|
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRGEXT}.${PRE}.gz \
|
||||||
add-to-test-data; \
|
add-to-test-data; \
|
||||||
elif (( `zcat $<.shuffled.gz | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \
|
elif (( `zcat $<.shuffled.gz | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \
|
||||||
zcat $<.shuffled.gz | cut -f1 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_SRC}; \
|
zcat $<.shuffled.gz | cut -f1 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_SRC}; \
|
||||||
@ -714,6 +719,13 @@ add-to-local-mono-data:
|
|||||||
### use SRC_CLEANUP_SCRIPTS TRG_CLEANUP_SCRIPTS
|
### use SRC_CLEANUP_SCRIPTS TRG_CLEANUP_SCRIPTS
|
||||||
|
|
||||||
## only normalisation
|
## only normalisation
|
||||||
|
## Normalise a gzipped text file: pipe it through the tokenizer scripts
## replace-unicode-punctuation, remove-non-printing-char and
## normalize-punctuation, then squeeze runs of spaces into a single space
## and strip leading/trailing spaces. The result is written gzipped to $@.
##
## NOTE: the first sed expression needs TWO spaces before the star
## ('s/  */ /g'). With a single space the pattern also matches the empty
## string and a space would be inserted between all characters.
%.norm.gz: %.gz
	$(LOAD_MOSES) zcat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
||||||
|
|
||||||
%.norm: %.raw
|
%.norm: %.raw
|
||||||
$(LOAD_MOSES) cat $< |\
|
$(LOAD_MOSES) cat $< |\
|
||||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||||
@ -737,6 +749,13 @@ add-to-local-mono-data:
|
|||||||
|
|
||||||
|
|
||||||
## minimal pre-processing
|
## minimal pre-processing
|
||||||
|
## Minimal pre-processing of a gzipped text file: pipe it through the
## tokenizer scripts replace-unicode-punctuation, remove-non-printing-char
## and deescape-special-chars, then squeeze runs of spaces into a single
## space and strip leading/trailing spaces. The result is written gzipped
## to $@.
##
## NOTE: the first sed expression needs TWO spaces before the star
## ('s/  */ /g'). With a single space the pattern also matches the empty
## string and a space would be inserted between all characters.
%.simple.gz: %.gz
	$(LOAD_MOSES) zcat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
||||||
|
|
||||||
%.simple: %.raw
|
%.simple: %.raw
|
||||||
$(LOAD_MOSES) cat $< |\
|
$(LOAD_MOSES) cat $< |\
|
||||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||||
@ -871,8 +890,8 @@ tokenize-testsets prepare-testsets: ${ALLTEST}
|
|||||||
## NEW: always use the same name for the BPE models
|
## NEW: always use the same name for the BPE models
|
||||||
## --> avoid overwriting validation/test data with new segmentation models
|
## --> avoid overwriting validation/test data with new segmentation models
|
||||||
## if a new data set is used
|
## if a new data set is used
|
||||||
BPESRCMODEL = ${WORKDIR}/train/opus.src.bpe${SRCBPESIZE:000=}k-model
|
BPESRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
|
||||||
BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
|
BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
|
||||||
|
|
||||||
|
|
||||||
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
|
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
|
||||||
@ -954,8 +973,8 @@ endif
|
|||||||
## NEW: always use the same name for the SPM models
|
## NEW: always use the same name for the SPM models
|
||||||
## --> avoid overwriting validation/test data with new segmentation models
|
## --> avoid overwriting validation/test data with new segmentation models
|
||||||
## if a new data set is used
|
## if a new data set is used
|
||||||
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
|
SPMSRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
|
||||||
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
|
SPMTRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
|
||||||
# SPMEXTRA = --split_by_whitespace=false
|
# SPMEXTRA = --split_by_whitespace=false
|
||||||
SPMEXTRA =
|
SPMEXTRA =
|
||||||
|
|
||||||
@ -1021,14 +1040,14 @@ endif
|
|||||||
|
|
||||||
|
|
||||||
## sentence piece model trained on monolingual data
|
## sentence piece model trained on monolingual data
|
||||||
SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
|
SPMMODEL = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k-model
|
||||||
SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
|
SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k-model
|
||||||
SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model
|
SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k-model
|
||||||
|
|
||||||
## vocabulary files created from monolingual data
|
## vocabulary files created from monolingual data
|
||||||
SPMVOCAB = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k.vocab.yml
|
SPMVOCAB = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k.vocab.yml
|
||||||
SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k.vocab.yml
|
SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k.vocab.yml
|
||||||
SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k.vocab.yml
|
SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k.vocab.yml
|
||||||
|
|
||||||
.PRECIOUS: ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO} ${SPMVOCAB}
|
.PRECIOUS: ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO} ${SPMVOCAB}
|
||||||
|
|
||||||
|
@ -2,6 +2,48 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## evaluation tool
|
||||||
|
## fails on puhti
|
||||||
|
easse:
|
||||||
|
git clone https://github.com/feralvam/easse.git
|
||||||
|
cd $@ && pip install --user .
|
||||||
|
|
||||||
|
## do we need this?
|
||||||
|
## Clone facebookresearch/text-simplification-evaluation into a directory
## named like the target and install it for the current user (editable
## install plus the packages listed in its requirements.txt).
##
## Every recipe line runs in its own shell, so a stand-alone `cd` does not
## affect the following lines. The directory change and both pip calls are
## therefore chained with && in one shell command, which also stops at the
## first failing step.
text-simplification-evaluation:
	git clone git@github.com:facebookresearch/text-simplification-evaluation.git
	cd $@ && \
	pip install -e . --user && \
	pip install --user -r requirements.txt
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#---------------------------------------------------------------------
|
||||||
|
# simplification test set
|
||||||
|
#---------------------------------------------------------------------
|
||||||
|
|
||||||
|
simplification:
|
||||||
|
git clone https://github.com/cocoxu/simplification.git
|
||||||
|
|
||||||
|
testsets/en-en/simplification.en1.gz: simplification
|
||||||
|
mkdir -p ${dir $@}
|
||||||
|
cut -f2 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\
|
||||||
|
${TOKENIZER}/detokenizer.perl -l en | \
|
||||||
|
gzip -c > $@
|
||||||
|
|
||||||
|
testsets/en-en/simplification.en2.gz: simplification
|
||||||
|
mkdir -p ${dir $@}
|
||||||
|
cut -f3 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\
|
||||||
|
${TOKENIZER}/detokenizer.perl -l en | \
|
||||||
|
gzip -c > $@
|
||||||
|
|
||||||
|
simplify-testset: testsets/en-en/simplification.en1.gz testsets/en-en/simplification.en2.gz
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#---------------------------------------------------------------------
|
||||||
|
# data from https://cs.pomona.edu/~dkauchak/simplification/
|
||||||
|
#---------------------------------------------------------------------
|
||||||
|
|
||||||
SIMPLEWIKI_DATA1_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v1/
|
SIMPLEWIKI_DATA1_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v1/
|
||||||
SIMPLEWIKI_DATA2_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v2/
|
SIMPLEWIKI_DATA2_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v2/
|
||||||
|
|
||||||
@ -10,6 +52,8 @@ SIMPLEWIKI_DATA2_SENT = sentence-aligned.v2
|
|||||||
SIMPLEWIKI_DATA2_DOC = document-aligned.v2
|
SIMPLEWIKI_DATA2_DOC = document-aligned.v2
|
||||||
|
|
||||||
|
|
||||||
|
# v1 - standard split
|
||||||
|
|
||||||
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
|
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz
|
wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz
|
||||||
@ -23,24 +67,84 @@ ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
|
|||||||
${TOKENIZER}/detokenizer.perl -l en < $@/simple.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en2.raw
|
${TOKENIZER}/detokenizer.perl -l en < $@/simple.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en2.raw
|
||||||
|
|
||||||
|
|
||||||
${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en1.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/normal.%.txt
|
## v2 - sentence aligned - my split
|
||||||
mkdir -p ${dir $@}
|
|
||||||
${TOKENIZER}/detokenizer.perl -l en < $< > $@
|
|
||||||
|
|
||||||
${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en2.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/simple.%.txt
|
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}:
|
||||||
mkdir -p ${dir $@}
|
mkdir -p ${dir $@}
|
||||||
${TOKENIZER}/detokenizer.perl -l en < $< > $@
|
wget -O $@.tar.gz ${SIMPLEWIKI_DATA2_URL}/${SIMPLEWIKI_DATA2_SENT}.tar.gz
|
||||||
|
tar -C ${dir $@} -xzf $@.tar.gz
|
||||||
|
rm -f $@.tar.gz
|
||||||
|
cut -f3 $@/normal.aligned | tail -n +10001 |\
|
||||||
|
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en1.raw
|
||||||
|
cut -f3 $@/simple.aligned | tail -n +10001 |\
|
||||||
|
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en2.raw
|
||||||
|
cut -f3 $@/normal.aligned | head -10000 | tail -5000 |\
|
||||||
|
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en1.raw
|
||||||
|
cut -f3 $@/simple.aligned | head -10000 | tail -5000 |\
|
||||||
|
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en2.raw
|
||||||
|
cut -f3 $@/normal.aligned | head -5000 |\
|
||||||
|
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en1.raw
|
||||||
|
cut -f3 $@/simple.aligned | head -5000 |\
|
||||||
|
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en2.raw
|
||||||
|
|
||||||
simplify-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
|
|
||||||
|
simplewiki-v1-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
|
||||||
|
|
||||||
## train a simplification model from simplewiki for English
|
## train a simplification model from simplewiki for English
|
||||||
|
|
||||||
%-simplify-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
|
%-simplewiki-v1-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
|
||||||
|
rm -f ${WORKDIR}/*.submit
|
||||||
${MAKE} DATASET=simplewiki_v1 \
|
${MAKE} DATASET=simplewiki_v1 \
|
||||||
|
BPEMODELNAME=simplewiki_v1 \
|
||||||
TRAINSET=simplewiki_v1-training \
|
TRAINSET=simplewiki_v1-training \
|
||||||
DEVSET=simplewiki_v1-tuning \
|
DEVSET=simplewiki_v1-tuning \
|
||||||
TESTSET=simplewiki_v1-testing \
|
TESTSET=simplewiki_v1-testing \
|
||||||
HELDOUTSIZE=0 \
|
HELDOUTSIZE=0 \
|
||||||
SRCLANGS=en TRGLANGS=en \
|
SRCLANGS=en TRGLANGS=en \
|
||||||
${@:-simplify-english=}
|
${@:-simplewiki-v1-english=}
|
||||||
|
|
||||||
|
%-simplewiki-v2sent-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}
|
||||||
|
rm -f ${WORKDIR}/*.submit
|
||||||
|
${MAKE} DATASET=simplewiki_v2_sent \
|
||||||
|
BPEMODELNAME=simplewiki_v2_sent \
|
||||||
|
TRAINSET=simplewiki_v2_sent-training \
|
||||||
|
DEVSET=simplewiki_v2_sent-tuning \
|
||||||
|
TESTSET=simplewiki_v2_sent-testing \
|
||||||
|
HELDOUTSIZE=0 \
|
||||||
|
SRCLANGS=en TRGLANGS=en \
|
||||||
|
${@:-simplewiki-v2sent-english=}
|
||||||
|
|
||||||
|
|
||||||
|
#---------------------------------------------------------------------
|
||||||
|
# data from https://github.com/XingxingZhang/dress
|
||||||
|
#---------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
SIMPLEWIKI_LARGE_URL = https://github.com/louismartin/dress-data/raw/master/data-simplification.tar.bz2
|
||||||
|
SIMPLEWIKI_LARGE = data-simplification/wikilarge
|
||||||
|
|
||||||
|
|
||||||
|
## Download the archive from SIMPLEWIKI_LARGE_URL, unpack it and write
## detokenized train/tune/test files into ${DATADIR}/${PRE}:
##
##   wiki.full.aner.{train,valid,test}.src --> simplewiki_large-*.en-en.en1.raw
##   wiki.full.aner.{train,valid,test}.dst --> simplewiki_large-*.en-en.en2.raw
##
## The en2 files must be read from the .dst files for all three splits;
## reading .src there would make the tune/test references identical to
## their inputs.
##
## NOTE(review): the archive is unpacked into ${dir $@}; this assumes that
## the tarball layout makes the files appear under $@ -- TODO confirm.
${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}:
	mkdir -p ${dir $@}
	mkdir -p ${DATADIR}/${PRE}
	wget -O $@.tar.bz2 ${SIMPLEWIKI_LARGE_URL}
	tar -C ${dir $@} -xf $@.tar.bz2
	rm -f $@.tar.bz2
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.src > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en1.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.dst > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en2.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.src > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en1.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.dst > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en2.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.src > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en1.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.dst > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en2.raw
|
||||||
|
|
||||||
|
simplelarge-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}
|
||||||
|
|
||||||
|
%-simplewikilarge-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}
|
||||||
|
rm -f ${WORKDIR}/*.submit
|
||||||
|
${MAKE} DATASET=simplewiki_large \
|
||||||
|
BPEMODELNAME=simplewiki_large \
|
||||||
|
TRAINSET=simplewiki_large-train \
|
||||||
|
DEVSET=simplewiki_large-tune \
|
||||||
|
TESTSET=simplewiki_large-test \
|
||||||
|
HELDOUTSIZE=0 \
|
||||||
|
SRCLANGS=en TRGLANGS=en \
|
||||||
|
${@:-simplewikilarge-english=}
|
||||||
|
Loading…
Reference in New Issue
Block a user