simplification evaluation with BLEU

This commit is contained in:
Joerg Tiedemann 2020-03-01 00:25:05 +02:00
parent 3f57e4f873
commit d13a9461f0
4 changed files with 155 additions and 28 deletions

View File

@ -185,9 +185,9 @@ eval-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.
## and all tokenized test sets that can be found in that directory
TESTSET_HOME = ${PWD}/testsets
TESTSET_DIR = ${TESTSET_HOME}/${SRC}-${TRG}
TESTSETS = $(sort $(patsubst ${TESTSET_DIR}/%.${SRC}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.gz}))
TESTSETS_PRESRC = $(patsubst %,${TESTSET_DIR}/%.${SRC}.${PRE}.gz,${TESTSETS})
TESTSETS_PRETRG = $(patsubst %,${TESTSET_DIR}/%.${TRG}.${PRE}.gz,${TESTSETS})
TESTSETS = $(sort $(patsubst ${TESTSET_DIR}/%.${SRCEXT}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRCEXT}.gz}))
TESTSETS_PRESRC = $(patsubst %,${TESTSET_DIR}/%.${SRCEXT}.${PRE}.gz,${TESTSETS})
TESTSETS_PRETRG = $(patsubst %,${TESTSET_DIR}/%.${TRGEXT}.${PRE}.gz,${TESTSETS})
# TESTSETS_PRESRC = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})})
# TESTSETS_PRETRG = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${TRG}.gz})})
@ -204,7 +204,7 @@ eval-heldout:
${MAKE} TESTSET_HOME=${HELDOUT_DIR} eval-testsets
%-testsets-langpair: ${TESTSETS_PRESRC} ${TESTSETS_PRETRG}
@echo "testsets: ${TESTSET_DIR}/*.${SRC}.gz"
@echo "testsets: ${TESTSET_DIR}/*.${SRCEXT}.gz"
for t in ${TESTSETS}; do \
${MAKE} TESTSET=$$t ${@:-testsets-langpair=}; \
done

View File

@ -184,6 +184,10 @@ ifndef DATASET
DATASET = opus
endif
ifndef BPEMODELNAME
BPEMODELNAME = opus
endif
##-------------------------------------
## OLD OLD OLD
## name of the data set (and the model)

View File

@ -99,6 +99,9 @@ endif
endif
endif
ifndef OLDMODELTYPE
OLDMODELTYPE=transformer-align
endif
@ -108,6 +111,8 @@ ifndef NEWMODELTYPE
endif
## TODO: this does not seem to work as the config does not match
## (optimiser cannot continue to run ....)
## move model files to a new name
## (useful if using as starting point for another modeltype
## for example, continue training without guided alignment)
@ -557,9 +562,9 @@ ${TEST_SRC}: ${DEV_SRC}
ifneq (${TESTSET},${DEVSET})
mkdir -p ${dir $@}
rm -f ${TEST_SRC} ${TEST_TRG}
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz ]; then \
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz \
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRG}.${PRE}.gz \
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz ]; then \
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz \
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRGEXT}.${PRE}.gz \
add-to-test-data; \
else \
for s in ${SRCLANGS}; do \
@ -576,9 +581,9 @@ ifneq (${TESTSET},${DEVSET})
fi
else
mkdir -p ${dir $@}
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz ]; then \
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz \
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRG}.${PRE}.gz \
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz ]; then \
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz \
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRGEXT}.${PRE}.gz \
add-to-test-data; \
elif (( `zcat $<.shuffled.gz | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \
zcat $<.shuffled.gz | cut -f1 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_SRC}; \
@ -714,6 +719,13 @@ add-to-local-mono-data:
### use SRC_CLEANUP_SCRIPTS TRG_CLEANUP_SCRIPTS
## only normalisation
%.norm.gz: %.gz
$(LOAD_MOSES) zcat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/normalize-punctuation.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
%.norm: %.raw
$(LOAD_MOSES) cat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
@ -737,6 +749,13 @@ add-to-local-mono-data:
## minimal pre-processing
%.simple.gz: %.gz
$(LOAD_MOSES) zcat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
$(TOKENIZER)/remove-non-printing-char.perl |\
$(TOKENIZER)/deescape-special-chars.perl |\
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
%.simple: %.raw
$(LOAD_MOSES) cat $< |\
$(TOKENIZER)/replace-unicode-punctuation.perl |\
@ -871,8 +890,8 @@ tokenize-testsets prepare-testsets: ${ALLTEST}
## NEW: always use the same name for the BPE models
## --> avoid overwriting validation/test data with new segmentation models
## if a new data set is used
BPESRCMODEL = ${WORKDIR}/train/opus.src.bpe${SRCBPESIZE:000=}k-model
BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
BPESRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
@ -954,8 +973,8 @@ endif
## NEW: always use the same name for the SPM models
## --> avoid overwriting validation/test data with new segmentation models
## if a new data set is used
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
SPMSRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
SPMTRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
# SPMEXTRA = --split_by_whitespace=false
SPMEXTRA =
@ -1021,14 +1040,14 @@ endif
## sentence piece model trained on monolingual data
SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model
SPMMODEL = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k-model
SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k-model
SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k-model
## vocabulary files created from monolingual data
SPMVOCAB = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k.vocab.yml
SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k.vocab.yml
SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k.vocab.yml
SPMVOCAB = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k.vocab.yml
SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k.vocab.yml
SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k.vocab.yml
.PRECIOUS: ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO} ${SPMVOCAB}

View File

@ -2,6 +2,48 @@
## evaluation tool
## fails on puhti
easse:
git clone https://github.com/feralvam/easse.git
cd $@ && pip install --user .
## do we need this?
text-simplification-evaluation:
git clone git@github.com:facebookresearch/text-simplification-evaluation.git
cd text-simplification-evaluation
pip install -e . --user
pip install --user -r requirements.txt
#---------------------------------------------------------------------
# simplification test set
#---------------------------------------------------------------------
simplification:
git clone https://github.com/cocoxu/simplification.git
testsets/en-en/simplification.en1.gz: simplification
mkdir -p ${dir $@}
cut -f2 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\
${TOKENIZER}/detokenizer.perl -l en | \
gzip -c > $@
testsets/en-en/simplification.en2.gz: simplification
mkdir -p ${dir $@}
cut -f3 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\
${TOKENIZER}/detokenizer.perl -l en | \
gzip -c > $@
simplify-testset: testsets/en-en/simplification.en1.gz testsets/en-en/simplification.en2.gz
#---------------------------------------------------------------------
# data from https://cs.pomona.edu/~dkauchak/simplification/
#---------------------------------------------------------------------
SIMPLEWIKI_DATA1_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v1/
SIMPLEWIKI_DATA2_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v2/
@ -10,6 +52,8 @@ SIMPLEWIKI_DATA2_SENT = sentence-aligned.v2
SIMPLEWIKI_DATA2_DOC = document-aligned.v2
# v1 - standard split
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
mkdir -p ${dir $@}
wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz
@ -23,24 +67,84 @@ ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
${TOKENIZER}/detokenizer.perl -l en < $@/simple.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en2.raw
${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en1.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/normal.%.txt
mkdir -p ${dir $@}
${TOKENIZER}/detokenizer.perl -l en < $< > $@
## v2 - sentence aligned - my split
${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en2.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/simple.%.txt
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}:
mkdir -p ${dir $@}
${TOKENIZER}/detokenizer.perl -l en < $< > $@
wget -O $@.tar.gz ${SIMPLEWIKI_DATA2_URL}/${SIMPLEWIKI_DATA2_SENT}.tar.gz
tar -C ${dir $@} -xzf $@.tar.gz
rm -f $@.tar.gz
cut -f3 $@/normal.aligned | tail -n +10001 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en1.raw
cut -f3 $@/simple.aligned | tail -n +10001 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en2.raw
cut -f3 $@/normal.aligned | head -10000 | tail -5000 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en1.raw
cut -f3 $@/simple.aligned | head -10000 | tail -5000 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en2.raw
cut -f3 $@/normal.aligned | head -5000 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en1.raw
cut -f3 $@/simple.aligned | head -5000 |\
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en2.raw
simplify-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
simplewiki-v1-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
## train a simplification model from simplewiki for English
%-simplify-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
%-simplewiki-v1-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
rm -f ${WORKDIR}/*.submit
${MAKE} DATASET=simplewiki_v1 \
BPEMODELNAME=simplewiki_v1 \
TRAINSET=simplewiki_v1-training \
DEVSET=simplewiki_v1-tuning \
TESTSET=simplewiki_v1-testing \
HELDOUTSIZE=0 \
SRCLANGS=en TRGLANGS=en \
${@:-simplify-english=}
${@:-simplewiki-v1-english=}
%-simplewiki-v2sent-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}
rm -f ${WORKDIR}/*.submit
${MAKE} DATASET=simplewiki_v2_sent \
BPEMODELNAME=simplewiki_v2_sent \
TRAINSET=simplewiki_v2_sent-training \
DEVSET=simplewiki_v2_sent-tuning \
TESTSET=simplewiki_v2_sent-testing \
HELDOUTSIZE=0 \
SRCLANGS=en TRGLANGS=en \
${@:-simplewiki-v2sent-english=}
#---------------------------------------------------------------------
# data from https://github.com/XingxingZhang/dress
#---------------------------------------------------------------------
SIMPLEWIKI_LARGE_URL = https://github.com/louismartin/dress-data/raw/master/data-simplification.tar.bz2
SIMPLEWIKI_LARGE = data-simplification/wikilarge
## download and unpack the WikiLarge simplification data
## (XingxingZhang/dress release), then detokenize the train/tune/test
## splits into parallel en1 (normal) / en2 (simplified) raw files
## NOTE: en1 comes from the *.src files, en2 from the *.dst files
${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}:
	mkdir -p ${dir $@}
	wget -O $@.tar.bz2 ${SIMPLEWIKI_LARGE_URL}
	tar -C ${dir $@} -xf $@.tar.bz2
	rm -f $@.tar.bz2
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.src > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en1.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.dst > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en2.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.src > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en1.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.dst > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en2.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.src > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en1.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.dst > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en2.raw
simplelarge-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}
%-simplewikilarge-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}
rm -f ${WORKDIR}/*.submit
${MAKE} DATASET=simplewiki_large \
BPEMODELNAME=simplewiki_large \
TRAINSET=simplewiki_large-train \
DEVSET=simplewiki_large-tune \
TESTSET=simplewiki_large-test \
HELDOUTSIZE=0 \
SRCLANGS=en TRGLANGS=en \
${@:-simplewikilarge-english=}