From d13a9461f065be288d599b5ce4cdea2a6a168f09 Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Sun, 1 Mar 2020 00:25:05 +0200 Subject: [PATCH] simplification evaluation with BLEU --- Makefile | 8 ++-- Makefile.config | 4 ++ Makefile.data | 51 +++++++++++++------- Makefile.simplify | 120 ++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 155 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 37e12abb..e6e13fd1 100644 --- a/Makefile +++ b/Makefile @@ -185,9 +185,9 @@ eval-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}. ## and all trokenized test sets that can be found in that directory TESTSET_HOME = ${PWD}/testsets TESTSET_DIR = ${TESTSET_HOME}/${SRC}-${TRG} -TESTSETS = $(sort $(patsubst ${TESTSET_DIR}/%.${SRC}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})) -TESTSETS_PRESRC = $(patsubst %,${TESTSET_DIR}/%.${SRC}.${PRE}.gz,${TESTSETS}) -TESTSETS_PRETRG = $(patsubst %,${TESTSET_DIR}/%.${TRG}.${PRE}.gz,${TESTSETS}) +TESTSETS = $(sort $(patsubst ${TESTSET_DIR}/%.${SRCEXT}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRCEXT}.gz})) +TESTSETS_PRESRC = $(patsubst %,${TESTSET_DIR}/%.${SRCEXT}.${PRE}.gz,${TESTSETS}) +TESTSETS_PRETRG = $(patsubst %,${TESTSET_DIR}/%.${TRGEXT}.${PRE}.gz,${TESTSETS}) # TESTSETS_PRESRC = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})}) # TESTSETS_PRETRG = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${TRG}.gz})}) @@ -204,7 +204,7 @@ eval-heldout: ${MAKE} TESTSET_HOME=${HELDOUT_DIR} eval-testsets %-testsets-langpair: ${TESTSETS_PRESRC} ${TESTSETS_PRETRG} - @echo "testsets: ${TESTSET_DIR}/*.${SRC}.gz" + @echo "testsets: ${TESTSET_DIR}/*.${SRCEXT}.gz" for t in ${TESTSETS}; do \ ${MAKE} TESTSET=$$t ${@:-testsets-langpair=}; \ done diff --git a/Makefile.config b/Makefile.config index 6dfe7140..f3b27da0 100644 --- a/Makefile.config +++ b/Makefile.config @@ -184,6 +184,10 @@ ifndef DATASET DATASET = opus endif +ifndef 
BPEMODELNAME + BPEMODELNAME = opus +endif + ##------------------------------------- ## OLD OLD OLD ## name of the data set (and the model) diff --git a/Makefile.data b/Makefile.data index 1a6fdb5c..e0234f22 100644 --- a/Makefile.data +++ b/Makefile.data @@ -99,6 +99,9 @@ endif endif endif + + + ifndef OLDMODELTYPE OLDMODELTYPE=transformer-align endif @@ -108,6 +111,8 @@ ifndef NEWMODELTYPE endif +## TODO: this does not seem to work as the config does not match +## (optimiser cannot continue to run ....) ## move model files to a new name ## (useful if using as starting point for another modeltyp ## for example, continue training without guided alignment) @@ -557,9 +562,9 @@ ${TEST_SRC}: ${DEV_SRC} ifneq (${TESTSET},${DEVSET}) mkdir -p ${dir $@} rm -f ${TEST_SRC} ${TEST_TRG} - if [ -e ${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz ]; then \ - ${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz \ - CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRG}.${PRE}.gz \ + if [ -e ${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz ]; then \ + ${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz \ + CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRGEXT}.${PRE}.gz \ add-to-test-data; \ else \ for s in ${SRCLANGS}; do \ @@ -576,9 +581,9 @@ ifneq (${TESTSET},${DEVSET}) fi else mkdir -p ${dir $@} - if [ -e ${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz ]; then \ - ${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz \ - CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRG}.${PRE}.gz \ + if [ -e ${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz ]; then \ + ${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz \ + CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRGEXT}.${PRE}.gz \ add-to-test-data; \ elif (( `zcat $<.shuffled.gz | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \ zcat $<.shuffled.gz | cut -f1 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_SRC}; \ @@ -714,6 +719,13 @@ add-to-local-mono-data: ### use SRC_CLEANUP_SCRIPTS TRG_CLEANUP_SCRIPTS ## only 
normalisation +%.norm.gz: %.gz + $(LOAD_MOSES) zcat $< |\ + $(TOKENIZER)/replace-unicode-punctuation.perl |\ + $(TOKENIZER)/remove-non-printing-char.perl |\ + $(TOKENIZER)/normalize-punctuation.perl |\ + sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@ + %.norm: %.raw $(LOAD_MOSES) cat $< |\ $(TOKENIZER)/replace-unicode-punctuation.perl |\ @@ -737,6 +749,13 @@ add-to-local-mono-data: ## minimal pre-processing +%.simple.gz: %.gz + $(LOAD_MOSES) zcat $< |\ + $(TOKENIZER)/replace-unicode-punctuation.perl |\ + $(TOKENIZER)/remove-non-printing-char.perl |\ + $(TOKENIZER)/deescape-special-chars.perl |\ + sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@ + %.simple: %.raw $(LOAD_MOSES) cat $< |\ $(TOKENIZER)/replace-unicode-punctuation.perl |\ @@ -871,8 +890,8 @@ tokenize-testsets prepare-testsets: ${ALLTEST} ## NEW: always use the same name for the BPE models ## --> avoid overwriting validation/test data with new segmentation models ## if a new data set is used -BPESRCMODEL = ${WORKDIR}/train/opus.src.bpe${SRCBPESIZE:000=}k-model -BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model +BPESRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model +BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model .PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL} @@ -954,8 +973,8 @@ endif ## NEW: always use the same name for the SPM models ## --> avoid overwriting validation/test data with new segmentation models ## if a new data set is used -SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model -SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model +SPMSRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model +SPMTRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model # SPMEXTRA = --split_by_whitespace=false SPMEXTRA = @@ -1021,14 +1040,14 @@ endif ## sentence piece model trained on monolingual data -SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model 
-SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model -SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model +SPMMODEL = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k-model +SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k-model +SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k-model ## vocabulary files created from monolingual data -SPMVOCAB = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k.vocab.yml -SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k.vocab.yml -SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k.vocab.yml +SPMVOCAB = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k.vocab.yml +SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k.vocab.yml +SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k.vocab.yml .PRECIOUS: ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO} ${SPMVOCAB} diff --git a/Makefile.simplify b/Makefile.simplify index 30366624..ea39e708 100644 --- a/Makefile.simplify +++ b/Makefile.simplify @@ -2,6 +2,48 @@ +## evaluation tool +## fails on puhti +easse: + git clone https://github.com/feralvam/easse.git + cd $@ && pip install --user . + +## do we need this? (cd and both pip installs are chained so they share one shell) +text-simplification-evaluation: + git clone https://github.com/facebookresearch/text-simplification-evaluation.git + cd $@ && \ + pip install -e . 
--user && \ + pip install --user -r requirements.txt + + + +#--------------------------------------------------------------------- +# simplification test set +#--------------------------------------------------------------------- + +simplification: + git clone https://github.com/cocoxu/simplification.git + +testsets/en-en/simplification.en1.gz: | simplification + mkdir -p ${dir $@} + cut -f2 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\ + ${TOKENIZER}/detokenizer.perl -l en | \ + gzip -c > $@ + +testsets/en-en/simplification.en2.gz: | simplification + mkdir -p ${dir $@} + cut -f3 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\ + ${TOKENIZER}/detokenizer.perl -l en | \ + gzip -c > $@ + +simplify-testset: testsets/en-en/simplification.en1.gz testsets/en-en/simplification.en2.gz + + + +#--------------------------------------------------------------------- +# data from https://cs.pomona.edu/~dkauchak/simplification/ +#--------------------------------------------------------------------- + SIMPLEWIKI_DATA1_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v1/ SIMPLEWIKI_DATA2_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v2/ @@ -10,6 +52,8 @@ SIMPLEWIKI_DATA2_SENT = sentence-aligned.v2 SIMPLEWIKI_DATA2_DOC = document-aligned.v2 +# v1 - standard split + ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}: mkdir -p ${dir $@} wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz @@ -23,24 +67,84 @@ ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}: ${TOKENIZER}/detokenizer.perl -l en < $@/simple.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en2.raw -${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en1.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/normal.%.txt - mkdir -p ${dir $@} - ${TOKENIZER}/detokenizer.perl -l en < $< > $@ +## v2 - sentence aligned - my split -${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en2.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/simple.%.txt 
+${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}: mkdir -p ${dir $@} - ${TOKENIZER}/detokenizer.perl -l en < $< > $@ + wget -O $@.tar.gz ${SIMPLEWIKI_DATA2_URL}/${SIMPLEWIKI_DATA2_SENT}.tar.gz + tar -C ${dir $@} -xzf $@.tar.gz + rm -f $@.tar.gz + cut -f3 $@/normal.aligned | tail -n +10001 |\ + ${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en1.raw + cut -f3 $@/simple.aligned | tail -n +10001 |\ + ${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en2.raw + cut -f3 $@/normal.aligned | head -10000 | tail -5000 |\ + ${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en1.raw + cut -f3 $@/simple.aligned | head -10000 | tail -5000 |\ + ${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en2.raw + cut -f3 $@/normal.aligned | head -5000 |\ + ${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en1.raw + cut -f3 $@/simple.aligned | head -5000 |\ + ${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en2.raw -simplify-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1} + +simplewiki-v1-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1} ## train a simplification model from simplewiki for English -%-simplify-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1} +%-simplewiki-v1-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1} + rm -f ${WORKDIR}/*.submit ${MAKE} DATASET=simplewiki_v1 \ + BPEMODELNAME=simplewiki_v1 \ TRAINSET=simplewiki_v1-training \ DEVSET=simplewiki_v1-tuning \ TESTSET=simplewiki_v1-testing \ HELDOUTSIZE=0 \ SRCLANGS=en TRGLANGS=en \ - ${@:-simplify-english=} + ${@:-simplewiki-v1-english=} +%-simplewiki-v2sent-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT} + rm -f ${WORKDIR}/*.submit + ${MAKE} DATASET=simplewiki_v2_sent \ + BPEMODELNAME=simplewiki_v2_sent \ + TRAINSET=simplewiki_v2_sent-training \ + 
DEVSET=simplewiki_v2_sent-tuning \ + TESTSET=simplewiki_v2_sent-testing \ + HELDOUTSIZE=0 \ + SRCLANGS=en TRGLANGS=en \ + ${@:-simplewiki-v2sent-english=} + + +#--------------------------------------------------------------------- +# data from https://github.com/XingxingZhang/dress (*.src -> en1, *.dst -> en2) +#--------------------------------------------------------------------- + + +SIMPLEWIKI_LARGE_URL = https://github.com/louismartin/dress-data/raw/master/data-simplification.tar.bz2 +SIMPLEWIKI_LARGE = data-simplification/wikilarge + + +${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}: + mkdir -p ${dir $@} + wget -O $@.tar.bz2 ${SIMPLEWIKI_LARGE_URL} + tar -C ${dir $@} -xf $@.tar.bz2 + rm -f $@.tar.bz2 + ${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.src > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en1.raw + ${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.dst > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en2.raw + ${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.src > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en1.raw + ${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.dst > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en2.raw + ${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.src > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en1.raw + ${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.dst > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en2.raw + +simplelarge-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE} + +%-simplewikilarge-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE} + rm -f ${WORKDIR}/*.submit + ${MAKE} DATASET=simplewiki_large \ + BPEMODELNAME=simplewiki_large \ + TRAINSET=simplewiki_large-train \ + DEVSET=simplewiki_large-tune \ + TESTSET=simplewiki_large-test \ + HELDOUTSIZE=0 \ + SRCLANGS=en TRGLANGS=en \ + ${@:-simplewikilarge-english=}