mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
simplification evaluation with BLEU
This commit is contained in:
parent
3f57e4f873
commit
d13a9461f0
8
Makefile
8
Makefile
@ -185,9 +185,9 @@ eval-ensemble: ${WORKDIR}/${TESTSET}.${MODEL}${NR}.${MODELTYPE}.ensemble.${SRC}.
|
||||
## and all tokenized test sets that can be found in that directory
|
||||
TESTSET_HOME = ${PWD}/testsets
|
||||
TESTSET_DIR = ${TESTSET_HOME}/${SRC}-${TRG}
|
||||
TESTSETS = $(sort $(patsubst ${TESTSET_DIR}/%.${SRC}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRC}.gz}))
|
||||
TESTSETS_PRESRC = $(patsubst %,${TESTSET_DIR}/%.${SRC}.${PRE}.gz,${TESTSETS})
|
||||
TESTSETS_PRETRG = $(patsubst %,${TESTSET_DIR}/%.${TRG}.${PRE}.gz,${TESTSETS})
|
||||
TESTSETS = $(sort $(patsubst ${TESTSET_DIR}/%.${SRCEXT}.gz,%,${wildcard ${TESTSET_DIR}/*.${SRCEXT}.gz}))
|
||||
TESTSETS_PRESRC = $(patsubst %,${TESTSET_DIR}/%.${SRCEXT}.${PRE}.gz,${TESTSETS})
|
||||
TESTSETS_PRETRG = $(patsubst %,${TESTSET_DIR}/%.${TRGEXT}.${PRE}.gz,${TESTSETS})
|
||||
|
||||
# TESTSETS_PRESRC = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${SRC}.gz})})
|
||||
# TESTSETS_PRETRG = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard ${TESTSET_DIR}/*.${TRG}.gz})})
|
||||
@ -204,7 +204,7 @@ eval-heldout:
|
||||
${MAKE} TESTSET_HOME=${HELDOUT_DIR} eval-testsets
|
||||
|
||||
%-testsets-langpair: ${TESTSETS_PRESRC} ${TESTSETS_PRETRG}
|
||||
@echo "testsets: ${TESTSET_DIR}/*.${SRC}.gz"
|
||||
@echo "testsets: ${TESTSET_DIR}/*.${SRCEXT}.gz"
|
||||
for t in ${TESTSETS}; do \
|
||||
${MAKE} TESTSET=$$t ${@:-testsets-langpair=}; \
|
||||
done
|
||||
|
@ -184,6 +184,10 @@ ifndef DATASET
|
||||
DATASET = opus
|
||||
endif
|
||||
|
||||
## name prefix of the BPE / SentencePiece model files
## (falls back to opus when the caller does not set it)
ifndef BPEMODELNAME
|
||||
BPEMODELNAME = opus
|
||||
endif
|
||||
|
||||
##-------------------------------------
|
||||
## OLD OLD OLD
|
||||
## name of the data set (and the model)
|
||||
|
@ -99,6 +99,9 @@ endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
ifndef OLDMODELTYPE
|
||||
OLDMODELTYPE=transformer-align
|
||||
endif
|
||||
@ -108,6 +111,8 @@ ifndef NEWMODELTYPE
|
||||
endif
|
||||
|
||||
|
||||
## TODO: this does not seem to work as the config does not match
|
||||
## (optimiser cannot continue to run ....)
|
||||
## move model files to a new name
|
||||
## (useful if using as starting point for another modeltype
|
||||
## for example, continue training without guided alignment)
|
||||
@ -557,9 +562,9 @@ ${TEST_SRC}: ${DEV_SRC}
|
||||
ifneq (${TESTSET},${DEVSET})
|
||||
mkdir -p ${dir $@}
|
||||
rm -f ${TEST_SRC} ${TEST_TRG}
|
||||
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz ]; then \
|
||||
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz \
|
||||
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRG}.${PRE}.gz \
|
||||
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz ]; then \
|
||||
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz \
|
||||
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRGEXT}.${PRE}.gz \
|
||||
add-to-test-data; \
|
||||
else \
|
||||
for s in ${SRCLANGS}; do \
|
||||
@ -576,9 +581,9 @@ ifneq (${TESTSET},${DEVSET})
|
||||
fi
|
||||
else
|
||||
mkdir -p ${dir $@}
|
||||
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz ]; then \
|
||||
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRC}.${PRE}.gz \
|
||||
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRG}.${PRE}.gz \
|
||||
if [ -e ${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz ]; then \
|
||||
${MAKE} CLEAN_TEST_SRC=${TESTSET_DIR}/${TESTSET}.${SRCEXT}.${PRE}.gz \
|
||||
CLEAN_TEST_TRG=${TESTSET_DIR}/${TESTSET}.${TRGEXT}.${PRE}.gz \
|
||||
add-to-test-data; \
|
||||
elif (( `zcat $<.shuffled.gz | wc -l` < $$((${DEVSIZE} + ${TESTSIZE})) )); then \
|
||||
zcat $<.shuffled.gz | cut -f1 | tail -n +$$((${DEVMINSIZE} + 1)) > ${TEST_SRC}; \
|
||||
@ -714,6 +719,13 @@ add-to-local-mono-data:
|
||||
### use SRC_CLEANUP_SCRIPTS TRG_CLEANUP_SCRIPTS
|
||||
|
||||
## only normalisation
|
||||
%.norm.gz: %.gz
|
||||
$(LOAD_MOSES) zcat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
||||
|
||||
%.norm: %.raw
|
||||
$(LOAD_MOSES) cat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
@ -737,6 +749,13 @@ add-to-local-mono-data:
|
||||
|
||||
|
||||
## minimal pre-processing
|
||||
%.simple.gz: %.gz
|
||||
$(LOAD_MOSES) zcat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||
$(TOKENIZER)/deescape-special-chars.perl |\
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
||||
|
||||
%.simple: %.raw
|
||||
$(LOAD_MOSES) cat $< |\
|
||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||
@ -871,8 +890,8 @@ tokenize-testsets prepare-testsets: ${ALLTEST}
|
||||
## NEW: always use the same name for the BPE models
|
||||
## --> avoid overwriting validation/test data with new segmentation models
|
||||
## if a new data set is used
|
||||
BPESRCMODEL = ${WORKDIR}/train/opus.src.bpe${SRCBPESIZE:000=}k-model
|
||||
BPETRGMODEL = ${WORKDIR}/train/opus.trg.bpe${TRGBPESIZE:000=}k-model
|
||||
BPESRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.bpe${SRCBPESIZE:000=}k-model
|
||||
BPETRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.bpe${TRGBPESIZE:000=}k-model
|
||||
|
||||
|
||||
.PRECIOUS: ${BPESRCMODEL} ${BPETRGMODEL}
|
||||
@ -954,8 +973,8 @@ endif
|
||||
## NEW: always use the same name for the SPM models
|
||||
## --> avoid overwriting validation/test data with new segmentation models
|
||||
## if a new data set is used
|
||||
SPMSRCMODEL = ${WORKDIR}/train/opus.src.spm${SRCBPESIZE:000=}k-model
|
||||
SPMTRGMODEL = ${WORKDIR}/train/opus.trg.spm${TRGBPESIZE:000=}k-model
|
||||
SPMSRCMODEL = ${WORKDIR}/train/${BPEMODELNAME}.src.spm${SRCBPESIZE:000=}k-model
|
||||
SPMTRGMODEL = ${WORKDIR}/train/${BPEMODELNAME}.trg.spm${TRGBPESIZE:000=}k-model
|
||||
# SPMEXTRA = --split_by_whitespace=false
|
||||
SPMEXTRA =
|
||||
|
||||
@ -1021,14 +1040,14 @@ endif
|
||||
|
||||
|
||||
## sentence piece model trained on monolingual data
|
||||
SPMMODEL = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k-model
|
||||
SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k-model
|
||||
SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k-model
|
||||
SPMMODEL = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k-model
|
||||
SPMSRCMONO = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k-model
|
||||
SPMTRGMONO = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k-model
|
||||
|
||||
## vocabulary files created from monolingual data
|
||||
SPMVOCAB = ${SPMDIR}/${LANGSTR}/opus.spm${BPESIZE:000=}k.vocab.yml
|
||||
SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/opus.spm${SRCBPESIZE:000=}k.vocab.yml
|
||||
SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/opus.spm${TRGBPESIZE:000=}k.vocab.yml
|
||||
SPMVOCAB = ${SPMDIR}/${LANGSTR}/${BPEMODELNAME}.spm${BPESIZE:000=}k.vocab.yml
|
||||
SPMSRCVOCAB = ${SPMDIR}/${LANGSRCSTR}/${BPEMODELNAME}.spm${SRCBPESIZE:000=}k.vocab.yml
|
||||
SPMTRGVOCAB = ${SPMDIR}/${LANGTRGSTR}/${BPEMODELNAME}.spm${TRGBPESIZE:000=}k.vocab.yml
|
||||
|
||||
.PRECIOUS: ${SPMMODEL} ${SPMSRCMONO} ${SPMTRGMONO} ${SPMVOCAB}
|
||||
|
||||
|
@ -2,6 +2,48 @@
|
||||
|
||||
|
||||
|
||||
## evaluation tool
|
||||
## fails on puhti
|
||||
## fetch the EASSE evaluation toolkit and install it for the current user
## (clone, enter the checkout and install in one chained shell command)
easse:
	git clone https://github.com/feralvam/easse.git && \
	cd $@ && \
	pip install --user .
|
||||
|
||||
## do we need this?
|
||||
## fetch and install the text-simplification-evaluation toolkit
## - cloned over HTTPS like the other clone rules in this file,
##   so no SSH key is required
## - every recipe line runs in its own shell: a stand-alone `cd` has no
##   effect on the following lines, so each pip command is chained to
##   the cd and runs inside the checkout
text-simplification-evaluation:
	git clone https://github.com/facebookresearch/text-simplification-evaluation.git
	cd $@ && pip install -e . --user
	cd $@ && pip install --user -r requirements.txt
|
||||
|
||||
|
||||
|
||||
#---------------------------------------------------------------------
|
||||
# simplification test set
|
||||
#---------------------------------------------------------------------
|
||||
|
||||
## clone the repository that contains the turkcorpus test data
simplification:
	git clone https://github.com/cocoxu/simplification.git

## test set files extracted from the turkcorpus test TSV:
## field 2 becomes the en1 file, field 3 the en2 file; both are
## detokenized and gzipped.
## The checkout is an order-only prerequisite (after the |): it has to
## exist, but a changed directory timestamp does not trigger a rebuild.
testsets/en-en/simplification.en1.gz: | simplification
	mkdir -p ${dir $@}
	cut -f2 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\
	${TOKENIZER}/detokenizer.perl -l en | \
	gzip -c > $@

testsets/en-en/simplification.en2.gz: | simplification
	mkdir -p ${dir $@}
	cut -f3 simplification/data/turkcorpus/truecased/test.8turkers.organized.tsv |\
	${TOKENIZER}/detokenizer.perl -l en | \
	gzip -c > $@

## convenience target that builds both test set files
## (declared phony: it is a command name, not a file)
.PHONY: simplify-testset
simplify-testset: testsets/en-en/simplification.en1.gz testsets/en-en/simplification.en2.gz
|
||||
|
||||
|
||||
|
||||
#---------------------------------------------------------------------
|
||||
# data from https://cs.pomona.edu/~dkauchak/simplification/
|
||||
#---------------------------------------------------------------------
|
||||
|
||||
SIMPLEWIKI_DATA1_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v1/
|
||||
SIMPLEWIKI_DATA2_URL = https://cs.pomona.edu/~dkauchak/simplification/data.v2/
|
||||
|
||||
@ -10,6 +52,8 @@ SIMPLEWIKI_DATA2_SENT = sentence-aligned.v2
|
||||
SIMPLEWIKI_DATA2_DOC = document-aligned.v2
|
||||
|
||||
|
||||
# v1 - standard split
|
||||
|
||||
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
|
||||
mkdir -p ${dir $@}
|
||||
wget -O $@.tar.gz ${SIMPLEWIKI_DATA1_URL}/${SIMPLEWIKI_DATA1}.tar.gz
|
||||
@ -23,24 +67,84 @@ ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}:
|
||||
${TOKENIZER}/detokenizer.perl -l en < $@/simple.testing.txt > ${DATADIR}/${PRE}/simplewiki_v1-testing.en-en.en2.raw
|
||||
|
||||
|
||||
${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en1.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/normal.%.txt
|
||||
mkdir -p ${dir $@}
|
||||
${TOKENIZER}/detokenizer.perl -l en < $< > $@
|
||||
## v2 - sentence aligned - my split
|
||||
|
||||
${DATADIR}/${PRE}/simplewiki_v1-%.en-en.en2.raw: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}/simple.%.txt
|
||||
${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA2_SENT}:
|
||||
mkdir -p ${dir $@}
|
||||
${TOKENIZER}/detokenizer.perl -l en < $< > $@
|
||||
wget -O $@.tar.gz ${SIMPLEWIKI_DATA2_URL}/${SIMPLEWIKI_DATA2_SENT}.tar.gz
|
||||
tar -C ${dir $@} -xzf $@.tar.gz
|
||||
rm -f $@.tar.gz
|
||||
cut -f3 $@/normal.aligned | tail -n +10001 |\
|
||||
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en1.raw
|
||||
cut -f3 $@/simple.aligned | tail -n +10001 |\
|
||||
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-training.en-en.en2.raw
|
||||
cut -f3 $@/normal.aligned | head -10000 | tail -5000 |\
|
||||
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en1.raw
|
||||
cut -f3 $@/simple.aligned | head -10000 | tail -5000 |\
|
||||
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-tuning.en-en.en2.raw
|
||||
cut -f3 $@/normal.aligned | head -5000 |\
|
||||
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en1.raw
|
||||
cut -f3 $@/simple.aligned | head -5000 |\
|
||||
${TOKENIZER}/detokenizer.perl -l en > ${DATADIR}/${PRE}/simplewiki_v2_sent-testing.en-en.en2.raw
|
||||
|
||||
simplify-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
|
||||
|
||||
simplewiki-v1-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
|
||||
|
||||
## train a simplification model from simplewiki for English
|
||||
|
||||
%-simplify-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
|
||||
%-simplewiki-v1-english: ${WORKHOME}/simplewiki/${SIMPLEWIKI_DATA1}
|
||||
rm -f ${WORKDIR}/*.submit
|
||||
${MAKE} DATASET=simplewiki_v1 \
|
||||
BPEMODELNAME=simplewiki_v1 \
|
||||
TRAINSET=simplewiki_v1-training \
|
||||
DEVSET=simplewiki_v1-tuning \
|
||||
TESTSET=simplewiki_v1-testing \
|
||||
HELDOUTSIZE=0 \
|
||||
SRCLANGS=en TRGLANGS=en \
|
||||
${@:-simplify-english=}
|
||||
${@:-simplewiki-v1-english=}
|
||||
|
||||
## run an arbitrary make target on the sentence-aligned simplewiki v2 data:
##   make <target>-simplewiki-v2sent-english
## The data directory is a prerequisite; old *.submit files in WORKDIR are
## removed before <target> is called recursively with the data set, BPE
## model name and train/dev/test set names of this corpus (en --> en).
%-simplewiki-v2sent-english: $(WORKHOME)/simplewiki/$(SIMPLEWIKI_DATA2_SENT)
	rm -f $(WORKDIR)/*.submit
	$(MAKE) $(@:-simplewiki-v2sent-english=) \
		SRCLANGS=en TRGLANGS=en \
		HELDOUTSIZE=0 \
		DATASET=simplewiki_v2_sent \
		BPEMODELNAME=simplewiki_v2_sent \
		TRAINSET=simplewiki_v2_sent-training \
		DEVSET=simplewiki_v2_sent-tuning \
		TESTSET=simplewiki_v2_sent-testing
|
||||
|
||||
|
||||
#---------------------------------------------------------------------
|
||||
# data from https://github.com/XingxingZhang/dress
|
||||
#---------------------------------------------------------------------
|
||||
|
||||
|
||||
## archive with the simplification data and the sub-directory
## (relative to the simplewiki work directory) that holds wikilarge
SIMPLEWIKI_LARGE_URL := https://github.com/louismartin/dress-data/raw/master/data-simplification.tar.bz2
SIMPLEWIKI_LARGE     := data-simplification/wikilarge
|
||||
|
||||
|
||||
## download the wikilarge archive, unpack it and write detokenized
## raw files for training (train), tuning (valid) and testing (test):
##   *.src --> en1 (source side), *.dst --> en2 (target side)
## The output directory ${DATADIR}/${PRE} is created up front because
## the redirections below would fail if it does not exist yet.
## NOTE(review): the archive is unpacked inside ${dir $@}; confirm that
## this really creates $@ (depends on the top-level directory stored
## in the archive).
${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}:
	mkdir -p ${dir $@} ${DATADIR}/${PRE}
	wget -O $@.tar.bz2 ${SIMPLEWIKI_LARGE_URL}
	tar -C ${dir $@} -xf $@.tar.bz2
	rm -f $@.tar.bz2
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.src > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en1.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.train.dst > ${DATADIR}/${PRE}/simplewiki_large-train.en-en.en2.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.src > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en1.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.valid.dst > ${DATADIR}/${PRE}/simplewiki_large-tune.en-en.en2.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.src > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en1.raw
	${TOKENIZER}/detokenizer.perl -l en < $@/wiki.full.aner.test.dst > ${DATADIR}/${PRE}/simplewiki_large-test.en-en.en2.raw
|
||||
|
||||
## convenience target: fetch and prepare the wikilarge data
## (declared phony: it is a command name, not a file)
.PHONY: simplelarge-english-prepare
simplelarge-english-prepare: ${WORKHOME}/simplewiki/${SIMPLEWIKI_LARGE}
|
||||
|
||||
## run an arbitrary make target on the wikilarge simplification data:
##   make <target>-simplewikilarge-english
## The data directory is a prerequisite; old *.submit files in WORKDIR are
## removed before <target> is called recursively with the data set, BPE
## model name and train/dev/test set names of this corpus (en --> en).
%-simplewikilarge-english: $(WORKHOME)/simplewiki/$(SIMPLEWIKI_LARGE)
	rm -f $(WORKDIR)/*.submit
	$(MAKE) $(@:-simplewikilarge-english=) \
		SRCLANGS=en TRGLANGS=en \
		HELDOUTSIZE=0 \
		DATASET=simplewiki_large \
		BPEMODELNAME=simplewiki_large \
		TRAINSET=simplewiki_large-train \
		DEVSET=simplewiki_large-tune \
		TESTSET=simplewiki_large-test
|
||||
|
Loading…
Reference in New Issue
Block a user