changes to preprocessing

Joerg Tiedemann 2022-01-11 16:10:43 +02:00
parent ed1bde6ac5
commit a08ad41fbd
7 changed files with 196 additions and 60 deletions

View File

@@ -4,6 +4,16 @@
https://github.com/UKPLab/EasyNMT
# Data cleanup
Need better data filtering:
* integrate OpusFilter
* Tatoeba MT challenge data sets are noisy for smaller languages like Breton (and similarity scores are not available for filtering); CC-Matrix etc. is not very good for those languages either
* cleanup script also before subword splitting?
* stronger filters in cleanup script?
* idea: compare character diversity between the two languages and use a threshold to filter sentence pairs (language-specific?); a rough sketch follows below
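
A rough sketch of the character-diversity filter (hypothetical script, not part of this commit): count the unique characters on each side of a sentence pair and drop pairs where the counts diverge by more than some factor (3 below, chosen arbitrarily):

    paste <(zcat train.br.gz) <(zcat train.fr.gz) |
    awk -F'\t' '
      # count unique characters in a string
      # (byte-based unless awk runs in a UTF-8 aware locale)
      function uchars(s,    i, c, n, seen) {
        n = 0
        for (i = 1; i <= length(s); i++) {
          c = substr(s, i, 1)
          if (!(c in seen)) { seen[c] = 1; n++ }
        }
        return n
      }
      {
        a = uchars($1); b = uchars($2)
        # keep the pair only if character diversity is balanced
        if (a > 0 && b > 0 && a <= 3 * b && b <= 3 * a) print
      }' > filtered.tsv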
# more efficient parallelisation
from Bergamot:

View File

@@ -358,9 +358,22 @@ VOCABSIZE ?= $$((${SUBWORD_SRCVOCAB_SIZE} + ${SUBWORD_TRGVOCAB_SIZE} + 1000))
## for document-level models
CONTEXT_SIZE = 100
## pre-processing type
# PRE = norm
PRE = simple
## pre-processing/data-cleanup type
## PRE .......... apply basic normalisation scripts
## CLEAN_TYPE ... clean = simple noise filtering
## strict = some additional cleanup based on test set stats
## CLEAN_TESTDATA_TYPE should stay as 'clean' because
## we need those data sets to get the parameters
## for the strict mode
PRE = simple
CLEAN_TRAINDATA_TYPE = strict
CLEAN_DEVDATA_TYPE = strict
CLEAN_TESTDATA_TYPE = clean
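## for example, to fall back to plain cleaning for one language pair
## (hypothetical invocation; br-fr is an arbitrary example):
##
##   make SRCLANGS=br TRGLANGS=fr \
##        CLEAN_TRAINDATA_TYPE=clean CLEAN_DEVDATA_TYPE=clean data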
## subword splitting type
PRE_SRC = ${SUBWORDS}${SUBWORD_SRCVOCAB_SIZE:000=}k
PRE_TRG = ${SUBWORDS}${SUBWORD_TRGVOCAB_SIZE:000=}k
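## e.g. SUBWORDS=spm with SUBWORD_SRCVOCAB_SIZE=32000 gives PRE_SRC=spm32k
## (the ':000=' substitution strips the trailing zeros; values are illustrative)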
@@ -426,8 +439,8 @@ TEST_TRG ?= ${WORKDIR}/test/${TESTSET_NAME}.trg
## model basename and optional sub-dir
MODEL_SUBDIR =
MODEL_VARIANT =
# MODEL_SUBDIR =
# MODEL_VARIANT =
MODEL = ${MODEL_SUBDIR}${DATASET}${MODEL_VARIANT}${TRAINSIZE}.${PRE_SRC}-${PRE_TRG}
MODEL_BASENAME = ${MODEL}.${MODELTYPE}.model${NR}
@@ -619,7 +632,8 @@ endif
# load model-specific configuration parameters
# if they exist in the work directory
MODELCONFIG ?= ${MODEL}.${MODELTYPE}.mk
# MODELCONFIG ?= ${MODEL}.${MODELTYPE}.mk
MODELCONFIG = ${DATASET}.${MODELTYPE}.mk
ifneq ($(wildcard ${WORKDIR}/${MODELCONFIG}),)
include ${WORKDIR}/${MODELCONFIG}
endif

View File

@@ -124,16 +124,20 @@ print-datasets:
## data sets to be included in the train/dev/test sets
## with some basic pre-processing (see lib/preprocess.mk)
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TRAINSET}} \
CLEAN_TRAIN_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${SRCEXT}.gz,${TRAINSET}} \
${BACKTRANS_SRC} ${FORWARDTRANS_SRC} ${FORWARDTRANSMONO_SRC} ${PIVOTING_SRC}
CLEAN_TRAIN_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TRAIN_SRC}}
CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${DEVSET}}
CLEAN_DEV_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_DEVDATA_TYPE}.${SRCEXT}.gz,${DEVSET}}
CLEAN_DEV_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_DEV_SRC}}
CLEAN_TEST_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.clean.${SRCEXT}.gz,${TESTSET}}
CLEAN_TEST_SRC = ${patsubst %,${DATADIR}/${PRE}/%.${LANGPAIR}.${CLEAN_TESTDATA_TYPE}.${SRCEXT}.gz,${TESTSET}}
CLEAN_TEST_TRG = ${patsubst %.${SRCEXT}.gz,%.${TRGEXT}.gz,${CLEAN_TEST_SRC}}
CLEAN_TEST_SRC_STATS = ${CLEAN_TEST_SRC:.gz=.stats}
CLEAN_TEST_TRG_STATS = ${CLEAN_TEST_TRG:.gz=.stats}
DATA_SRC := ${sort ${CLEAN_TRAIN_SRC} ${CLEAN_DEV_SRC} ${CLEAN_TEST_SRC}}
DATA_TRG := ${sort ${CLEAN_TRAIN_TRG} ${CLEAN_DEV_TRG} ${CLEAN_TEST_TRG}}
@@ -223,7 +227,10 @@ clean-data rawdata:
done
.PHONY: clean-data-source
clean-data-source: ${DATA_SRC} ${DATA_TRG}
clean-data-source:
${MAKE} ${CLEAN_TEST_SRC} ${CLEAN_TEST_TRG}
${MAKE} ${CLEAN_TEST_SRC_STATS} ${CLEAN_TEST_TRG_STATS}
${MAKE} ${DATA_SRC} ${DATA_TRG}
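## NOTE: test sets and their stats files are built in separate ${MAKE} passes
## first because the strict-cleaning thresholds in lib/preprocess.mk are
## derived from those stats when the makefile is parsed; they can only take
## effect in a sub-make that starts after the stats files exist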
@@ -759,13 +766,13 @@ add-to-local-mono-data:
## get data from local space and compress ...
##----------------------------------------------
${WORKDIR}/%.clean.${PRE_SRC}.gz: ${TMPWORKDIR}/${LANGPAIRSTR}/%.clean.${PRE_SRC}
${WORKDIR}/%.${PRE_SRC}.gz: ${TMPWORKDIR}/${LANGPAIRSTR}/%.${PRE_SRC}
mkdir -p ${dir $@}
${GZIP} -c < $< > $@
-cat ${dir $<}README.md >> ${dir $@}README.md
ifneq (${PRE_SRC},${PRE_TRG})
${WORKDIR}/%.clean.${PRE_TRG}.gz: ${TMPWORKDIR}/${LANGPAIRSTR}/%.clean.${PRE_TRG}
${WORKDIR}/%.${PRE_TRG}.gz: ${TMPWORKDIR}/${LANGPAIRSTR}/%.${PRE_TRG}
mkdir -p ${dir $@}
${GZIP} -c < $< > $@
endif

View File

@@ -1,17 +1,132 @@
# -*-makefile-*-
## clean data
## OLD: apply cleanup script from Moses
## --> this might not be a good idea before subword splitting for languages without spaces
## NEW: do this later after splitting into subword units
## clean-corpus script parameters
##
## TODO:
## - does this affect SentencePiece/BPE models in some negative way?
## - should we increase the length filter when cleaning later? How much?
## - should we apply some other cleanup scripts here to get rid of some messy stuff?
## increase max number of tokens to 250
## (TODO: should MIN_NR_TOKENS be 1?)
MIN_NR_TOKENS = 0
MAX_NR_TOKENS = 250
NR_TOKEN_RATIO = 2
MAX_TOKEN_LENGTH = 100
## default values in the original script:
##
# MAX_TOKEN_LENGTH = 1000
# NR_TOKEN_RATIO = 9
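## with these settings the cleanup call (see the rules further below)
## amounts to something like the following (illustrative file names):
##
##   clean-corpus-n.perl -ratio 2 -max-word-length 100 \
##       corpus src trg corpus.clean 0 250
##
## i.e. keep pairs with at most 250 tokens per side, a length ratio
## of at most 2 and no token longer than 100 characters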
## compute some ratios and thresholds that could be useful for filtering training data
## use test sets for those stats assuming that they are representative and clean
##
## - word-ratio threshold = max ratio between number of words
## - char-ratio threshold = max ratio between number of characters
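##
## worked example (hypothetical counts): with 20000 source words and 8000
## target words the shell evaluates $(( 20000 / 8000 )) = 2 and
## $(( 8000 / 20000 )) = 0 (integer division); the larger value is 2 and
## the word-ratio threshold becomes $(( 2 + 1 )) = 3
## note the double '$$' below: make stores the literal string '$(( ... ))',
## which is evaluated only when the variable is expanded in a shell context
## (a ${shell ...} call or a recipe)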
ifneq ($(wildcard ${CLEAN_TEST_SRC_STATS}),)
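## the four counts below come from line 1 of the stats file;
## lines 2-4 hold one value each (see the %.stats rule further below)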
NR_LINES_RAWSRCTEST = $(word 1,$(shell cat ${CLEAN_TEST_SRC_STATS}))
NR_WORDS_RAWSRCTEST = $(word 2,$(shell cat ${CLEAN_TEST_SRC_STATS}))
NR_CHARS_RAWSRCTEST = $(word 3,$(shell cat ${CLEAN_TEST_SRC_STATS}))
NR_BYTES_RAWSRCTEST = $(word 4,$(shell cat ${CLEAN_TEST_SRC_STATS}))
UNIQUE_CHARS_RAWSRCTEST = $(shell sed -n 2,2p ${CLEAN_TEST_SRC_STATS})
LONGEST_LINE_RAWSRCTEST = $(shell sed -n 3,3p ${CLEAN_TEST_SRC_STATS})
LONGEST_WORD_RAWSRCTEST = $(shell sed -n 4,4p ${CLEAN_TEST_SRC_STATS})
endif
ifneq ($(wildcard ${CLEAN_TEST_TRG_STATS}),)
NR_LINES_RAWTRGTEST = $(word 1,$(shell cat ${CLEAN_TEST_TRG_STATS}))
NR_WORDS_RAWTRGTEST = $(word 2,$(shell cat ${CLEAN_TEST_TRG_STATS}))
NR_CHARS_RAWTRGTEST = $(word 3,$(shell cat ${CLEAN_TEST_TRG_STATS}))
NR_BYTES_RAWTRGTEST = $(word 4,$(shell cat ${CLEAN_TEST_TRG_STATS}))
UNIQUE_CHARS_RAWTRGTEST = $(shell sed -n 2,2p ${CLEAN_TEST_TRG_STATS})
LONGEST_LINE_RAWTRGTEST = $(shell sed -n 3,3p ${CLEAN_TEST_TRG_STATS})
LONGEST_WORD_RAWTRGTEST = $(shell sed -n 4,4p ${CLEAN_TEST_TRG_STATS})
endif
ifdef NR_WORDS_RAWSRCTEST
ifdef NR_WORDS_RAWTRGTEST
WORD_RATIO_SRCTRG_RAWTEST = $$(( ${NR_WORDS_RAWSRCTEST} / ${NR_WORDS_RAWTRGTEST} ))
WORD_RATIO_TRGSRC_RAWTEST = $$(( ${NR_WORDS_RAWTRGTEST} / ${NR_WORDS_RAWSRCTEST} ))
WORD_RATIO_RAWTEST = ${shell printf "%s\n" ${WORD_RATIO_SRCTRG_RAWTEST} ${WORD_RATIO_TRGSRC_RAWTEST} | sort -n | tail -1}
WORD_RATIO_THRESHOLD = $$(( ${WORD_RATIO_RAWTEST} + 1 ))
endif
endif
ifdef NR_CHARS_RAWSRCTEST
ifdef NR_CHARS_RAWTRGTEST
CHAR_RATIO_SRCTRG_RAWTEST = $$(( ${NR_CHARS_RAWSRCTEST} / ${NR_CHARS_RAWTRGTEST} ))
CHAR_RATIO_TRGSRC_RAWTEST = $$(( ${NR_CHARS_RAWTRGTEST} / ${NR_CHARS_RAWSRCTEST} ))
CHAR_RATIO_RAWTEST = ${shell printf "%s\n" ${CHAR_RATIO_SRCTRG_RAWTEST} ${CHAR_RATIO_TRGSRC_RAWTEST} | sort -n | tail -1}
CHAR_RATIO_THRESHOLD = $$(( ${CHAR_RATIO_RAWTEST} + 1 ))
endif
endif
ifdef UNIQUE_CHARS_RAWSRCTEST
ifdef UNIQUE_CHARS_RAWTRGTEST
CHARSET_RATIO_SRCTRG_RAWTEST = $$(( ${UNIQUE_CHARS_RAWSRCTEST} / ${UNIQUE_CHARS_RAWTRGTEST} ))
CHARSET_RATIO_TRGSRC_RAWTEST = $$(( ${UNIQUE_CHARS_RAWTRGTEST} / ${UNIQUE_CHARS_RAWSRCTEST} ))
CHARSET_RATIO_RAWTEST = ${shell printf "%s\n" ${CHARSET_RATIO_SRCTRG_RAWTEST} ${CHARSET_RATIO_TRGSRC_RAWTEST} | sort -n | tail -1}
CHARSET_RATIO_THRESHOLD = $$(( ${CHARSET_RATIO_RAWTEST} + 1 ))
endif
endif
ifdef LONGEST_LINE_RAWSRCTEST
ifdef LONGEST_LINE_RAWTRGTEST
LONGEST_LINE_RAWTEST = ${shell printf "%s\n" ${LONGEST_LINE_RAWSRCTEST} ${LONGEST_LINE_RAWTRGTEST} | sort -n | head -1}
LONGEST_LINE_THRESHOLD = $$(( ${LONGEST_LINE_RAWTEST} * 3 ))
endif
endif
ifdef LONGEST_WORD_RAWSRCTEST
ifdef LONGEST_WORD_RAWTRGTEST
LONGEST_WORD_RAWTEST = ${shell printf "%s\n" ${LONGEST_WORD_RAWSRCTEST} ${LONGEST_WORD_RAWTRGTEST} | sort -n | head -1}
LONGEST_WORD_THRESHOLD = $$(( ${LONGEST_WORD_RAWTEST} * 3 ))
endif
endif
## print thresholds that are computed from
## test set statistics
print_data_thresholds:
@echo ${WORD_RATIO_THRESHOLD}
@echo ${CHAR_RATIO_THRESHOLD}
@echo ${CHARSET_RATIO_THRESHOLD}
@echo ${LONGEST_LINE_RAWTEST}
@echo ${LONGEST_LINE_THRESHOLD}
@echo ${LONGEST_WORD_THRESHOLD}
STRICT_TRAIN_SRC = $(patsubst %.clean.${SRCEXT}.gz,%.strict.${SRCEXT}.gz,${CLEAN_TRAIN_SRC})
strict-clean-data: ${STRICT_TRAIN_SRC}
%.strict.${SRCEXT}.gz: %.clean.${SRCEXT}.gz
ifdef WORD_RATIO_THRESHOLD
$(MOSESSCRIPTS)/training/clean-corpus-n.perl \
-ratio ${WORD_RATIO_THRESHOLD} \
-max-word-length ${LONGEST_WORD_THRESHOLD} \
$(<:.${SRCEXT}.gz=) \
$(SRCEXT) $(TRGEXT) \
$(@:.${SRCEXT}.gz=) \
${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
${GZIP} -f $(@:.gz=) $(@:.${SRCEXT}.gz=.${TRGEXT})
else
-ln -s ${notdir $<} $@
-ln -s ${notdir $(<:.${SRCEXT}.gz=.${TRGEXT}.gz)} $(@:.${SRCEXT}.gz=.${TRGEXT}.gz)
endif
%.strict.${TRGEXT}.gz: %.strict.${SRCEXT}.gz
@echo "done!"
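## hypothetical invocation that materialises the strict training files:
##
##   make SRCLANGS=br TRGLANGS=fr strict-clean-data
##
## the %.strict target-side rule is a no-op because clean-corpus-n.perl
## writes both language sides in one pass (via the source-side rule above)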
## basic data cleanup pipeline
## TODO: integrate OpusFilter
## should we remove zero-width spaces?
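## a possible answer to the zero-width space question (sketch only, not
## wired into the pipeline): strip common invisible characters early, e.g.
##
##   perl -CSD -pe 's/[\x{200B}\x{200C}\x{200D}\x{FEFF}]//g'
##
## (U+200B zero-width space, U+200C/D zero-width (non-)joiner, U+FEFF BOM)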
@@ -41,6 +156,19 @@
## store some file size statistics
## - line 1: nr-of-lines nr-of-words nr-of-characters nr-of-bytes
## - line 2: nr-of-unique-characters
## - line 3: length-of-longest-line
## - line 4: length-of-longest-word
%.stats: %.gz
${ZCAT} $< | wc -lwmc > $@
${ZCAT} $< | sed 's/./& /g' | tr ' ' "\n" | sort -u | wc -l >> $@
${ZCAT} $< | wc -L >> $@
${ZCAT} $< | tr ' ' "\n" | wc -L >> $@
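## for a hypothetical test set of 2500 segments the resulting stats file
## could look like this (values are illustrative):
##
##   2500  52340  298761  312004
##   143
##   512
##   37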
##----------------------------------------------
## tokenization
##----------------------------------------------
@@ -210,17 +338,16 @@
## increase max number of tokens to 250
## (TODO: should MIN_NR_TOKENS be 1?)
MIN_NR_TOKENS = 0
MAX_NR_TOKENS = 250
## apply the cleanup script from Moses
%.src.clean.${PRE_SRC}: %.src.${PRE_SRC} %.trg.${PRE_TRG}
rm -f $<.${SRCEXT} $<.${TRGEXT}
ln -s ${word 1,$^} $<.${SRCEXT}
ln -s ${word 2,$^} $<.${TRGEXT}
$(MOSESSCRIPTS)/training/clean-corpus-n.perl $< $(SRCEXT) $(TRGEXT) $@ ${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
$(MOSESSCRIPTS)/training/clean-corpus-n.perl \
-ratio ${NR_TOKEN_RATIO} \
-max-word-length ${MAX_TOKEN_LENGTH} \
$< $(SRCEXT) $(TRGEXT) $@ ${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
rm -f $<.${SRCEXT} $<.${TRGEXT}
mv $@.${SRCEXT} $@
mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})

View File

@@ -21,12 +21,14 @@ include ${REPOHOME}lib/dist.mk
#------------------------------------------------------------------------
.PHONY: data
data: ${TRAINDATA_SRC} ${TRAINDATA_TRG}
${MAKE} ${DEVDATA_SRC} ${DEVDATA_TRG}
${MAKE} ${TESTDATA_SRC} ${TESTDATA_TRG}
${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
data:
@${MAKE} rawdata
@${MAKE} ${TRAINDATA_SRC} ${TRAINDATA_TRG}
@${MAKE} ${DEVDATA_SRC} ${DEVDATA_TRG}
@${MAKE} ${TESTDATA_SRC} ${TESTDATA_TRG}
@${MAKE} ${MODEL_SRCVOCAB} ${MODEL_TRGVOCAB}
ifeq ($(filter align,${subst -, ,${MODELTYPE}}),align)
${MAKE} ${TRAIN_ALG}
@${MAKE} ${TRAIN_ALG}
endif
traindata: ${TRAINDATA_SRC} ${TRAINDATA_TRG}

View File

@@ -87,8 +87,7 @@ OUTPUT_DIR ?= ${LANGPAIR}
BITEXT_DATADIR = ${PWD}/../work/data/simple
MODEL_WORKDIR = ${PWD}/../work/${LANGPAIR}
BITEXT_SRCRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.clean.${SRC}.gz
BITEXT_SRCPRE = ${wildcard ${MODEL_WORKDIR}/train/opusTC${TATOEBA_VERSION_NOHYPHEN}.src.clean.spm*.gz}
BITEXT_SRCRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${SRC}.gz
BITEXT_BASE = ${OUTPUT_DIR}/Tatoeba-train.${MODELNAME}.${LANGPAIR}
BITEXT_SRC = ${BITEXT_BASE}.${SRC}.${PART}.gz
@@ -231,13 +230,9 @@ else
PREPROCESS_ARGS = ${SRC} ${LANGPAIR}/${MODELNAME}/source.spm
endif
## OLD: check whether we have source files in the work directory
## NEW: make them from scratch from raw bitexts
##
# ifeq (${BITEXT_SRCPRE},)
${BITEXT_SRCRAW}:
${MAKE} -C .. SRCLANGS=${SRC} TRGLANGS=${TRG} clean-data-tatoeba
${MAKE} -C .. SRCLANGS=${SRC} TRGLANGS=${TRG} rawdata-tatoeba
${BITEXT_PRE}: ${BITEXT_SRCRAW}
ifneq (${MODELZIP},)
@@ -252,24 +247,6 @@ ifneq (${MODELZIP},)
endif
## NEW: skip this option
##
# else
# ${BITEXT_PRE}: ${BITEXT_SRCPRE}
# ifneq (${MODELZIP},)
# mkdir -p ${dir $@}
# ${MAKE} ${LANGPAIR}/${MODELNAME}/decoder.yml
# ${GZCAT} $< |\
# sed 's/ //g;s/▁/ /g' | sed 's/^ *//;s/ *$$//' | \
# grep -v '[<>{}]' |\
# ${LANGPAIR}/${MODELNAME}/preprocess.sh ${PREPROCESS_ARGS} |\
# perl -e 'while (<>){next if (split(/\s+/)>${MAX_LENGTH});print;}' |\
# split -l ${SPLIT_SIZE} - ${patsubst %${PART}.gz,%,$@}
# ${GZIP} -f ${patsubst %${PART}.gz,%,$@}??
# endif
# endif

View File

@@ -74,9 +74,8 @@ endif
BITEXT_DATADIR = ${PWD}/../work/data/simple
MODEL_WORKDIR = ${PWD}/../work/${PIVOT}-${TRG}
BITEXT_PIVOTRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.clean.${PIVOT}.gz
BITEXT_TRGRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.clean.${TRG}.gz
BITEXT_PIVOTPRE = ${wildcard ${MODEL_WORKDIR}/train/opusTC${TATOEBA_VERSION_NOHYPHEN}.src.clean.spm*.gz}
BITEXT_PIVOTRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${PIVOT}.gz
BITEXT_TRGRAW = ${BITEXT_DATADIR}/Tatoeba-train-${TATOEBA_VERSION}.${SORTED_LANGPAIR}.${CLEAN_TRAINDATA_TYPE}.${TRG}.gz
BITEXT_BASE = ${OUTPUT_DIR}/Tatoeba-train.${MODELNAME}.${PIVOT}-${SRC}-${TRG}
BITEXT_PIVOT = ${BITEXT_BASE}.${PIVOT}.${PART}.gz