OPUS-MT-train/lib/preprocess.mk
2021-05-04 08:49:16 +03:00

246 lines
7.4 KiB
Makefile

# -*-makefile-*-
## clean data
## OLD: apply cleanup script from Moses
## --> this might not be a good idea before subword splitting for languages without spaces
## NEW: do this later after splitting into subword units
##
## TODO:
## - does this affect SentencePiece / BPE models in some negative way?
## - should we increase the length filter when cleaning later? How much?
## - should we apply some other cleanup scripts here to get rid of some messy stuff?
## should we remove zero-width spaces?
## perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
## Build the cleaned, gzipped source AND target side of a bitext in one go.
##
## Steps (identical for both sides, written to the scratch files $@.1 / $@.2):
##   1. delete every character outside tab, LF, CR, U+0020-U+D7FF,
##      U+E000-U+FFFD and U+10000-U+10FFFF (tr with the c+d flags)
##   2. delete U+2060, U+200B and U+FEFF
##   3. replace the entity "&#160;" (optional whitespace between its parts)
##      with a plain space
## The two sides are then pasted together and piped through
## scripts/filter/bitext-match-lang.py (called with -s ${SRC} -t ${TRG};
## presumably it keeps only pairs whose languages match -- confirm in the
## script), split into columns again and compressed.
## An output file that decompresses to zero lines is removed again.
%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
	cat $(word 1,$^) \
	  | perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' \
	  | perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' \
	  | perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' \
	  > $@.1
	cat $(word 2,$^) \
	  | perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' \
	  | perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' \
	  | perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' \
	  > $@.2
	paste $@.1 $@.2 \
	  | scripts/filter/bitext-match-lang.py -s ${SRC} -t ${TRG} \
	  > $@.bitext
	cut -f1 $@.bitext | ${GZIP} -c > $@
	cut -f2 $@.bitext | ${GZIP} -c > $*.clean.${TRGEXT}.gz
	rm -f $@.bitext $@.1 $@.2
	[ `${ZCAT} "$@" | head | wc -l` -gt 0 ] || rm -f $@
	[ `${ZCAT} "$*.clean.${TRGEXT}.gz" | head | wc -l` -gt 0 ] || \
	  rm -f $*.clean.${TRGEXT}.gz
## The target-side file is written by the recipe of %.clean.${SRCEXT}.gz;
## this rule only tells make to build the source side first and then
## prints a confirmation.
%.clean.${TRGEXT}.gz: %.clean.${SRCEXT}.gz
	@echo 'done!'
##----------------------------------------------
## tokenization
##----------------------------------------------
## normalisation for Chinese
## zh_tw: punctuation normalisation only -- tokenizer.perl is not run here.
## $(LOAD_MOSES) is expanded in front of the pipeline (presumably an
## environment/module setup prefix -- confirm where it is defined).
## The final sed strips leading blanks, squeezes every run of blanks into a
## single blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.zh_tw.tok: %.zh_tw.raw
	$(LOAD_MOSES) cat $< |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/normalize-punctuation.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
## zh_cn: punctuation normalisation only -- tokenizer.perl is not run here.
## The final sed strips leading blanks, squeezes every run of blanks into a
## single blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.zh_cn.tok: %.zh_cn.raw
	$(LOAD_MOSES) cat $< |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/normalize-punctuation.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
## zh: punctuation normalisation only -- tokenizer.perl is not run here.
## The final sed strips leading blanks, squeezes every run of blanks into a
## single blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.zh.tok: %.zh.raw
	$(LOAD_MOSES) cat $< |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/normalize-punctuation.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
## generic target for tokenization
## Normalises punctuation and then runs the Moses tokenizer (-a, with
## $(THREADS) threads). The language passed via -l is derived from the
## prerequisite name: drop ".raw", split on ".", delete the characters
## "1" and "2", and take the last remaining word.
## The final sed strips leading blanks, squeezes every run of blanks into a
## single blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters, which would
## undo the tokenization.
%.tok: %.raw
	$(LOAD_MOSES) cat $< |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/normalize-punctuation.perl \
	    -l ${lastword ${subst 1,,${subst 2,,${subst ., ,$(<:.raw=)}}}} |\
	  $(TOKENIZER)/tokenizer.perl -a -threads $(THREADS) \
	    -l ${lastword ${subst 1,,${subst 2,,${subst ., ,$(<:.raw=)}}}} |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
### TODO: make language-specific pre-processing ....
### use SRC_CLEANUP_SCRIPTS TRG_CLEANUP_SCRIPTS
## only normalisation
## Compressed variant: decompress the input, normalise punctuation and
## whitespace, and compress the result again.
## The sed strips leading blanks, squeezes every run of blanks into a single
## blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.norm.gz: %.gz
	$(LOAD_MOSES) ${GZIP} -cd < $< |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/normalize-punctuation.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' | ${GZIP} -c > $@
## Normalise punctuation and whitespace of an uncompressed .raw file.
## The sed strips leading blanks, squeezes every run of blanks into a single
## blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.norm: %.raw
	$(LOAD_MOSES) cat $< |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/normalize-punctuation.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
## Source-side normalisation. ${SRC_CLEANUP_SCRIPTS} is expanded directly
## after "cat $<" (presumably empty or a list of extra "| script" pipeline
## stages -- confirm where it is defined).
## The sed strips leading blanks, squeezes every run of blanks into a single
## blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.${SRCEXT}.norm: %.${SRCEXT}.raw
	$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/normalize-punctuation.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
## Target-side normalisation. ${TRG_CLEANUP_SCRIPTS} is expanded directly
## after "cat $<" (presumably empty or a list of extra "| script" pipeline
## stages -- confirm where it is defined).
## The sed strips leading blanks, squeezes every run of blanks into a single
## blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.${TRGEXT}.norm: %.${TRGEXT}.raw
	$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/normalize-punctuation.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
## minimal pre-processing
## Compressed variant: decompress, replace unicode punctuation, remove
## non-printing characters, de-escape special characters, normalise
## whitespace and compress again (no normalize-punctuation.perl here).
## The sed strips leading blanks, squeezes every run of blanks into a single
## blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.simple.gz: %.gz
	$(LOAD_MOSES) ${GZIP} -cd < $< |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/deescape-special-chars.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' | ${GZIP} -c > $@
## Minimal pre-processing of an uncompressed .raw file: replace unicode
## punctuation, remove non-printing characters, de-escape special characters
## and normalise whitespace.
## The sed strips leading blanks, squeezes every run of blanks into a single
## blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.simple: %.raw
	$(LOAD_MOSES) cat $< |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/deescape-special-chars.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
## Source-side minimal pre-processing. ${SRC_CLEANUP_SCRIPTS} is expanded
## directly after "cat $<" (presumably empty or a list of extra "| script"
## pipeline stages -- confirm where it is defined).
## The sed strips leading blanks, squeezes every run of blanks into a single
## blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.${SRCEXT}.simple: %.${SRCEXT}.raw
	$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/deescape-special-chars.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
## Target-side minimal pre-processing. ${TRG_CLEANUP_SCRIPTS} is expanded
## directly after "cat $<" (presumably empty or a list of extra "| script"
## pipeline stages -- confirm where it is defined).
## The sed strips leading blanks, squeezes every run of blanks into a single
## blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string and puts a blank between all characters.
%.${TRGEXT}.simple: %.${TRGEXT}.raw
	$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/deescape-special-chars.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
## remove all spaces (treat everything as a long string)
## After the minimal pre-processing, whitespace is normalised and every
## remaining blank is replaced by the "▁" character, so each line becomes
## one blank-free string.
## The first sed strips leading blanks, squeezes every run of blanks into a
## single blank and strips trailing blanks. NOTE: the squeeze pattern must be
## "two blanks + star" (one or more blanks); with a single blank it matches
## the empty string, puts a blank between all characters, and the second
## sed would then turn each of those into "▁".
%.nospace: %.raw
	$(LOAD_MOSES) cat $< |\
	  $(TOKENIZER)/replace-unicode-punctuation.perl |\
	  $(TOKENIZER)/remove-non-printing-char.perl |\
	  $(TOKENIZER)/deescape-special-chars.perl |\
	  sed 's/^ *//;s/  */ /g;s/ *$$//g' |\
	  sed 's/ /▁/g' > $@
## generic targets to make it possible to work with compressed data
## when running the same pre-processing pipeline
## TODO: does that destroy anything?
## TODO: do we need this?
# %.raw: %.gz
# ${GZIP} -cd < $< > $@
# %.${PRE}.gz: %.${PRE}
# ${GZIP} -c < $< > $@
## the above should avoid having to repeat the pipeline below
# %.norm.gz: %.gz
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/normalize-punctuation.perl |\
# sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
# %.simple.gz: %.gz
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/deescape-special-chars.perl |\
# sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
# %.nospace.gz: %.gz
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
# $(TOKENIZER)/remove-non-printing-char.perl |\
# $(TOKENIZER)/deescape-special-chars.perl |\
# sed 's/^ *//;s/ */ /g;s/ *$$//g' |\
# sed 's/ /▁/g' |\
# ${GZIP} -c > $@
## no further pre-processing
## The source file is renamed to *.src.plain and a symlink with the old name
## is left behind so the original path still resolves.
## The link target is given without its directory part: a symlink's target
## is resolved relative to the directory of the link itself, so using the
## full relative path of $@ would produce a dangling link whenever the file
## lives in a sub-directory. Both names share the same stem and therefore
## the same directory, so the bare file name is correct for relative and
## absolute paths alike.
%.src.plain: %.src
	mv $< $@
	ln -s $(notdir $@) $<
## Target-side counterpart of the "no further pre-processing" step: rename
## the file to *.trg.plain and leave a symlink with the old name behind.
## The link target is given without its directory part: a symlink's target
## is resolved relative to the directory of the link itself, so using the
## full relative path of $@ would produce a dangling link whenever the file
## lives in a sub-directory. Both names share the same stem and therefore
## the same directory.
%.trg.plain: %.trg
	mv $< $@
	ln -s $(notdir $@) $<
## increase max number of tokens to 250
## Both values are passed as the minimum / maximum arguments to the Moses
## clean-corpus-n.perl call in the %.src.clean.${PRE_SRC} rule.
## (TODO: should MIN_NR_TOKENS be 1?)
MIN_NR_TOKENS = 0
MAX_NR_TOKENS = 250
## apply the cleanup script from Moses
## clean-corpus-n.perl expects "<base>.<src>" and "<base>.<trg>" as input and
## writes "<out>.<src>" / "<out>.<trg>", so both pre-processed files are
## temporarily linked as $<.${SRCEXT} / $<.${TRGEXT}, and the outputs are
## renamed afterwards. The target-side result is produced here as well.
## Finally the line count of the cleaned source file is appended to the
## README.md next to the target.
##
## - All four scratch names are removed up front so that leftovers of an
##   interrupted run cannot make "ln -s" fail.
## - Link targets are given without directory: a symlink is resolved
##   relative to the directory of the link, and both prerequisites share
##   the stem (hence the directory) of the link names.
## - printf is used instead of the non-portable "echo -n".
%.src.clean.${PRE_SRC}: %.src.${PRE_SRC} %.trg.${PRE_TRG}
	rm -f $<.${SRCEXT} $<.${TRGEXT} $@.${SRCEXT} $@.${TRGEXT}
	ln -s $(notdir $(word 1,$^)) $<.${SRCEXT}
	ln -s $(notdir $(word 2,$^)) $<.${TRGEXT}
	$(MOSESSCRIPTS)/training/clean-corpus-n.perl $< $(SRCEXT) $(TRGEXT) $@ ${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
	rm -f $<.${SRCEXT} $<.${TRGEXT}
	mv $@.${SRCEXT} $@
	mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
	printf '%s' "* total size (${DATASET}): " >> ${dir $@}README.md
	wc -l < $@ >> ${dir $@}README.md
## The cleaned target-side file is written by the recipe of
## %.src.clean.${PRE_SRC}; this rule only tells make to build the source
## side first and then prints a confirmation.
%.trg.clean.${PRE_TRG}: %.src.clean.${PRE_SRC}
	@echo 'done!'
# tokenize testsets
# Decompress a gzipped test set into a plain .raw file.
testsets/%.raw: testsets/%.gz
	$(GZIP) -dc < $< > $@
# Compress a pre-processed test set (extension ${PRE}) again.
testsets/%.${PRE}.gz: testsets/%.${PRE}
	$(GZIP) -c < $< > $@
# ALLTEST: one "<name>.${PRE}.gz" per file matching testsets/*/*.??.gz.
# Any ".${PRE}" substring is removed from the matched names first and the
# list is sorted (which also de-duplicates it) before the ".${PRE}.gz"
# suffix is put on.
ALLTEST = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard testsets/*/*.??.gz})})

# Both names are commands, not files: declare them phony so that a file or
# directory of the same name can never make them look up to date.
.PHONY: tokenize-testsets prepare-testsets
tokenize-testsets prepare-testsets: ${ALLTEST}