2020-06-03 15:39:18 +03:00
|
|
|
# -*-makefile-*-
|
|
|
|
|
|
|
|
|
2022-03-17 22:02:11 +03:00
|
|
|
## moved to config.mk
|
|
|
|
##
|
|
|
|
# ## clean-corpus script parameters
|
|
|
|
# ## (for filtering subword-segmented bitexts)
|
|
|
|
# ##
|
|
|
|
# ## (TODO: should MIN_NTOKENS be 1?)
|
|
|
|
# # MIN_NR_TOKENS = 0
|
|
|
|
# # MAX_NR_TOKENS = 250
|
|
|
|
# MIN_NR_TOKENS = 1
|
|
|
|
# MAX_NR_TOKENS = 500
|
|
|
|
# NR_TOKEN_RATIO = 2
|
|
|
|
# MAX_TOKEN_LENGTH = 100
|
2022-01-11 17:10:43 +03:00
|
|
|
|
2022-03-17 22:02:11 +03:00
|
|
|
# ## default values in the original script:
|
|
|
|
# ##
|
|
|
|
# # MAX_TOKEN_LENGTH = 1000
|
|
|
|
# # NR_TOKEN_RATIO = 9
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
|
2022-01-11 17:10:43 +03:00
|
|
|
## compute some ratios and thresholds that could be useful for filtering training data
|
|
|
|
## use test sets for those stats assuming that they are representative and clean
|
2020-06-03 15:39:18 +03:00
|
|
|
##
|
2022-01-11 17:10:43 +03:00
|
|
|
## - word-ratio threshold = max ratio between number of words
|
|
|
|
## - char-ratio threshold = max ratio between number of characters
|
|
|
|
|
|
|
|
## Statistics of the source side of the cleaned test set.
## Only defined if the stats file already exists when the makefile is parsed.
## The file is read once here (simple ':=' assignment) instead of forking a
## shell each time one of the variables is expanded.
## Expected layout of the stats file (see the %.stats rule):
##   line 1: nr-of-lines nr-of-words nr-of-characters nr-of-bytes
##   line 2: nr-of-unique-characters
##   line 3: length-of-longest-line
##   line 4: length-of-longest-word
ifneq ($(wildcard ${CLEAN_TEST_SRC_STATS}),)
  ## all whitespace-separated fields of the file; words 1-4 are the counts of line 1
  clean_test_src_stats    := $(shell cat ${CLEAN_TEST_SRC_STATS})
  NR_LINES_RAWSRCTEST     := $(word 1,${clean_test_src_stats})
  NR_WORDS_RAWSRCTEST     := $(word 2,${clean_test_src_stats})
  NR_CHARS_RAWSRCTEST     := $(word 3,${clean_test_src_stats})
  NR_BYTES_RAWSRCTEST     := $(word 4,${clean_test_src_stats})
  UNIQUE_CHARS_RAWSRCTEST := $(shell sed -n 2p ${CLEAN_TEST_SRC_STATS})
  LONGEST_LINE_RAWSRCTEST := $(shell sed -n 3p ${CLEAN_TEST_SRC_STATS})
  LONGEST_WORD_RAWSRCTEST := $(shell sed -n 4p ${CLEAN_TEST_SRC_STATS})
endif
|
|
|
|
|
|
|
|
## Statistics of the target side of the cleaned test set.
## Only defined if the stats file already exists when the makefile is parsed.
## The file is read once here (simple ':=' assignment) instead of forking a
## shell each time one of the variables is expanded.
## Expected layout of the stats file (see the %.stats rule):
##   line 1: nr-of-lines nr-of-words nr-of-characters nr-of-bytes
##   line 2: nr-of-unique-characters
##   line 3: length-of-longest-line
##   line 4: length-of-longest-word
ifneq ($(wildcard ${CLEAN_TEST_TRG_STATS}),)
  ## all whitespace-separated fields of the file; words 1-4 are the counts of line 1
  clean_test_trg_stats    := $(shell cat ${CLEAN_TEST_TRG_STATS})
  NR_LINES_RAWTRGTEST     := $(word 1,${clean_test_trg_stats})
  NR_WORDS_RAWTRGTEST     := $(word 2,${clean_test_trg_stats})
  NR_CHARS_RAWTRGTEST     := $(word 3,${clean_test_trg_stats})
  NR_BYTES_RAWTRGTEST     := $(word 4,${clean_test_trg_stats})
  UNIQUE_CHARS_RAWTRGTEST := $(shell sed -n 2p ${CLEAN_TEST_TRG_STATS})
  LONGEST_LINE_RAWTRGTEST := $(shell sed -n 3p ${CLEAN_TEST_TRG_STATS})
  LONGEST_WORD_RAWTRGTEST := $(shell sed -n 4p ${CLEAN_TEST_TRG_STATS})
endif
|
|
|
|
|
|
|
|
## Word-count ratio between the two sides of the test set.
## The *_SRCTRG_*, *_TRGSRC_* and *_THRESHOLD values are shell arithmetic
## expressions: they only become numbers once a shell evaluates them
## (integer division, +1 on both counts avoids division by zero).
ifdef NR_WORDS_RAWSRCTEST
ifdef NR_WORDS_RAWTRGTEST
  WORD_RATIO_SRCTRG_RAWTEST = $$(( (${NR_WORDS_RAWSRCTEST} + 1) / (${NR_WORDS_RAWTRGTEST} + 1) ))
  WORD_RATIO_TRGSRC_RAWTEST = $$(( (${NR_WORDS_RAWTRGTEST} + 1) / (${NR_WORDS_RAWSRCTEST} + 1) ))
  ## larger of the two directions; ':=' runs the pipeline once instead of
  ## once per expansion of the variable
  WORD_RATIO_RAWTEST       := ${shell printf "%s\n" ${WORD_RATIO_SRCTRG_RAWTEST} ${WORD_RATIO_TRGSRC_RAWTEST} | sort -nr | head -n 1}
  WORD_RATIO_THRESHOLD      = $$(( ${WORD_RATIO_RAWTEST} + 1 ))
endif
endif
|
|
|
|
|
|
|
|
## Character-count ratio between the two sides of the test set.
## The *_SRCTRG_*, *_TRGSRC_* and *_THRESHOLD values are shell arithmetic
## expressions: they only become numbers once a shell evaluates them
## (integer division, +1 on both counts avoids division by zero).
ifdef NR_CHARS_RAWSRCTEST
ifdef NR_CHARS_RAWTRGTEST
  CHAR_RATIO_SRCTRG_RAWTEST = $$(( (${NR_CHARS_RAWSRCTEST} + 1) / (${NR_CHARS_RAWTRGTEST} + 1) ))
  CHAR_RATIO_TRGSRC_RAWTEST = $$(( (${NR_CHARS_RAWTRGTEST} + 1) / (${NR_CHARS_RAWSRCTEST} + 1) ))
  ## larger of the two directions; ':=' runs the pipeline once instead of
  ## once per expansion of the variable
  CHAR_RATIO_RAWTEST       := ${shell printf "%s\n" ${CHAR_RATIO_SRCTRG_RAWTEST} ${CHAR_RATIO_TRGSRC_RAWTEST} | sort -nr | head -n 1}
  CHAR_RATIO_THRESHOLD      = $$(( ${CHAR_RATIO_RAWTEST} + 1 ))
endif
endif
|
|
|
|
|
|
|
|
## Ratio between the numbers of unique characters on the two sides of the
## test set.
## The *_SRCTRG_*, *_TRGSRC_* and *_THRESHOLD values are shell arithmetic
## expressions: they only become numbers once a shell evaluates them
## (integer division, +1 on both counts avoids division by zero).
ifdef UNIQUE_CHARS_RAWSRCTEST
ifdef UNIQUE_CHARS_RAWTRGTEST
  CHARSET_RATIO_SRCTRG_RAWTEST = $$(( (${UNIQUE_CHARS_RAWSRCTEST} + 1) / ( ${UNIQUE_CHARS_RAWTRGTEST} + 1) ))
  CHARSET_RATIO_TRGSRC_RAWTEST = $$(( (${UNIQUE_CHARS_RAWTRGTEST} + 1) / ( ${UNIQUE_CHARS_RAWSRCTEST} + 1) ))
  ## larger of the two directions; ':=' runs the pipeline once instead of
  ## once per expansion of the variable
  CHARSET_RATIO_RAWTEST       := ${shell printf "%s\n" ${CHARSET_RATIO_SRCTRG_RAWTEST} ${CHARSET_RATIO_TRGSRC_RAWTEST} | sort -nr | head -n 1}
  CHARSET_RATIO_THRESHOLD      = $$(( ${CHARSET_RATIO_RAWTEST} + 1 ))
endif
endif
|
|
|
|
|
|
|
|
## Line-length threshold: 1 + 4 * (longest line on either side of the test set).
## The threshold itself is a shell arithmetic expression that is evaluated
## by the shell that eventually receives it.
ifdef LONGEST_LINE_RAWSRCTEST
ifdef LONGEST_LINE_RAWTRGTEST
  ## maximum over both sides; ':=' runs the pipeline once instead of
  ## once per expansion of the variable
  LONGEST_LINE_RAWTEST  := ${shell printf "%s\n" ${LONGEST_LINE_RAWSRCTEST} ${LONGEST_LINE_RAWTRGTEST} | sort -nr | head -n 1}
  LONGEST_LINE_THRESHOLD = $$(( 1 + ${LONGEST_LINE_RAWTEST} * 4 ))
endif
endif
|
|
|
|
|
|
|
|
## Word-length threshold: 1 + 4 * (longest word on either side of the test set).
## The threshold itself is a shell arithmetic expression that is evaluated
## by the shell that eventually receives it.
ifdef LONGEST_WORD_RAWSRCTEST
ifdef LONGEST_WORD_RAWTRGTEST
  ## maximum over both sides; ':=' runs the pipeline once instead of
  ## once per expansion of the variable
  LONGEST_WORD_RAWTEST  := ${shell printf "%s\n" ${LONGEST_WORD_RAWSRCTEST} ${LONGEST_WORD_RAWTRGTEST} | sort -nr | head -n 1}
  LONGEST_WORD_THRESHOLD = $$(( 1 + ${LONGEST_WORD_RAWTEST} * 4 ))
endif
endif
|
|
|
|
|
|
|
|
|
|
|
|
## print thresholds that are computed from
|
|
|
|
## test set statistics
|
|
|
|
|
|
|
|
## Show which stats files are used and the filter thresholds derived from
## them (raw counts in parentheses). The thresholds are shell arithmetic
## expressions; the shell evaluates them inside the double-quoted strings.
## Declared phony so that a file with the same name cannot shadow the target.
.PHONY: print_data_thresholds
print_data_thresholds:
	@echo "source stats from ${CLEAN_TEST_SRC_STATS}"
	@echo "target stats from ${CLEAN_TEST_TRG_STATS}"
	@echo "Thresholds:"
	@echo " word ratio: ${WORD_RATIO_THRESHOLD} (${NR_WORDS_RAWSRCTEST},${NR_WORDS_RAWTRGTEST})"
	@echo " char ratio: ${CHAR_RATIO_THRESHOLD} (${NR_CHARS_RAWSRCTEST},${NR_CHARS_RAWTRGTEST})"
	@echo "charset ratio: ${CHARSET_RATIO_THRESHOLD} (${UNIQUE_CHARS_RAWSRCTEST},${UNIQUE_CHARS_RAWTRGTEST})"
	@echo " line length: ${LONGEST_LINE_THRESHOLD} (1 + 4 * max(${LONGEST_LINE_RAWSRCTEST},${LONGEST_LINE_RAWTRGTEST}))"
	@echo " word length: ${LONGEST_WORD_THRESHOLD} (1 + 4 * max(${LONGEST_WORD_RAWSRCTEST},${LONGEST_WORD_RAWTRGTEST}))"
|
2022-01-11 17:10:43 +03:00
|
|
|
|
|
|
|
|
|
|
|
## Strictly filtered counterparts of the cleaned training files
## (*.clean.SRCEXT.gz --> *.strict.SRCEXT.gz).
## Kept as a recursive assignment because CLEAN_TRAIN_SRC is set elsewhere.
STRICT_TRAIN_SRC = $(patsubst %.clean.${SRCEXT}.gz,%.strict.${SRCEXT}.gz,${CLEAN_TRAIN_SRC})

## convenience target (not a file): build all strictly filtered training files
.PHONY: strict-clean-data
strict-clean-data: ${STRICT_TRAIN_SRC}
|
|
|
|
|
|
|
|
## Strict filtering of a cleaned bitext with thresholds taken from the
## test-set statistics.
##
## - with WORD_RATIO_THRESHOLD: run clean-corpus-n.perl on the *.clean pair
##   and gzip both output files (source and target side). The gzip step is
##   chained with '&&' so that a failing filter run makes the recipe fail
##   instead of being hidden by a successful gzip.
## - without thresholds: link the cleaned files to the strict names.
##   The link target is given relative to the link's own directory
##   (both files share the stem's directory), so the links also resolve
##   when the files are addressed through a relative path.
##   Errors are ignored here ('-'), e.g. if the links exist already.
## NOTE(review): the filter is handed the prefix of *.gz files and is
## assumed to read compressed input itself -- confirm against the script.
%.strict.${SRCEXT}.gz: %.clean.${SRCEXT}.gz
ifdef WORD_RATIO_THRESHOLD
	if [ -e $< ]; then \
	  $(MOSESSCRIPTS)/training/clean-corpus-n.perl \
		-ratio ${WORD_RATIO_THRESHOLD} \
		-max-word-length ${LONGEST_WORD_THRESHOLD} \
		$(<:.${SRCEXT}.gz=) \
		$(SRCEXT) $(TRGEXT) \
		$(@:.${SRCEXT}.gz=) \
		${MIN_NR_TOKENS} ${MAX_NR_TOKENS} && \
	  ${GZIP} -f $(@:.gz=) $(@:.${SRCEXT}.gz=.${TRGEXT}); \
	fi
else
	-if [ -e $< ]; then \
	  ln -s $(notdir $<) $@; \
	  ln -s $(notdir $(<:.${SRCEXT}.gz=.${TRGEXT}.gz)) $(@:.${SRCEXT}.gz=.${TRGEXT}.gz); \
	fi
endif
|
|
|
|
|
|
|
|
## The target-side file is written by the recipe of the source-side rule;
## this rule only makes it depend on the source-side file and reports completion.
%.strict.${TRGEXT}.gz: %.strict.${SRCEXT}.gz ; @echo "done!"
|
|
|
|
|
|
|
|
|
2022-02-07 21:55:31 +03:00
|
|
|
## yet another filter
|
|
|
|
|
|
|
|
## Second-stage filtered counterparts of the cleaned training files
## (*.clean.SRCEXT.gz --> *.strict2.SRCEXT.gz).
## Kept as a recursive assignment because CLEAN_TRAIN_SRC is set elsewhere.
STRICT2_TRAIN_SRC = $(patsubst %.clean.${SRCEXT}.gz,%.strict2.${SRCEXT}.gz,${CLEAN_TRAIN_SRC})

## convenience target (not a file): build all second-stage filtered files
.PHONY: strict2-clean-data
strict2-clean-data: ${STRICT2_TRAIN_SRC}
|
|
|
|
|
|
|
|
## Second filter on top of the strictly filtered bitext.
##
## - with CHAR_RATIO_THRESHOLD: run bitext_filter.pl with the character-ratio
##   (-l) and charset-ratio (-c) thresholds on the *.strict file prefix.
##   NOTE(review): input/output naming and compression are handled by the
##   script itself -- confirm against bitext_filter.pl.
## - without thresholds: link the *.strict files to the *.strict2 names.
##   The link target is given relative to the link's own directory
##   (both files share the stem's directory), so the links also resolve
##   when the files are addressed through a relative path.
##   Errors are ignored here ('-'), e.g. if the links exist already.
%.strict2.${SRCEXT}.gz: %.strict.${SRCEXT}.gz
ifdef CHAR_RATIO_THRESHOLD
	if [ -e $< ]; then \
	  $(SCRIPTDIR)/bitext_filter.pl \
		-l ${CHAR_RATIO_THRESHOLD} \
		-c ${CHARSET_RATIO_THRESHOLD} \
		$(SRCEXT) $(TRGEXT) \
		$(<:.${SRCEXT}.gz=) \
		$(@:.${SRCEXT}.gz=); \
	fi
else
	-if [ -e $< ]; then \
	  ln -s $(notdir $<) $@; \
	  ln -s $(notdir $(<:.${SRCEXT}.gz=.${TRGEXT}.gz)) $(@:.${SRCEXT}.gz=.${TRGEXT}.gz); \
	fi
endif
|
|
|
|
|
|
|
|
## No work of its own: the target-side file only depends on the source-side
## file of the same filtering stage; the recipe just reports completion.
%.strict2.${TRGEXT}.gz: %.strict2.${SRCEXT}.gz ; @echo "done!"
|
|
|
|
|
2022-01-11 17:10:43 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## basic data cleanup pipeline
|
|
|
|
## TODO: integrate OpusFilter
|
2020-06-03 15:39:18 +03:00
|
|
|
|
2021-05-04 08:49:16 +03:00
|
|
|
|
|
|
|
## should we remove zero-width spaces?
|
|
|
|
## perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g'
|
|
|
|
|
2020-06-03 15:39:18 +03:00
|
|
|
## Basic clean-up of a bitext (both sides are handled by this one recipe):
##  1. per side: delete characters outside the listed code-point ranges,
##     delete U+2060 / U+200B / U+FEFF and replace "&#160;" by a blank
##     (results in $@.1 for the source and $@.2 for the target side)
##  2. paste both sides and pass them through bitext-match-lang.py
##  3. split the surviving pairs into the gzipped source and target files
##  4. remove temporary files; remove an output file again if it is empty
## $* is the pattern stem, i.e. $*.clean.TRGEXT.gz is the target-side
## counterpart of $@.
%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
	cat $< |\
	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
	perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
	perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.1
	cat $(word 2,$^) |\
	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
	perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
	perl -CS -pe 's/\&\s*\#\s*160\s*\;/ /g' > $@.2
	paste $@.1 $@.2 |\
	$(REPOHOME)scripts/filter/bitext-match-lang.py -s $(SRC) -t $(TRG) > $@.bitext
	cut -f1 $@.bitext | $(GZIP) -c > $@
	cut -f2 $@.bitext | $(GZIP) -c > $*.clean.$(TRGEXT).gz
	rm -f $@.bitext $@.1 $@.2
	if [ ! $$($(ZCAT) "$@" | head | wc -l) -gt 0 ]; then rm -f $@; fi
	if [ ! $$($(ZCAT) "$*.clean.$(TRGEXT).gz" | head | wc -l) -gt 0 ]; then \
	  rm -f $*.clean.$(TRGEXT).gz; \
	fi
|
|
|
|
|
|
|
|
## The target-side file is written by the recipe of the source-side rule;
## this rule only makes it depend on the source-side file and reports completion.
%.clean.${TRGEXT}.gz: %.clean.${SRCEXT}.gz ; @echo "done!"
|
|
|
|
|
|
|
|
|
|
|
|
|
2022-01-11 17:10:43 +03:00
|
|
|
## store some file size statistics
|
2022-02-08 23:22:33 +03:00
|
|
|
## (only create those for files that have at least STATS_MIN_NROFLINES lines)
|
2022-01-11 17:10:43 +03:00
|
|
|
## - line 1: nr-of-lines nr-of-words nr-of-characters nr-of-bytes
|
|
|
|
## - line 2: nr-of-unique-characters
|
|
|
|
## - line 3: length-of-longest-line
|
|
|
|
## - line 4: length-of-longest-word
|
|
|
|
|
2022-02-08 23:22:33 +03:00
|
|
|
## minimum number of lines a file must have to get a stats file
STATS_MIN_NROFLINES ?= 100

## alternatively: check for non-empty gzip files:
## gzip -l tt2.gz | awk 'NR==2 {print $2}'

## Write the four-line stats file for a gzipped text file:
##   line 1: nr-of-lines nr-of-words nr-of-characters nr-of-bytes (wc -lwmc)
##   line 2: nr-of-unique-characters
##   line 3: length-of-longest-line (wc -L)
##   line 4: length-of-longest-word (wc -L after putting one word per line)
## Nothing is written if the file has fewer than STATS_MIN_NROFLINES lines.
## The individual steps are chained with '&&' so that a failing step makes
## the recipe fail instead of silently leaving an incomplete stats file.
## NOTE(review): the line-count test uses ZCAT, the statistics use GZCAT;
## both variables are expected to be defined elsewhere.
%.stats: %.gz
	if [ `${ZCAT} $< | head -n ${STATS_MIN_NROFLINES} | wc -l` -eq ${STATS_MIN_NROFLINES} ]; then \
	  echo ".... create some stats for $<"; \
	  ${GZCAT} $< | wc -lwmc > $@ && \
	  ${GZCAT} $< | sed 's/./& /g' | tr ' ' "\n" | sort -u | wc -l >> $@ && \
	  ${GZCAT} $< | wc -L >> $@ && \
	  ${GZCAT} $< | tr ' ' "\n" | wc -L >> $@; \
	fi
|
2022-01-11 17:10:43 +03:00
|
|
|
|
|
|
|
|
2020-06-03 15:39:18 +03:00
|
|
|
##----------------------------------------------
|
|
|
|
## tokenization
|
|
|
|
##----------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
## normalisation for Chinese
|
|
|
|
## zh_tw: punctuation and whitespace normalisation only (tokenizer.perl is
## not called here, unlike in the generic %.tok rule).
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.zh_tw.tok: %.zh_tw.raw
	$(LOAD_MOSES) cat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
## zh_cn: punctuation and whitespace normalisation only (tokenizer.perl is
## not called here, unlike in the generic %.tok rule).
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.zh_cn.tok: %.zh_cn.raw
	$(LOAD_MOSES) cat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
## zh: punctuation and whitespace normalisation only (tokenizer.perl is
## not called here, unlike in the generic %.tok rule).
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.zh.tok: %.zh.raw
	$(LOAD_MOSES) cat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
## generic target for tokenization
|
|
|
|
## Generic tokenisation: punctuation normalisation followed by tokenizer.perl.
## The language passed with -l is the last dot-separated component of the
## file name without .raw, with all digits 1 and 2 removed from the name.
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.tok: %.raw
	$(LOAD_MOSES) cat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl \
		-l ${lastword ${subst 1,,${subst 2,,${subst ., ,$(<:.raw=)}}}} |\
	$(TOKENIZER)/tokenizer.perl -a -threads $(THREADS) \
		-l ${lastword ${subst 1,,${subst 2,,${subst ., ,$(<:.raw=)}}}} |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### TODO: make language-specific pre-processing ....
|
|
|
|
### use SRC_CLEANUP_SCRIPTS TRG_CLEANUP_SCRIPTS
|
|
|
|
|
|
|
|
## only normalisation
|
|
|
|
## Normalisation of a gzipped file (gzipped output).
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.norm.gz: %.gz
	$(LOAD_MOSES) ${GZIP} -cd < $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' | ${GZIP} -c > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
## Normalisation of a plain-text file.
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.norm: %.raw
	$(LOAD_MOSES) cat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
## Normalisation of source-language files. SRC_CLEANUP_SCRIPTS is inserted
## right after the input file name, i.e. it has to bring its own pipe
## symbol(s) if it names additional filters.
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.${SRCEXT}.norm: %.${SRCEXT}.raw
	$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
## Normalisation of target-language files. TRG_CLEANUP_SCRIPTS is inserted
## right after the input file name, i.e. it has to bring its own pipe
## symbol(s) if it names additional filters.
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.${TRGEXT}.norm: %.${TRGEXT}.raw
	$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/normalize-punctuation.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
|
2021-12-22 18:31:22 +03:00
|
|
|
## minimal pre-processing (is that the same as norm?)
|
2020-06-03 15:39:18 +03:00
|
|
|
## Minimal pre-processing of a gzipped file (gzipped output): same pipeline
## as %.norm.gz but with deescape-special-chars.perl in place of
## normalize-punctuation.perl.
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.simple.gz: %.gz
	$(LOAD_MOSES) ${GZIP} -cd < $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' | ${GZIP} -c > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
## Minimal pre-processing of a plain-text file: same pipeline as %.norm
## but with deescape-special-chars.perl in place of normalize-punctuation.perl.
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.simple: %.raw
	$(LOAD_MOSES) cat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
## Minimal pre-processing of source-language files. SRC_CLEANUP_SCRIPTS is
## inserted right after the input file name, i.e. it has to bring its own
## pipe symbol(s) if it names additional filters.
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.${SRCEXT}.simple: %.${SRCEXT}.raw
	$(LOAD_MOSES) cat $< ${SRC_CLEANUP_SCRIPTS} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
## Minimal pre-processing of target-language files. TRG_CLEANUP_SCRIPTS is
## inserted right after the input file name, i.e. it has to bring its own
## pipe symbol(s) if it names additional filters.
## The final sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. The middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.${TRGEXT}.simple: %.${TRGEXT}.raw
	$(LOAD_MOSES) cat $< ${TRG_CLEANUP_SCRIPTS} |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## remove all spaces (treat everything as a long string)
|
|
|
|
## Same pipeline as %.simple; afterwards every remaining blank is replaced
## by the character '▁', so that each line becomes one long string.
## The first sed strips leading blanks, squeezes runs of blanks into one
## and strips trailing blanks. Its middle expression needs two blanks
## before '*': with a single one it would also match the empty string and
## put a blank between all characters.
%.nospace: %.raw
	$(LOAD_MOSES) cat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	$(TOKENIZER)/deescape-special-chars.perl |\
	sed 's/^ *//;s/  */ /g;s/ *$$//g' |\
	sed 's/ /▁/g' > $@
|
|
|
|
|
|
|
|
|
|
|
|
## generic targets to make it possible to work with compressed data
|
|
|
|
## when running the same pre-processing pipeline
|
|
|
|
## TODO: does that destroy anything?
|
|
|
|
## TODO: do we need this?
|
|
|
|
|
|
|
|
# %.raw: %.gz
|
|
|
|
# ${GZIP} -cd < $< > $@
|
|
|
|
|
|
|
|
# %.${PRE}.gz: %.${PRE}
|
|
|
|
# ${GZIP} -c < $< > $@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## the above should make it unnecessary to repeat the pipelines below
|
|
|
|
|
|
|
|
# %.norm.gz: %.gz
|
|
|
|
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
|
|
|
|
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
|
|
|
|
# $(TOKENIZER)/remove-non-printing-char.perl |\
|
|
|
|
# $(TOKENIZER)/normalize-punctuation.perl |\
|
2020-09-02 15:52:34 +03:00
|
|
|
# sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
# %.simple.gz: %.gz
|
|
|
|
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
|
|
|
|
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
|
|
|
|
# $(TOKENIZER)/remove-non-printing-char.perl |\
|
|
|
|
# $(TOKENIZER)/deescape-special-chars.perl |\
|
2020-09-02 15:52:34 +03:00
|
|
|
# sed 's/^ *//;s/ */ /g;s/ *$$//g' | ${GZIP} -c > $@
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
# %.nospace.gz: %.gz
|
|
|
|
# $(LOAD_MOSES) ${GZIP} -cd < $< |\
|
|
|
|
# $(TOKENIZER)/replace-unicode-punctuation.perl |\
|
|
|
|
# $(TOKENIZER)/remove-non-printing-char.perl |\
|
|
|
|
# $(TOKENIZER)/deescape-special-chars.perl |\
|
2020-09-02 15:52:34 +03:00
|
|
|
# sed 's/^ *//;s/ */ /g;s/ *$$//g' |\
|
2020-06-03 15:39:18 +03:00
|
|
|
# sed 's/ /▁/g' |\
|
|
|
|
# ${GZIP} -c > $@
|
|
|
|
|
|
|
|
|
|
|
|
|
2020-09-13 00:00:15 +03:00
|
|
|
## no further pre-processing
|
|
|
|
|
|
|
|
## No further pre-processing: the file is renamed to *.src.plain and the
## old name is kept as a symbolic link to it.
## The link target is given without its directory part because a relative
## link target is resolved from the directory of the link itself; both
## names live in the same directory.
%.src.plain: %.src
	mv $< $@
	ln -s $(notdir $@) $<
|
|
|
|
|
|
|
|
## No further pre-processing: the file is renamed to *.trg.plain and the
## old name is kept as a symbolic link to it.
## The link target is given without its directory part because a relative
## link target is resolved from the directory of the link itself; both
## names live in the same directory.
%.trg.plain: %.trg
	mv $< $@
	ln -s $(notdir $@) $<
|
|
|
|
|
2020-06-03 15:39:18 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## apply the cleanup script from Moses
|
|
|
|
## Length/ratio filtering of pre-processed training data with the Moses
## clean-corpus script (writes the source AND the target side).
##
## clean-corpus-n.perl is called with a common file prefix plus the two
## language extensions, so both inputs are temporarily linked to
## <source file>.SRCEXT and <source file>.TRGEXT:
##  - stale links and outputs of an earlier run are removed first
##    (otherwise 'ln -s' fails on an existing link)
##  - link targets are given without directory part, because a relative
##    link target is resolved from the directory of the link itself and
##    all files involved share the stem's directory
## Afterwards the links are removed, the outputs are moved to their final
## names and the number of remaining lines is appended to README.md in the
## output directory.
%.src.clean.${PRE_SRC}: %.src.${PRE_SRC} %.trg.${PRE_TRG}
	rm -f $<.${SRCEXT} $<.${TRGEXT} $@.${SRCEXT} $@.${TRGEXT}
	ln -s $(notdir $<) $<.${SRCEXT}
	ln -s $(notdir ${word 2,$^}) $<.${TRGEXT}
	$(MOSESSCRIPTS)/training/clean-corpus-n.perl \
		-ratio ${NR_TOKEN_RATIO} \
		-max-word-length ${MAX_TOKEN_LENGTH} \
		$< $(SRCEXT) $(TRGEXT) $@ ${MIN_NR_TOKENS} ${MAX_NR_TOKENS}
	rm -f $<.${SRCEXT} $<.${TRGEXT}
	mv $@.${SRCEXT} $@
	mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
	printf '%s' "* total size (${DATASET}): " >> ${dir $@}README.md
	cat $@ | wc -l >> ${dir $@}README.md
|
|
|
|
|
|
|
|
|
|
|
|
## The target-side file is written by the recipe of the source-side rule;
## this rule only makes it depend on the source-side file and reports completion.
%.trg.clean.${PRE_TRG}: %.src.clean.${PRE_SRC} ; @echo "done!"
|
|
|
|
|
|
|
|
|
|
|
|
# tokenize testsets
|
|
|
|
## unpack a gzipped test-set file into its .raw counterpart
testsets/%.raw: testsets/%.gz
	$(GZIP) -d -c < $< > $@
|
|
|
|
|
|
|
|
## compress a pre-processed test-set file
testsets/%.$(PRE).gz: testsets/%.$(PRE)
	$(GZIP) -c < $< > $@
|
|
|
|
|
|
|
|
## All pre-processed test-set files that can be built: one *.PRE.gz per
## testsets/*/*.??.gz file. Names of already pre-processed files are mapped
## back to their base name first; sort removes the resulting duplicates.
ALLTEST = $(patsubst %.gz,%.${PRE}.gz,${sort $(subst .${PRE},,${wildcard testsets/*/*.??.gz})})

## convenience targets (not files): pre-process all test sets
.PHONY: tokenize-testsets prepare-testsets
tokenize-testsets prepare-testsets: ${ALLTEST}
|
|
|
|
|