removed punctuation normalisation and added language filter

Joerg Tiedemann 2020-02-08 00:19:21 +02:00
parent 91576aa3e9
commit ee8c27e3db
9 changed files with 211 additions and 19 deletions

View File

@@ -141,7 +141,8 @@ endif
 CONTEXT_SIZE = 100
 ## pre-processing type
-PRE = norm
+# PRE = norm
+PRE = simple
 PRE_SRC = spm${SRCBPESIZE:000=}k
 PRE_TRG = spm${TRGBPESIZE:000=}k

View File

@@ -236,15 +236,37 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
 #	@echo "done!"

-%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
-	cat $< |\
-	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
-	gzip -c > $@
-
-%.clean.${TRGEXT}.gz: %.${TRGEXT}.${PRE}
-	cat $< |\
-	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
-	gzip -c > $@
+# %.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
+#	cat $< |\
+#	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
+#	gzip -c > $@
+
+# %.clean.${TRGEXT}.gz: %.${TRGEXT}.${PRE}
+#	cat $< |\
+#	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
+#	gzip -c > $@
+
+%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
+	cat ${word 1,$^} |\
+	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.1
+	cat ${word 2,$^} |\
+	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.2
+	paste $@.1 $@.2 |\
+	python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
+	cut -f1 $@.bitext | gzip -c > $@
+	cut -f2 $@.bitext | gzip -c > $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)
+	rm -f $@.bitext $@.1 $@.2
+#	paste $@.${SRCEXT} $@.${TRGEXT} |\
+#	python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
+#	cut -f1 $@.bitext > $@
+#	cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
+
+%.clean.${TRGEXT}.gz: %.clean.${SRCEXT}.gz
+	@echo "done!"
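The perl one-liner in these rules deletes every character outside the ranges U+0009, U+000A, U+000D, U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF, i.e. stray control characters, surrogates and other non-characters that tend to break downstream tools. A rough Python equivalent, for illustration only (not part of the commit):

    import re
    import sys

    # delete everything NOT in the kept ranges, mirroring perl's tr[...][]cd
    INVALID = re.compile('[^\t\n\r\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]')

    for line in sys.stdin:
        sys.stdout.write(INVALID.sub('', line))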
@@ -639,6 +661,21 @@ endif
 	sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@

+## minimal pre-processing
+
+%.simple: %.raw
+	$(LOAD_MOSES) cat $< |\
+	$(TOKENIZER)/replace-unicode-punctuation.perl |\
+	$(TOKENIZER)/remove-non-printing-char.perl |\
+	sed 's/  */ /g;s/^ *//g;s/ *$$//g' > $@
+
+%.simple.gz: %.gz
+	$(LOAD_MOSES) zcat $< |\
+	$(TOKENIZER)/replace-unicode-punctuation.perl |\
+	$(TOKENIZER)/remove-non-printing-char.perl |\
+	sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@

 ## increase max number of tokens to 250
 ## (TODO: should MIN_NTOKENS be 1?)
 MIN_NR_TOKENS = 0
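Compared to the old 'norm' type, the 'simple' recipes above no longer run Moses' normalize-punctuation.perl, presumably to keep language-specific punctuation (e.g. „…“ or «…») intact for the SentencePiece models; only Unicode punctuation replacement, non-printing-character removal and whitespace cleanup remain. The sed whitespace step corresponds roughly to this (hypothetical) Python:

    import re
    import sys

    for line in sys.stdin:
        # collapse runs of spaces and trim, like sed 's/  */ /g;s/^ *//g;s/ *$//g'
        print(re.sub('  *', ' ', line.rstrip('\n')).strip())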
@ -654,6 +691,13 @@ MAX_NR_TOKENS = 250
mv $@.${SRCEXT} $@
mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
# paste $@.${SRCEXT} $@.${TRGEXT} |\
# python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
# cut -f1 $@.bitext > $@
# cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
# rm -f $@.${SRCEXT} $@.${TRGEXT} $@.bitext
%.trg.clean.${PRE_TRG}: %.src.clean.${PRE_SRC}
@echo "done!"

View File

@@ -55,14 +55,14 @@ LOADGPU = module load ${GPU_MODULES}
 ifeq (${shell hostname},dx6-ibs-p2)
 APPLHOME = /opt/tools
-WORKHOME = ${shell realpath ${PWD}/work-spm}
+WORKHOME = ${shell realpath ${PWD}/work-filter}
 OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
 MOSESHOME = ${APPLHOME}/mosesdecoder
 MARIAN = ${APPLHOME}/marian/build
 LOADMODS = echo "nothing to load"
 else ifeq (${shell hostname},dx7-nkiel-4gpu)
 APPLHOME = /opt/tools
-WORKHOME = ${shell realpath ${PWD}/work-spm}
+WORKHOME = ${shell realpath ${PWD}/work-filter}
 OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
 MOSESHOME = ${APPLHOME}/mosesdecoder
 MARIAN = ${APPLHOME}/marian/build
@@ -71,7 +71,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
 DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
 # APPLHOME = ${USERAPPL}/tools
 APPLHOME = /proj/memad/tools
-WORKHOME = /wrk/tiedeman/research/Opus-MT/work-spm
+WORKHOME = /wrk/tiedeman/research/Opus-MT/work-filter
 OPUSHOME = /proj/nlpl/data/OPUS
 MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
 # MARIAN = /proj/nlpl/software/marian/1.2.0
@@ -83,7 +83,7 @@ else
 CSCPROJECT = project_2001194
 # CSCPROJECT = project_2000309
 DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
-WORKHOME = ${shell realpath ${PWD}/work-spm}
+WORKHOME = ${shell realpath ${PWD}/work-filter}
 APPLHOME = ${HOME}/projappl
 # OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
 OPUSHOME = /projappl/nlpl/data/OPUS

View File

@@ -140,6 +140,15 @@ endif
 	${@:-spm-noalign=}

+## sentence-piece models with langid-filtering (new default)
+
+%-filter:
+	${MAKE} WORKHOME=${shell realpath ${PWD}/work-filter} \
+		PRE=simple \
+		PRE_SRC=spm${SRCBPESIZE:000=}k \
+		PRE_TRG=spm${TRGBPESIZE:000=}k \
+	${@:-filter=}

 ## BPE models

 %-bpe:

View File

@@ -29,7 +29,7 @@ MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
 MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
 ifeq (${MODELNAME},)
-MODELHOME = ../work-spm/models/${LANGPAIR}
+MODELHOME = ../work-filter/models/${LANGPAIR}
 MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
 MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
 endif
@@ -52,8 +52,6 @@ WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
 ## all parts of this wiki
 PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}}

-ttt:
-	echo ${PARTS}

 ## don't delete translated text if the process crashes
 .PRECIOUS: ${WIKI_TRG}
@@ -104,6 +102,7 @@ focus-wikis:
 	  ${MAKE} SRC=$$l TRG=en all-wikis; \
 	done

+get-data: ${WIKI_JSON}
 extract-text: ${WIKI_TXT}
 prepare-model: ${LANGPAIR}/decoder.yml
 prepare-data: ${WIKI_PRE}
@@ -222,12 +221,15 @@ ${WIKI_TXT}: ${WIKI_JSON}
 	${SENTSPLITTER} |\
 	$(TOKENIZER)/replace-unicode-punctuation.perl |\
 	$(TOKENIZER)/remove-non-printing-char.perl |\
-	$(TOKENIZER)/normalize-punctuation.perl |\
 	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
+	python3 ../mono-match-lang.py -l ${LANGID} |\
 	split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
 	gzip -f ${patsubst %${PART}.gz,%,$@}*
+#	$(TOKENIZER)/normalize-punctuation.perl |\

 ## OLD: without splitting into parts
 #

bitext-match-lang.py Executable file
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
#-*-python-*-

import pycld2 as cld2
import sys
import argparse

parser = argparse.ArgumentParser(description='language filter')
parser.add_argument('-s','--srclang','--source-language', type=str, default='en',
                    help='accepted source language')
parser.add_argument('-t','--trglang','--target-language', type=str, default='de',
                    help='accepted target language')
args = parser.parse_args()


def supported_language(lang):
    # cld2.LANGUAGES is a list of (name, code) pairs
    for l in cld2.LANGUAGES:
        if l[1] == lang:
            return True
    return False


def is_accepted(line, accept, reject):
    # isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=accept)
    isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
    if accept:
        # keep only lines reliably detected as the accepted language
        if details[0][1] == accept:
            if isReliable:
                return True
    else:
        # no supported accept language: keep lines detected as anything
        # but the rejected language (and not undetermined)
        if details[0][1] != 'un':
            if details[0][1] != reject:
                return True
    return False


if not supported_language(args.srclang):
    # print(args.srclang + " is not supported")
    srcreject = 'en'
    srcaccept = ''
else:
    srcaccept = args.srclang
    srcreject = ''

if not supported_language(args.trglang):
    # print(args.trglang + " is not supported")
    trgreject = 'en'
    trgaccept = ''
else:
    trgaccept = args.trglang
    trgreject = ''

for line in sys.stdin:
    text = line.rstrip().split("\t")
    if len(text) > 1:
        if text[0] and text[1]:
            if is_accepted(text[0], srcaccept, srcreject):
                if is_accepted(text[1], trgaccept, trgreject):
                    print(text[0] + "\t" + text[1])
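A quick sanity check of the pycld2 call the script is built on (illustrative only; assumes the pycld2 package is installed):

    import pycld2 as cld2

    isReliable, textBytesFound, details = cld2.detect(
        "Das ist ein kurzer deutscher Satz.", bestEffort=True)
    print(isReliable, details[0][1])   # expected output: True de

The script itself reads tab-separated bitext from stdin, as in the Makefile rule above (corpus.de/corpus.en are placeholder file names):

    paste corpus.de corpus.en | python3 bitext-match-lang.py -s de -t en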

mono-match-lang.py Executable file
View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
#-*-python-*-

import pycld2 as cld2
import argparse
import sys

parser = argparse.ArgumentParser(description='language filter')
parser.add_argument('-l','--lang','--language', type=str, default='en',
                    help='accepted language')
args = parser.parse_args()


def supported_language(lang):
    # cld2.LANGUAGES is a list of (name, code) pairs
    for l in cld2.LANGUAGES:
        if l[1] == lang:
            return True
    return False


def is_accepted(line, accept, reject):
    # isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=args.lang)
    isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
    if accept:
        if details[0][1] == accept:
            if isReliable:
                # print("ACCEPT")
                # print(details)
                return True
            # else:
            #     print("REJECT - not reliable", file=sys.stderr)
            #     print(details, file=sys.stderr)
            #     print(line, file=sys.stderr)
        # else:
        #     print("REJECT", file=sys.stderr)
        #     print(details, file=sys.stderr)
        #     print(line, file=sys.stderr)
    else:
        if details[0][1] != 'un':
            if details[0][1] != reject:
                # print("ACCEPT")
                # print(details)
                return True
            # else:
            #     print("REJECT", file=sys.stderr)
            #     print(details, file=sys.stderr)
            #     print(line, file=sys.stderr)
        # else:
        #     print("REJECT", file=sys.stderr)
        #     print(details, file=sys.stderr)
        #     print(line, file=sys.stderr)
    return False


if not supported_language(args.lang):
    # print(args.lang + " is not supported")
    reject = 'en'
    accept = ''
else:
    accept = args.lang
    reject = ''

for line in sys.stdin:
    text = line.rstrip()
    if is_accepted(text, accept, reject):
        print(text)
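The fallback branch matters for languages that cld2 cannot detect: the script then keeps every line that is neither detected as English nor undetermined ('un'). The same decision in isolation, as a minimal sketch ('xx' stands for a hypothetical unsupported language code; assumes pycld2 is installed):

    import pycld2 as cld2

    def keeps(text, lang='xx'):
        supported = any(l[1] == lang for l in cld2.LANGUAGES)
        accept, reject = (lang, '') if supported else ('', 'en')
        isReliable, _, details = cld2.detect(text, bestEffort=True)
        if accept:
            return bool(isReliable and details[0][1] == accept)
        return details[0][1] not in ('un', reject)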

View File

@@ -24,7 +24,8 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
 ${TOKENIZER}/replace-unicode-punctuation.perl |
 ${TOKENIZER}/remove-non-printing-char.perl |
-${TOKENIZER}/normalize-punctuation.perl -l $1 |
+sed 's/  */ /g;s/^ *//g;s/ *$//g' |
 ${SPMENCODE} --model $3 |
 sed "s/^/>>$2<< /"
+# ${TOKENIZER}/normalize-punctuation.perl -l $1 |

View File

@@ -24,6 +24,7 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
 ${TOKENIZER}/replace-unicode-punctuation.perl |
 ${TOKENIZER}/remove-non-printing-char.perl |
-${TOKENIZER}/normalize-punctuation.perl -l $1 |
+sed 's/  */ /g;s/^ *//g;s/ *$//g' |
 ${SPMENCODE} --model $2
+# ${TOKENIZER}/normalize-punctuation.perl -l $1 |