diff --git a/Makefile.config b/Makefile.config index c98d9045..c216249c 100644 --- a/Makefile.config +++ b/Makefile.config @@ -141,7 +141,8 @@ endif CONTEXT_SIZE = 100 ## pre-processing type -PRE = norm +# PRE = norm +PRE = simple PRE_SRC = spm${SRCBPESIZE:000=}k PRE_TRG = spm${TRGBPESIZE:000=}k diff --git a/Makefile.data b/Makefile.data index ac944d33..dc46044d 100644 --- a/Makefile.data +++ b/Makefile.data @@ -236,15 +236,37 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \ # @echo "done!" -%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE} - cat $< |\ - perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\ - gzip -c > $@ -%.clean.${TRGEXT}.gz: %.${TRGEXT}.${PRE} - cat $< |\ - perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\ - gzip -c > $@ +# %.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE} +# cat $< |\ +# perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\ +# gzip -c > $@ + +# %.clean.${TRGEXT}.gz: %.${TRGEXT}.${PRE} +# cat $< |\ +# perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\ +# gzip -c > $@ + + +## clean both sides of a bitext in one go: delete characters outside the tr[] ranges, keep only the sentence pairs that bitext-match-lang.py prints, and write the target file as a side effect of building the source file +%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE} + cat ${word 1,$^} |\ + perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.1 + cat ${word 2,$^} |\ + perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.2 + paste $@.1 $@.2 |\ + python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext + cut -f1 $@.bitext | gzip -c > $@ + cut -f2 $@.bitext | gzip -c > $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz) + rm -f $@.bitext $@.1 $@.2 + +# paste $@.${SRCEXT} $@.${TRGEXT} |\ +# python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext +# cut -f1 $@.bitext > $@ +# cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG}) + +%.clean.${TRGEXT}.gz: 
%.clean.${SRCEXT}.gz + @echo "done!" @@ -639,6 +661,21 @@ endif sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@ +## minimal pre-processing: replace unicode punctuation, remove non-printing characters and clean up spaces with sed (one rule for plain .raw input, one for gzipped input) +%.simple: %.raw + $(LOAD_MOSES) cat $< |\ + $(TOKENIZER)/replace-unicode-punctuation.perl |\ + $(TOKENIZER)/remove-non-printing-char.perl |\ + sed 's/ */ /g;s/^ *//g;s/ *$$//g' > $@ + +%.simple.gz: %.gz + $(LOAD_MOSES) zcat $< |\ + $(TOKENIZER)/replace-unicode-punctuation.perl |\ + $(TOKENIZER)/remove-non-printing-char.perl |\ + sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@ + + + ## increase max number of tokens to 250 ## (TODO: should MIN_NTOKENS be 1?) MIN_NR_TOKENS = 0 @@ -654,6 +691,13 @@ MAX_NR_TOKENS = 250 mv $@.${SRCEXT} $@ mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG}) +# paste $@.${SRCEXT} $@.${TRGEXT} |\ +# python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext +# cut -f1 $@.bitext > $@ +# cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG}) +# rm -f $@.${SRCEXT} $@.${TRGEXT} $@.bitext + + %.trg.clean.${PRE_TRG}: %.src.clean.${PRE_SRC} @echo "done!" 
diff --git a/Makefile.env b/Makefile.env index 111c707a..12c295cd 100644 --- a/Makefile.env +++ b/Makefile.env @@ -55,14 +55,14 @@ LOADGPU = module load ${GPU_MODULES} ifeq (${shell hostname},dx6-ibs-p2) APPLHOME = /opt/tools - WORKHOME = ${shell realpath ${PWD}/work-spm} + WORKHOME = ${shell realpath ${PWD}/work-filter} OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/ MOSESHOME = ${APPLHOME}/mosesdecoder MARIAN = ${APPLHOME}/marian/build LOADMODS = echo "nothing to load" else ifeq (${shell hostname},dx7-nkiel-4gpu) APPLHOME = /opt/tools - WORKHOME = ${shell realpath ${PWD}/work-spm} + WORKHOME = ${shell realpath ${PWD}/work-filter} OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/ MOSESHOME = ${APPLHOME}/mosesdecoder MARIAN = ${APPLHOME}/marian/build @@ -71,7 +71,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),) DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR} # APPLHOME = ${USERAPPL}/tools APPLHOME = /proj/memad/tools - WORKHOME = /wrk/tiedeman/research/Opus-MT/work-spm + WORKHOME = /wrk/tiedeman/research/Opus-MT/work-filter OPUSHOME = /proj/nlpl/data/OPUS MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses # MARIAN = /proj/nlpl/software/marian/1.2.0 @@ -83,7 +83,7 @@ else CSCPROJECT = project_2001194 # CSCPROJECT = project_2000309 DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR} - WORKHOME = ${shell realpath ${PWD}/work-spm} + WORKHOME = ${shell realpath ${PWD}/work-filter} APPLHOME = ${HOME}/projappl # OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS OPUSHOME = /projappl/nlpl/data/OPUS diff --git a/Makefile.generic b/Makefile.generic index fa87fb8b..e0557da1 100644 --- a/Makefile.generic +++ b/Makefile.generic @@ -140,6 +140,15 @@ endif ${@:-spm-noalign=} +## sentence-piece models with langid-filtering (new default): strip the -filter suffix from the goal and re-run make for the remaining target with WORKHOME=work-filter and PRE=simple +%-filter: + ${MAKE} WORKHOME=${shell realpath ${PWD}/work-filter} \ + PRE=simple \ + PRE_SRC=spm${SRCBPESIZE:000=}k \ + PRE_TRG=spm${TRGBPESIZE:000=}k \ + ${@:-filter=} + + ## BPE models %-bpe: diff --git a/backtranslate/Makefile 
b/backtranslate/Makefile index 5723accb..d3b8672f 100644 --- a/backtranslate/Makefile +++ b/backtranslate/Makefile @@ -29,7 +29,7 @@ MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}} MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} ifeq (${MODELNAME},) - MODELHOME = ../work-spm/models/${LANGPAIR} + MODELHOME = ../work-filter/models/${LANGPAIR} MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}} MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}} endif @@ -52,8 +52,6 @@ WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz ## all parts of this wiki PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}} -ttt: - echo ${PARTS} ## don't delete translated text if the process crashes .PRECIOUS: ${WIKI_TRG} @@ -104,6 +102,7 @@ focus-wikis: ${MAKE} SRC=$$l TRG=en all-wikis; \ done +get-data: ${WIKI_JSON} extract-text: ${WIKI_TXT} prepare-model: ${LANGPAIR}/decoder.yml prepare-data: ${WIKI_PRE} @@ -222,12 +221,15 @@ ${WIKI_TXT}: ${WIKI_JSON} ${SENTSPLITTER} |\ $(TOKENIZER)/replace-unicode-punctuation.perl |\ $(TOKENIZER)/remove-non-printing-char.perl |\ - $(TOKENIZER)/normalize-punctuation.perl |\ sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\ + python3 ../mono-match-lang.py -l ${LANGID} |\ split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@} gzip -f ${patsubst %${PART}.gz,%,$@}* +# $(TOKENIZER)/normalize-punctuation.perl |\ + + ## OLD: without splitting into parts # diff --git a/bitext-match-lang.py b/bitext-match-lang.py new file mode 100755 index 00000000..64cec7ed --- /dev/null +++ b/bitext-match-lang.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +#-*-python-*- +# read tab-separated sentence pairs from stdin and print only the pairs whose source and target side both pass the CLD2 language check + +import pycld2 as cld2 +import sys +import argparse + +parser = argparse.ArgumentParser(description='language filter') +parser.add_argument('-s','--srclang','--source-language', type=str, default='en', + help='accepted language') +parser.add_argument('-t','--trglang','--target-language', 
type=str, default='de', + help='accepted language') +args = parser.parse_args() + + +def supported_language(lang): + supported = False + for l in cld2.LANGUAGES: + if l[1] == lang: + return True + return False + +# with `accept` set: True only if details[0][1] equals `accept` and the detection is flagged reliable; with empty `accept`: True unless details[0][1] is 'un' or `reject`; every other path falls through and returns None +def is_accepted(line,accept,reject): + # isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=args.lang) + isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True) + if accept: + if details[0][1] == accept: + if isReliable: + return True + else: + if details[0][1] != 'un': + if details[0][1] != reject: + return True + + +# a language missing from cld2.LANGUAGES cannot be required: for that side nothing is accepted explicitly and only lines detected as 'en' (or 'un') are dropped +if not supported_language(args.srclang): + # print(args.srclang + " is not supported") + srcreject = 'en' + srcaccept = '' +else: + srcaccept = args.srclang + srcreject = '' + +if not supported_language(args.trglang): + # print(args.trglang + " is not supported") + trgreject = 'en' + trgaccept = '' +else: + trgaccept = args.trglang + trgreject = '' + + + +for line in sys.stdin: + text = line.rstrip().split("\t") + if len(text) > 1: + if text[0] and text[1]: + if is_accepted(text[0],srcaccept,srcreject): + if is_accepted(text[1],trgaccept,trgreject): + print(text[0] + "\t" + text[1]) + diff --git a/mono-match-lang.py b/mono-match-lang.py new file mode 100755 index 00000000..a1694b92 --- /dev/null +++ b/mono-match-lang.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +#-*-python-*- +# read lines from stdin and print only those that pass the CLD2 language check for the language given with -l + +import pycld2 as cld2 +import argparse +import sys + + +parser = argparse.ArgumentParser(description='language filter') +parser.add_argument('-l','--lang','--language', type=str, default='en', + help='accepted language') +args = parser.parse_args() + +def supported_language(lang): + supported = False + for l in cld2.LANGUAGES: + if l[1] == lang: + return True + return False + +# with `accept` set: True only if details[0][1] equals `accept` and the detection is flagged reliable; with empty `accept`: True unless details[0][1] is 'un' or `reject`; every other path falls through and returns None +def is_accepted(line,accept,reject): + # isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=args.lang) + isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True) + if accept: + if details[0][1] == accept: + if 
isReliable: + # print("ACCEPT") + # print(details) + return True + # else: + # print("REJECT - not reliable", file=sys.stderr) + # print(details, file=sys.stderr) + # print(line, file=sys.stderr) + # else: + # print("REJECT", file=sys.stderr) + # print(details, file=sys.stderr) + # print(line, file=sys.stderr) + else: + if details[0][1] != 'un': + if details[0][1] != reject: + # print("ACCEPT") + # print(details) + return True + # else: + # print("REJECT", file=sys.stderr) + # print(details, file=sys.stderr) + # print(line, file=sys.stderr) + # else: + # print("REJECT", file=sys.stderr) + # print(details, file=sys.stderr) + # print(line, file=sys.stderr) + + +# a language missing from cld2.LANGUAGES cannot be required: nothing is accepted explicitly and only lines detected as 'en' (or 'un') are dropped +if not supported_language(args.lang): + # print(args.lang + " is not supported") + reject = 'en' + accept = '' +else: + accept = args.lang + reject = '' + + +for line in sys.stdin: + text = line.rstrip() + if is_accepted(text,accept,reject): + print(text) + diff --git a/preprocess-spm-multi-target.sh b/preprocess-spm-multi-target.sh index 920edf47..47e98709 100755 --- a/preprocess-spm-multi-target.sh +++ b/preprocess-spm-multi-target.sh @@ -24,7 +24,8 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer ${TOKENIZER}/replace-unicode-punctuation.perl | ${TOKENIZER}/remove-non-printing-char.perl | -${TOKENIZER}/normalize-punctuation.perl -l $1 | sed 's/ */ /g;s/^ *//g;s/ *$//g' | ${SPMENCODE} --model $3 | sed "s/^/>>$2<< /" + +# ${TOKENIZER}/normalize-punctuation.perl -l $1 | diff --git a/preprocess-spm.sh b/preprocess-spm.sh index 33a477de..bf9a3f75 100755 --- a/preprocess-spm.sh +++ b/preprocess-spm.sh @@ -24,6 +24,7 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer ${TOKENIZER}/replace-unicode-punctuation.perl | ${TOKENIZER}/remove-non-printing-char.perl | -${TOKENIZER}/normalize-punctuation.perl -l $1 | sed 's/ */ /g;s/^ *//g;s/ *$//g' | ${SPMENCODE} --model $2 + +# ${TOKENIZER}/normalize-punctuation.perl -l $1 |