mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-09-11 20:27:19 +03:00
removed punctuation normalisation and added language filter
This commit is contained in:
parent
91576aa3e9
commit
ee8c27e3db
@ -141,7 +141,8 @@ endif
|
|||||||
CONTEXT_SIZE = 100
|
CONTEXT_SIZE = 100
|
||||||
|
|
||||||
## pre-processing type
|
## pre-processing type
|
||||||
PRE = norm
|
# PRE = norm
|
||||||
|
PRE = simple
|
||||||
PRE_SRC = spm${SRCBPESIZE:000=}k
|
PRE_SRC = spm${SRCBPESIZE:000=}k
|
||||||
PRE_TRG = spm${TRGBPESIZE:000=}k
|
PRE_TRG = spm${TRGBPESIZE:000=}k
|
||||||
|
|
||||||
|
@ -236,15 +236,37 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
|
|||||||
# @echo "done!"
|
# @echo "done!"
|
||||||
|
|
||||||
|
|
||||||
%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
|
|
||||||
cat $< |\
|
|
||||||
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
|
|
||||||
gzip -c > $@
|
|
||||||
|
|
||||||
%.clean.${TRGEXT}.gz: %.${TRGEXT}.${PRE}
|
# %.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
|
||||||
cat $< |\
|
# cat $< |\
|
||||||
perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
|
# perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
|
||||||
gzip -c > $@
|
# gzip -c > $@
|
||||||
|
|
||||||
|
# %.clean.${TRGEXT}.gz: %.${TRGEXT}.${PRE}
|
||||||
|
# cat $< |\
|
||||||
|
# perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
|
||||||
|
# gzip -c > $@
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Clean both sides of the bitext and apply the language filter.
##
## 1. delete every character outside the listed code-point ranges
##    (TAB, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD, U+10000-U+10FFFF)
##    from the source ($@.1) and the target ($@.2) file
## 2. paste both sides together and keep only the pairs that
##    bitext-match-lang.py prints for the language pair ${SRC}/${TRG}
## 3. split the surviving pairs again; the target-side file is written
##    as a side effect of building the source-side file
%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
	cat $(firstword $^) |\
	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.1
	cat $(word 2,$^) |\
	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.2
	paste $@.1 $@.2 |\
	python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
	cut -f1 $@.bitext | gzip -c > $@
	cut -f2 $@.bitext | gzip -c > $(patsubst %.clean.${SRCEXT}.gz,%.clean.${TRGEXT}.gz,$@)
	rm -f $@.bitext $@.1 $@.2

# paste $@.${SRCEXT} $@.${TRGEXT} |\
# python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
# cut -f1 $@.bitext > $@
# cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})

## the target side is produced by the source-side recipe above;
## this rule only records the dependency
%.clean.${TRGEXT}.gz: %.clean.${SRCEXT}.gz
	@echo "done!"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -639,6 +661,21 @@ endif
|
|||||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
|
||||||
|
|
||||||
|
|
||||||
|
## minimal pre-processing
## runs the Moses replace-unicode-punctuation and remove-non-printing-char
## scripts (no normalize-punctuation step) and then normalises spaces
%.simple: %.raw
	$(LOAD_MOSES) cat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' > $@
## NOTE: the first sed expression needs TWO spaces before the '*'
## ("one or more spaces"); with a single space it also matches the empty
## string and would insert a space between every pair of characters.
|
||||||
|
|
||||||
|
## minimal pre-processing for gzipped input: same filter chain as for
## plain-text files, reading with zcat and writing gzipped output
%.simple.gz: %.gz
	$(LOAD_MOSES) zcat $< |\
	$(TOKENIZER)/replace-unicode-punctuation.perl |\
	$(TOKENIZER)/remove-non-printing-char.perl |\
	sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
## NOTE: the first sed expression needs TWO spaces before the '*'
## ("one or more spaces"); with a single space it also matches the empty
## string and would insert a space between every pair of characters.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## increase max number of tokens to 250
|
## increase max number of tokens to 250
|
||||||
## (TODO: should MIN_NTOKENS be 1?)
|
## (TODO: should MIN_NTOKENS be 1?)
|
||||||
MIN_NR_TOKENS = 0
|
MIN_NR_TOKENS = 0
|
||||||
@ -654,6 +691,13 @@ MAX_NR_TOKENS = 250
|
|||||||
mv $@.${SRCEXT} $@
|
mv $@.${SRCEXT} $@
|
||||||
mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
|
mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
|
||||||
|
|
||||||
|
# paste $@.${SRCEXT} $@.${TRGEXT} |\
|
||||||
|
# python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
|
||||||
|
# cut -f1 $@.bitext > $@
|
||||||
|
# cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
|
||||||
|
# rm -f $@.${SRCEXT} $@.${TRGEXT} $@.bitext
|
||||||
|
|
||||||
|
|
||||||
%.trg.clean.${PRE_TRG}: %.src.clean.${PRE_SRC}
|
%.trg.clean.${PRE_TRG}: %.src.clean.${PRE_SRC}
|
||||||
@echo "done!"
|
@echo "done!"
|
||||||
|
|
||||||
|
@ -55,14 +55,14 @@ LOADGPU = module load ${GPU_MODULES}
|
|||||||
|
|
||||||
ifeq (${shell hostname},dx6-ibs-p2)
|
ifeq (${shell hostname},dx6-ibs-p2)
|
||||||
APPLHOME = /opt/tools
|
APPLHOME = /opt/tools
|
||||||
WORKHOME = ${shell realpath ${PWD}/work-spm}
|
WORKHOME = ${shell realpath ${PWD}/work-filter}
|
||||||
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||||
MARIAN = ${APPLHOME}/marian/build
|
MARIAN = ${APPLHOME}/marian/build
|
||||||
LOADMODS = echo "nothing to load"
|
LOADMODS = echo "nothing to load"
|
||||||
else ifeq (${shell hostname},dx7-nkiel-4gpu)
|
else ifeq (${shell hostname},dx7-nkiel-4gpu)
|
||||||
APPLHOME = /opt/tools
|
APPLHOME = /opt/tools
|
||||||
WORKHOME = ${shell realpath ${PWD}/work-spm}
|
WORKHOME = ${shell realpath ${PWD}/work-filter}
|
||||||
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||||
MARIAN = ${APPLHOME}/marian/build
|
MARIAN = ${APPLHOME}/marian/build
|
||||||
@ -71,7 +71,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
|
|||||||
DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
|
DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
|
||||||
# APPLHOME = ${USERAPPL}/tools
|
# APPLHOME = ${USERAPPL}/tools
|
||||||
APPLHOME = /proj/memad/tools
|
APPLHOME = /proj/memad/tools
|
||||||
WORKHOME = /wrk/tiedeman/research/Opus-MT/work-spm
|
WORKHOME = /wrk/tiedeman/research/Opus-MT/work-filter
|
||||||
OPUSHOME = /proj/nlpl/data/OPUS
|
OPUSHOME = /proj/nlpl/data/OPUS
|
||||||
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
|
MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
|
||||||
# MARIAN = /proj/nlpl/software/marian/1.2.0
|
# MARIAN = /proj/nlpl/software/marian/1.2.0
|
||||||
@ -83,7 +83,7 @@ else
|
|||||||
CSCPROJECT = project_2001194
|
CSCPROJECT = project_2001194
|
||||||
# CSCPROJECT = project_2000309
|
# CSCPROJECT = project_2000309
|
||||||
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
|
DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
|
||||||
WORKHOME = ${shell realpath ${PWD}/work-spm}
|
WORKHOME = ${shell realpath ${PWD}/work-filter}
|
||||||
APPLHOME = ${HOME}/projappl
|
APPLHOME = ${HOME}/projappl
|
||||||
# OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
|
# OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
|
||||||
OPUSHOME = /projappl/nlpl/data/OPUS
|
OPUSHOME = /projappl/nlpl/data/OPUS
|
||||||
|
@ -140,6 +140,15 @@ endif
|
|||||||
${@:-spm-noalign=}
|
${@:-spm-noalign=}
|
||||||
|
|
||||||
|
|
||||||
|
## sentence-piece models with langid-filtering (new default)
##
## `make <target>-filter` re-runs make for <target> with the work
## directory and pre-processing settings of the language-filtered setup.
## The suffix stripped from $@ must be the suffix of this pattern rule
## (-filter); stripping any other suffix leaves the goal unchanged and
## the recursive make would call this rule again without end.
%-filter:
	${MAKE} WORKHOME=${shell realpath ${PWD}/work-filter} \
		PRE=simple \
		PRE_SRC=spm${SRCBPESIZE:000=}k \
		PRE_TRG=spm${TRGBPESIZE:000=}k \
	${@:-filter=}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## BPE models
|
## BPE models
|
||||||
%-bpe:
|
%-bpe:
|
||||||
|
@ -29,7 +29,7 @@ MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
|||||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||||
|
|
||||||
ifeq (${MODELNAME},)
|
ifeq (${MODELNAME},)
|
||||||
MODELHOME = ../work-spm/models/${LANGPAIR}
|
MODELHOME = ../work-filter/models/${LANGPAIR}
|
||||||
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
|
||||||
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
|
||||||
endif
|
endif
|
||||||
@ -52,8 +52,6 @@ WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
|
|||||||
## all parts of this wiki
|
## all parts of this wiki
|
||||||
PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}}
|
PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}}
|
||||||
|
|
||||||
ttt:
|
|
||||||
echo ${PARTS}
|
|
||||||
|
|
||||||
## don't delete translated text if the process crashes
|
## don't delete translated text if the process crashes
|
||||||
.PRECIOUS: ${WIKI_TRG}
|
.PRECIOUS: ${WIKI_TRG}
|
||||||
@ -104,6 +102,7 @@ focus-wikis:
|
|||||||
${MAKE} SRC=$$l TRG=en all-wikis; \
|
${MAKE} SRC=$$l TRG=en all-wikis; \
|
||||||
done
|
done
|
||||||
|
|
||||||
|
get-data: ${WIKI_JSON}
|
||||||
extract-text: ${WIKI_TXT}
|
extract-text: ${WIKI_TXT}
|
||||||
prepare-model: ${LANGPAIR}/decoder.yml
|
prepare-model: ${LANGPAIR}/decoder.yml
|
||||||
prepare-data: ${WIKI_PRE}
|
prepare-data: ${WIKI_PRE}
|
||||||
@ -222,12 +221,15 @@ ${WIKI_TXT}: ${WIKI_JSON}
|
|||||||
${SENTSPLITTER} |\
|
${SENTSPLITTER} |\
|
||||||
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
$(TOKENIZER)/replace-unicode-punctuation.perl |\
|
||||||
$(TOKENIZER)/remove-non-printing-char.perl |\
|
$(TOKENIZER)/remove-non-printing-char.perl |\
|
||||||
$(TOKENIZER)/normalize-punctuation.perl |\
|
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |\
|
||||||
|
python3 ../mono-match-lang.py -l ${LANGID} |\
|
||||||
split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
|
split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
|
||||||
gzip -f ${patsubst %${PART}.gz,%,$@}*
|
gzip -f ${patsubst %${PART}.gz,%,$@}*
|
||||||
|
|
||||||
|
|
||||||
|
# $(TOKENIZER)/normalize-punctuation.perl |\
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## OLD: without splitting into parts
|
## OLD: without splitting into parts
|
||||||
#
|
#
|
||||||
|
64
bitext-match-lang.py
Executable file
64
bitext-match-lang.py
Executable file
@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#-*-python-*-
|
||||||
|
|
||||||
|
|
||||||
|
import pycld2 as cld2
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
# Command-line interface: one language code per side of the bitext.
parser = argparse.ArgumentParser(description='language filter')
for option_names, fallback in (
    (('-s', '--srclang', '--source-language'), 'en'),
    (('-t', '--trglang', '--target-language'), 'de'),
):
    parser.add_argument(*option_names, type=str, default=fallback,
                        help='accepted language')
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def supported_language(lang):
    """Return True if ``lang`` is a language code listed by CLD2.

    ``cld2.LANGUAGES`` is scanned for an entry whose second element
    equals ``lang``; False is returned when no entry matches.
    """
    return any(entry[1] == lang for entry in cld2.LANGUAGES)
|
||||||
|
|
||||||
|
|
||||||
|
def is_accepted(line, accept, reject):
    """Decide whether ``line`` passes the language filter.

    Args:
        line: text to classify with CLD2.
        accept: language code the text must be detected as. When this
            is empty, the ``reject`` rule is applied instead.
        reject: language code the text must NOT be detected as; only
            consulted when ``accept`` is empty.

    Returns:
        True if the line is accepted, False otherwise.
    """
    is_reliable, _bytes_found, details = cld2.detect(line, bestEffort=True)
    # second field of the first entry in the detection details is
    # compared against language codes ('un' marks an unknown language)
    detected = details[0][1]

    if accept:
        # positive match: the expected language, and CLD2 is confident
        return bool(is_reliable) and detected == accept

    # no positive match possible: accept anything that was recognised
    # at all and is not the language to be rejected
    return detected not in ('un', reject)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# A side whose language CLD2 does not list cannot be matched positively;
# for that side fall back to rejecting text detected as English.
if supported_language(args.srclang):
    srcaccept, srcreject = args.srclang, ''
else:
    # print(args.srclang + " is not supported")
    srcaccept, srcreject = '', 'en'

if supported_language(args.trglang):
    trgaccept, trgreject = args.trglang, ''
else:
    # print(args.trglang + " is not supported")
    trgaccept, trgreject = '', 'en'


# Read "source<TAB>target" lines from stdin and print the first two
# columns of every line whose two sides both pass the language filter.
for line in sys.stdin:
    text = line.rstrip().split("\t")
    if len(text) <= 1:
        continue
    if not (text[0] and text[1]):
        continue
    if (is_accepted(text[0], srcaccept, srcreject)
            and is_accepted(text[1], trgaccept, trgreject)):
        print(text[0] + "\t" + text[1])
|
||||||
|
|
70
mono-match-lang.py
Executable file
70
mono-match-lang.py
Executable file
@ -0,0 +1,70 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#-*-python-*-
|
||||||
|
|
||||||
|
|
||||||
|
import pycld2 as cld2
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
# Command-line interface: the language code of the text read from stdin.
parser = argparse.ArgumentParser(description='language filter')
parser.add_argument(
    '-l', '--lang', '--language',
    default='en',
    type=str,
    help='accepted language',
)
args = parser.parse_args()
|
||||||
|
|
||||||
|
def supported_language(lang):
    """Return True if ``lang`` is a language code listed by CLD2.

    ``cld2.LANGUAGES`` is scanned for an entry whose second element
    equals ``lang``; False is returned when no entry matches.
    """
    return any(entry[1] == lang for entry in cld2.LANGUAGES)
|
||||||
|
|
||||||
|
|
||||||
|
def is_accepted(line, accept, reject):
    """Decide whether ``line`` passes the language filter.

    Args:
        line: text to classify with CLD2.
        accept: language code the text must be detected as. When this
            is empty, the ``reject`` rule is applied instead.
        reject: language code the text must NOT be detected as; only
            consulted when ``accept`` is empty.

    Returns:
        True if the line is accepted, False otherwise.
    """
    is_reliable, _bytes_found, details = cld2.detect(line, bestEffort=True)
    # second field of the first entry in the detection details is
    # compared against language codes ('un' marks an unknown language)
    detected = details[0][1]

    if accept:
        # positive match: the expected language, and CLD2 is confident
        return bool(is_reliable) and detected == accept

    # no positive match possible: accept anything that was recognised
    # at all and is not the language to be rejected
    return detected not in ('un', reject)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# A language CLD2 does not list cannot be matched positively; in that
# case fall back to rejecting text detected as English.
if supported_language(args.lang):
    accept, reject = args.lang, ''
else:
    # print(args.lang + " is not supported")
    accept, reject = '', 'en'


# Copy the lines from stdin that pass the language filter to stdout,
# with trailing whitespace removed.
for line in sys.stdin:
    text = line.rstrip()
    if not is_accepted(text, accept, reject):
        continue
    print(text)
|
||||||
|
|
@ -24,7 +24,8 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
|
|||||||
|
|
||||||
${TOKENIZER}/replace-unicode-punctuation.perl |
|
${TOKENIZER}/replace-unicode-punctuation.perl |
|
||||||
${TOKENIZER}/remove-non-printing-char.perl |
|
${TOKENIZER}/remove-non-printing-char.perl |
|
||||||
${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||||
${SPMENCODE} --model $3 |
|
${SPMENCODE} --model $3 |
|
||||||
sed "s/^/>>$2<< /"
|
sed "s/^/>>$2<< /"
|
||||||
|
|
||||||
|
# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
||||||
|
@ -24,6 +24,7 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
|
|||||||
|
|
||||||
${TOKENIZER}/replace-unicode-punctuation.perl |
|
${TOKENIZER}/replace-unicode-punctuation.perl |
|
||||||
${TOKENIZER}/remove-non-printing-char.perl |
|
${TOKENIZER}/remove-non-printing-char.perl |
|
||||||
${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
|
||||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||||
${SPMENCODE} --model $2
|
${SPMENCODE} --model $2
|
||||||
|
|
||||||
|
# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
||||||
|
Loading…
Reference in New Issue
Block a user