mirror of https://github.com/Helsinki-NLP/OPUS-MT-train.git
removed punctuation normalisation and added language filter
This commit is contained in:
parent 91576aa3e9
commit ee8c27e3db
@@ -141,7 +141,8 @@ endif
 CONTEXT_SIZE = 100
 
 ## pre-processing type
-PRE = norm
+# PRE = norm
+PRE = simple
 PRE_SRC = spm${SRCBPESIZE:000=}k
 PRE_TRG = spm${TRGBPESIZE:000=}k
 
@@ -236,15 +236,37 @@ ${TRAIN_ALG}: ${TRAIN_SRC}.clean.${PRE_SRC}${TRAINSIZE}.gz \
 #	@echo "done!"
 
 
-%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
-	cat $< |\
-	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
-	gzip -c > $@
-
-%.clean.${TRGEXT}.gz: %.${TRGEXT}.${PRE}
-	cat $< |\
-	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
-	gzip -c > $@
+# %.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
+#	cat $< |\
+#	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
+#	gzip -c > $@
+
+# %.clean.${TRGEXT}.gz: %.${TRGEXT}.${PRE}
+#	cat $< |\
+#	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' |\
+#	gzip -c > $@
+
+
+
+%.clean.${SRCEXT}.gz: %.${SRCEXT}.${PRE} %.${TRGEXT}.${PRE}
+	cat ${word 1,$^} |\
+	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.1
+	cat ${word 2,$^} |\
+	perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;' > $@.2
+	paste $@.1 $@.2 |\
+	python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
+	cut -f1 $@.bitext | gzip -c > $@
+	cut -f2 $@.bitext | gzip -c > $(@:.clean.${SRCEXT}.gz=.clean.${TRGEXT}.gz)
+	rm -f $@.bitext $@.1 $@.2
+
+#	paste $@.${SRCEXT} $@.${TRGEXT} |\
+#	python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
+#	cut -f1 $@.bitext > $@
+#	cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
+
+
+%.clean.${TRGEXT}.gz: %.clean.${SRCEXT}.gz
+	@echo "done!"
 
 
 
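Note on the perl stage above: tr[...][]cd with the c (complement) and d (delete) flags removes every code point outside the listed ranges, i.e. everything that is not valid in XML 1.0 (TAB, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD, and the supplementary planes). A rough Python equivalent of that character filter, as a minimal sketch (the script itself is not part of this commit):

#!/usr/bin/env python3
# Hedged sketch: approximates the perl tr[...]cd stage in the rules above.
# Deletes every character outside the XML-1.0-valid ranges.
import re
import sys

INVALID = re.compile(
    '[^\x09\x0a\x0d\x20-\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]')

for line in sys.stdin:
    sys.stdout.write(INVALID.sub('', line))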
@@ -639,6 +661,21 @@ endif
 	sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
 
 
+## minimal pre-processing
+%.simple: %.raw
+	$(LOAD_MOSES) cat $< |\
+	$(TOKENIZER)/replace-unicode-punctuation.perl |\
+	$(TOKENIZER)/remove-non-printing-char.perl |\
+	sed 's/  */ /g;s/^ *//g;s/ *$$//g' > $@
+
+%.simple.gz: %.gz
+	$(LOAD_MOSES) zcat $< |\
+	$(TOKENIZER)/replace-unicode-punctuation.perl |\
+	$(TOKENIZER)/remove-non-printing-char.perl |\
+	sed 's/  */ /g;s/^ *//g;s/ *$$//g' | gzip -c > $@
+
+
+
 ## increase max number of tokens to 250
 ## (TODO: should MIN_NR_TOKENS be 1?)
 MIN_NR_TOKENS = 0
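The sed stage in the new %.simple rules squeezes runs of spaces to a single space and trims both ends of each line (the doubled $$ is just make's escaping of $). A quick Python rendering of that whitespace cleanup, for illustration only:

# Hedged sketch of: sed 's/  */ /g;s/^ *//g;s/ *$//g'
# Squeeze runs of spaces, then strip leading/trailing spaces.
import re
import sys

for line in sys.stdin:
    line = re.sub('  *', ' ', line.rstrip('\n'))
    print(line.strip(' '))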
@@ -654,6 +691,13 @@ MAX_NR_TOKENS = 250
 	mv $@.${SRCEXT} $@
 	mv $@.${TRGEXT} $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
 
+#	paste $@.${SRCEXT} $@.${TRGEXT} |\
+#	python3 bitext-match-lang.py -s ${SRC} -t ${TRG} > $@.bitext
+#	cut -f1 $@.bitext > $@
+#	cut -f2 $@.bitext > $(@:.src.clean.${PRE_SRC}=.trg.clean.${PRE_TRG})
+#	rm -f $@.${SRCEXT} $@.${TRGEXT} $@.bitext
+
+
 %.trg.clean.${PRE_TRG}: %.src.clean.${PRE_SRC}
 	@echo "done!"
 
@ -55,14 +55,14 @@ LOADGPU = module load ${GPU_MODULES}
|
||||
|
||||
ifeq (${shell hostname},dx6-ibs-p2)
|
||||
APPLHOME = /opt/tools
|
||||
WORKHOME = ${shell realpath ${PWD}/work-spm}
|
||||
WORKHOME = ${shell realpath ${PWD}/work-filter}
|
||||
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||
MARIAN = ${APPLHOME}/marian/build
|
||||
LOADMODS = echo "nothing to load"
|
||||
else ifeq (${shell hostname},dx7-nkiel-4gpu)
|
||||
APPLHOME = /opt/tools
|
||||
WORKHOME = ${shell realpath ${PWD}/work-spm}
|
||||
WORKHOME = ${shell realpath ${PWD}/work-filter}
|
||||
OPUSHOME = tiedeman@taito.csc.fi:/proj/nlpl/data/OPUS/
|
||||
MOSESHOME = ${APPLHOME}/mosesdecoder
|
||||
MARIAN = ${APPLHOME}/marian/build
|
||||
@@ -71,7 +71,7 @@ else ifneq ($(wildcard /wrk/tiedeman/research),)
 DATAHOME = /proj/OPUS/WMT19/data/${LANGPAIR}
 # APPLHOME = ${USERAPPL}/tools
 APPLHOME = /proj/memad/tools
-WORKHOME = /wrk/tiedeman/research/Opus-MT/work-spm
+WORKHOME = /wrk/tiedeman/research/Opus-MT/work-filter
 OPUSHOME = /proj/nlpl/data/OPUS
 MOSESHOME = /proj/nlpl/software/moses/4.0-65c75ff/moses
 # MARIAN = /proj/nlpl/software/marian/1.2.0
@@ -83,7 +83,7 @@ else
 CSCPROJECT = project_2001194
 # CSCPROJECT = project_2000309
 DATAHOME = ${HOME}/work/opentrans/data/${LANGPAIR}
-WORKHOME = ${shell realpath ${PWD}/work-spm}
+WORKHOME = ${shell realpath ${PWD}/work-filter}
 APPLHOME = ${HOME}/projappl
 # OPUSHOME = /scratch/project_2000661/nlpl/data/OPUS
 OPUSHOME = /projappl/nlpl/data/OPUS
 
@@ -140,6 +140,15 @@ endif
 		${@:-spm-noalign=}
 
 
+## sentence-piece models with langid-filtering (new default)
+%-filter:
+	${MAKE} WORKHOME=${shell realpath ${PWD}/work-filter} \
+		PRE=simple \
+		PRE_SRC=spm${SRCBPESIZE:000=}k \
+		PRE_TRG=spm${TRGBPESIZE:000=}k \
+	${@:-spm=}
+
+
 ## BPE models
 %-bpe:
 
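The recipe strips a suffix from the goal name with a make substitution reference before recursing. Note that the new %-filter rule still says ${@:-spm=}, apparently carried over from the %-spm rule it was copied from: that expression removes a trailing "-spm", not "-filter". A small Python illustration of the rewrite (the goal names are invented for the example):

# Hedged illustration of make's substitution reference ${@:-spm=},
# which removes a trailing "-spm" from each word of $@ (the target name).
def strip_suffix(target: str, suffix: str) -> str:
    return target[:-len(suffix)] if target.endswith(suffix) else target

print(strip_suffix("de-en-spm", "-spm"))     # -> de-en
print(strip_suffix("de-en-filter", "-spm"))  # -> de-en-filter (unchanged;
                                             # ${@:-filter=} may have been intended)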
@@ -29,7 +29,7 @@ MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
 MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
 
 ifeq (${MODELNAME},)
-MODELHOME = ../work-spm/models/${LANGPAIR}
+MODELHOME = ../work-filter/models/${LANGPAIR}
 MODELZIP = ${lastword ${sort ${wildcard ${MODELHOME}/*-20*.zip}}}
 MODELNAME = ${patsubst %.zip,%,${notdir ${MODELZIP}}}
 endif
@@ -52,8 +52,6 @@ WIKI_TRG = ${LANGPAIR}/${WIKISOURCE}.${PART}_${MODELNAME}.${LANGPAIR}.${TRG}.gz
 ## all parts of this wiki
 PARTS = ${sort ${patsubst ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.%.gz,%,${wildcard ${WIKI_DIR}/${WIKISOURCE}.${LANGID}.*.gz}}}
 
-ttt:
-	echo ${PARTS}
-
 ## don't delete translated text if the process crashes
 .PRECIOUS: ${WIKI_TRG}
@@ -104,6 +102,7 @@ focus-wikis:
 	${MAKE} SRC=$$l TRG=en all-wikis; \
 	done
 
 get-data: ${WIKI_JSON}
 extract-text: ${WIKI_TXT}
 prepare-model: ${LANGPAIR}/decoder.yml
+prepare-data: ${WIKI_PRE}
@@ -222,12 +221,15 @@ ${WIKI_TXT}: ${WIKI_JSON}
 	${SENTSPLITTER} |\
 	$(TOKENIZER)/replace-unicode-punctuation.perl |\
 	$(TOKENIZER)/remove-non-printing-char.perl |\
-	$(TOKENIZER)/normalize-punctuation.perl |\
 	sed 's/  */ /g;s/^ *//g;s/ *$$//g' |\
+	python3 ../mono-match-lang.py -l ${LANGID} |\
 	split -l ${MAX_SENTENCES} - ${patsubst %${PART}.gz,%,$@}
 	gzip -f ${patsubst %${PART}.gz,%,$@}*
 
 
+#	$(TOKENIZER)/normalize-punctuation.perl |\
+
+
 
 ## OLD: without splitting into parts
 #
bitext-match-lang.py (new executable file, 64 lines)
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
#-*-python-*-


import pycld2 as cld2
import sys
import argparse

parser = argparse.ArgumentParser(description='language filter')
parser.add_argument('-s','--srclang','--source-language', type=str, default='en',
                    help='accepted language')
parser.add_argument('-t','--trglang','--target-language', type=str, default='de',
                    help='accepted language')
args = parser.parse_args()


def supported_language(lang):
    supported = False
    for l in cld2.LANGUAGES:
        if l[1] == lang:
            return True
    return False


def is_accepted(line,accept,reject):
    # isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=args.lang)
    isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
    if accept:
        if details[0][1] == accept:
            if isReliable:
                return True
    else:
        if details[0][1] != 'un':
            if details[0][1] != reject:
                return True



if not supported_language(args.srclang):
    # print(args.srclang + " is not supported")
    srcreject = 'en'
    srcaccept = ''
else:
    srcaccept = args.srclang
    srcreject = ''

if not supported_language(args.trglang):
    # print(args.trglang + " is not supported")
    trgreject = 'en'
    trgaccept = ''
else:
    trgaccept = args.trglang
    trgreject = ''



for line in sys.stdin:
    text = line.rstrip().split("\t")
    if len(text) > 1:
        if text[0] and text[1]:
            if is_accepted(text[0],srcaccept,srcreject):
                if is_accepted(text[1],trgaccept,trgreject):
                    print(text[0] + "\t" + text[1])
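The filter keeps a tab-separated line pair only when both sides pass is_accepted. It relies on pycld2's detect(), which returns a reliability flag, the number of bytes inspected, and up to three (name, code, percent, score) guesses; details[0][1] is the ISO code of the top guess. A minimal sketch of that return shape (sample text and values invented):

# Hedged sketch of the pycld2 call the script builds on.
import pycld2 as cld2

isReliable, textBytesFound, details = cld2.detect(
    "Das ist ein kurzer deutscher Satz.", bestEffort=True)
# details holds up to three guesses, e.g.
#   (('GERMAN', 'de', 97, 1070.0), ('Unknown', 'un', 0, 0.0), ...)
print(isReliable, details[0][1])  # e.g. True de

For languages CLD2 does not support, the script falls back to reject mode: it drops lines whose best guess is English or unknown and accepts everything else, rather than accepting nothing.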
mono-match-lang.py (new executable file, 70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
#-*-python-*-


import pycld2 as cld2
import argparse
import sys


parser = argparse.ArgumentParser(description='language filter')
parser.add_argument('-l','--lang','--language', type=str, default='en',
                    help='accepted language')
args = parser.parse_args()

def supported_language(lang):
    supported = False
    for l in cld2.LANGUAGES:
        if l[1] == lang:
            return True
    return False


def is_accepted(line,accept,reject):
    # isReliable, textBytesFound, details = cld2.detect(line, hintLanguage=args.lang)
    isReliable, textBytesFound, details = cld2.detect(line, bestEffort=True)
    if accept:
        if details[0][1] == accept:
            if isReliable:
                # print("ACCEPT")
                # print(details)
                return True
            # else:
            #     print("REJECT - not reliable", file=sys.stderr)
            #     print(details, file=sys.stderr)
            #     print(line, file=sys.stderr)
        # else:
        #     print("REJECT", file=sys.stderr)
        #     print(details, file=sys.stderr)
        #     print(line, file=sys.stderr)
    else:
        if details[0][1] != 'un':
            if details[0][1] != reject:
                # print("ACCEPT")
                # print(details)
                return True
            # else:
            #     print("REJECT", file=sys.stderr)
            #     print(details, file=sys.stderr)
            #     print(line, file=sys.stderr)
        # else:
        #     print("REJECT", file=sys.stderr)
        #     print(details, file=sys.stderr)
        #     print(line, file=sys.stderr)



if not supported_language(args.lang):
    # print(args.lang + " is not supported")
    reject = 'en'
    accept = ''
else:
    accept = args.lang
    reject = ''


for line in sys.stdin:
    text = line.rstrip()
    if is_accepted(text,accept,reject):
        print(text)
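The monolingual filter applies the same accept/reject fallback as the bitext version, line by line on stdin. A hedged usage sketch, assuming the script sits in the current directory (path and sample text invented):

# Hedged usage sketch for mono-match-lang.py.
import subprocess

sample = "This line is English.\nDiese Zeile ist deutsch.\nEnglish again.\n"
result = subprocess.run(
    ["python3", "mono-match-lang.py", "-l", "en"],
    input=sample, capture_output=True, text=True)
print(result.stdout)  # expected: only the two English lines remain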
@@ -24,7 +24,8 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
 ${TOKENIZER}/replace-unicode-punctuation.perl |
 ${TOKENIZER}/remove-non-printing-char.perl |
-${TOKENIZER}/normalize-punctuation.perl -l $1 |
 sed 's/  */ /g;s/^ *//g;s/ *$//g' |
 ${SPMENCODE} --model $3 |
 sed "s/^/>>$2<< /"
 
+# ${TOKENIZER}/normalize-punctuation.perl -l $1 |
+
@@ -24,6 +24,7 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
 ${TOKENIZER}/replace-unicode-punctuation.perl |
 ${TOKENIZER}/remove-non-printing-char.perl |
-${TOKENIZER}/normalize-punctuation.perl -l $1 |
 sed 's/  */ /g;s/^ *//g;s/ *$//g' |
 ${SPMENCODE} --model $2
 
+# ${TOKENIZER}/normalize-punctuation.perl -l $1 |