pre-processing scripts fixed

This commit is contained in:
Joerg Tiedemann 2020-01-17 13:42:18 +02:00
parent 831acb1ae7
commit 0185534823
5 changed files with 113 additions and 15 deletions

View File

@ -91,6 +91,47 @@ DATE = ${shell date +%F}
MODELS_URL = https://object.pouta.csc.fi/OPUS-MT-dev
SKIP_DIST_EVAL = 0
## determine pre-processing type
##
## Everything here is decided at parse time:
##   * if the BPE source model exists on disk, the BPE models and scripts
##     are used; otherwise the SentencePiece ones are used
##   * the pre-processing script is the multi-target variant unless
##     TRGLANGS holds exactly one language
ifeq ("$(wildcard ${BPESRCMODEL})","")
  PREPROCESS_TYPE     = normalization + SentencePiece
  PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
  PREPROCESS_TRGMODEL = ${SPMTRGMODEL}
  POSTPROCESS_SCRIPT  = postprocess-spm.sh
  ifeq (${words ${TRGLANGS}},1)
    PREPROCESS_SCRIPT = preprocess-spm.sh
  else
    PREPROCESS_SCRIPT = preprocess-spm-multi-target.sh
  endif
else
  PREPROCESS_TYPE     = normalization + tokenization + BPE
  PREPROCESS_SRCMODEL = ${BPESRCMODEL}
  PREPROCESS_TRGMODEL = ${BPETRGMODEL}
  POSTPROCESS_SCRIPT  = postprocess-bpe.sh
  ifeq (${words ${TRGLANGS}},1)
    PREPROCESS_SCRIPT = preprocess-bpe.sh
  else
    PREPROCESS_SCRIPT = preprocess-bpe-multi-target.sh
  endif
endif
## debugging helper: print the pre-/post-processing models and scripts
## selected by the conditionals above
## (declared phony so that a file named 'ttt' cannot disable the target)
.PHONY: ttt
ttt:
	@echo ${PREPROCESS_SRCMODEL}
	@echo ${PREPROCESS_TRGMODEL}
	@echo ${PREPROCESS_SCRIPT}
	@echo ${POSTPROCESS_SCRIPT}
${DIST_PACKAGE}: ${MODEL_FINAL}
ifneq (${SKIP_DIST_EVAL},1)
@${MAKE} $(TEST_EVALUATION)
@ -102,19 +143,11 @@ endif
@echo '' >> ${WORKDIR}/README.md
@echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
@echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
@if [ -e ${BPESRCMODEL} ]; then \
echo "* pre-processing: normalization + tokenization + BPE" >> ${WORKDIR}/README.md; \
cp ${BPESRCMODEL} ${WORKDIR}/source.bpe; \
cp ${BPETRGMODEL} ${WORKDIR}/target.bpe; \
cp preprocess-bpe.sh ${WORKDIR}/preprocess.sh; \
cp postprocess-bpe.sh ${WORKDIR}/postprocess.sh; \
elif [ -e ${SPMSRCMODEL} ]; then \
echo "* pre-processing: normalization + SentencePiece" >> ${WORKDIR}/README.md; \
cp ${SPMSRCMODEL} ${WORKDIR}/source.spm; \
cp ${SPMTRGMODEL} ${WORKDIR}/target.spm; \
cp preprocess-spm.sh ${WORKDIR}/preprocess.sh; \
cp postprocess-spm.sh ${WORKDIR}/postprocess.sh; \
fi
@echo "* pre-processing: ${PREPROCESS_TYPE}" >> ${WORKDIR}/README.md
@cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.bpe
@cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.bpe
@cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
@cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
@if [ ${words ${TRGLANGS}} -gt 1 ]; then \
echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' \
>> ${WORKDIR}/README.md; \

View File

@ -0,0 +1,35 @@
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes < input > output
#
# Reads text from stdin, runs the Moses punctuation normalization and
# tokenizer for the source language ($1), applies the BPE codes in $3 and
# prefixes every output line with the target-language token >>$2<<.
#
#
# replace MOSESHOME and SNMTPATH with your own setup!

# Query the DNS domain once. It is quoted in the tests below because
# `hostname -d` prints nothing on hosts without a domain, which would
# otherwise turn the test into a syntax error.
domain=$(hostname -d 2>/dev/null)

if [ "${domain}" = "bullx" ]; then
    APPLHOME=/projappl/project_2001569
    MOSESHOME=${APPLHOME}/mosesdecoder
    SNMTPATH=${APPLHOME}/subword-nmt/subword_nmt
elif [ "${domain}" = "csc.fi" ]; then
    APPLHOME=/proj/memad/tools
    MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
    SNMTPATH=${APPLHOME}/subword-nmt
else
    MOSESHOME=${PWD}/mosesdecoder
    SNMTPATH=${PWD}/subword-nmt
fi

MOSESSCRIPTS=${MOSESHOME}/scripts
TOKENIZER=${MOSESSCRIPTS}/tokenizer

THREADS=4

# The first sed expression needs TWO spaces before the '*': it collapses runs
# of spaces into one. With a single space the pattern matches the empty
# string and a space is inserted between every character.
"${TOKENIZER}/replace-unicode-punctuation.perl" |
"${TOKENIZER}/remove-non-printing-char.perl" |
"${TOKENIZER}/normalize-punctuation.perl" -l "$1" |
"${TOKENIZER}/tokenizer.perl" -a -threads "${THREADS}" -l "$1" |
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
python3 "${SNMTPATH}/apply_bpe.py" -c "$3" |
sed "s/^/>>$2<< /"

View File

@ -29,5 +29,5 @@ ${TOKENIZER}/replace-unicode-punctuation.perl |
${TOKENIZER}/remove-non-printing-char.perl |
${TOKENIZER}/normalize-punctuation.perl -l $1 |
${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 |
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
python3 ${SNMTPATH}/apply_bpe.py -c $2

View File

@ -0,0 +1,30 @@
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid spmodel < input > output
#
# Reads text from stdin, runs the Moses punctuation normalization for the
# source language ($1), encodes the text with the SentencePiece model in $3
# and prefixes every output line with the target-language token >>$2<<.
#
#
# replace MOSESHOME and SPMENCODE with your own setup!

# Query the DNS domain once. It is quoted in the tests below because
# `hostname -d` prints nothing on hosts without a domain, which would
# otherwise turn the test into a syntax error.
domain=$(hostname -d 2>/dev/null)

if [ "${domain}" = "bullx" ]; then
    APPLHOME=/projappl/project_2001569
    MOSESHOME=${APPLHOME}/mosesdecoder
    SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
elif [ "${domain}" = "csc.fi" ]; then
    APPLHOME=/proj/memad/tools
    MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
    SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
else
    MOSESHOME=${PWD}/mosesdecoder
    SPMENCODE=${PWD}/marian-dev/build/spm_encode
fi

MOSESSCRIPTS=${MOSESHOME}/scripts
TOKENIZER=${MOSESSCRIPTS}/tokenizer

# The first sed expression needs TWO spaces before the '*': it collapses runs
# of spaces into one. With a single space the pattern matches the empty
# string and a space is inserted between every character.
"${TOKENIZER}/replace-unicode-punctuation.perl" |
"${TOKENIZER}/remove-non-printing-char.perl" |
"${TOKENIZER}/normalize-punctuation.perl" -l "$1" |
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
"${SPMENCODE}" --model "$3" |
sed "s/^/>>$2<< /"

View File

@ -25,5 +25,5 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
${TOKENIZER}/replace-unicode-punctuation.perl |
${TOKENIZER}/remove-non-printing-char.perl |
${TOKENIZER}/normalize-punctuation.perl -l $1 |
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $2