From 01855348232222ec035c2bb4b6e17b0b8c5fdc2a Mon Sep 17 00:00:00 2001 From: Joerg Tiedemann Date: Fri, 17 Jan 2020 13:42:18 +0200 Subject: [PATCH] pre-processing scripts fixed --- Makefile.dist | 59 ++++++++++++++++++++++++++-------- preprocess-bpe-multi-target.sh | 35 ++++++++++++++++++++ preprocess-bpe.sh | 2 +- preprocess-spm-multi-target.sh | 30 +++++++++++++++++ preprocess-spm.sh | 2 +- 5 files changed, 113 insertions(+), 15 deletions(-) create mode 100644 preprocess-bpe-multi-target.sh create mode 100644 preprocess-spm-multi-target.sh diff --git a/Makefile.dist b/Makefile.dist index 10753886..7bc77781 100644 --- a/Makefile.dist +++ b/Makefile.dist @@ -91,6 +91,47 @@ DATE = ${shell date +%F} MODELS_URL = https://object.pouta.csc.fi/OPUS-MT-dev SKIP_DIST_EVAL = 0 + +## determine pre-processing type + +ifneq (${words ${TRGLANGS}},1) +ifneq ("$(wildcard ${BPESRCMODEL})","") + PREPROCESS_SCRIPT = preprocess-bpe-multi-target.sh + PREPROCESS_SRCMODEL = ${BPESRCMODEL} + PREPROCESS_TRGMODEL = ${BPETRGMODEL} + POSTPROCESS_SCRIPT = postprocess-bpe.sh + PREPROCESS_TYPE = normalization + tokenization + BPE +else + PREPROCESS_SCRIPT = preprocess-spm-multi-target.sh + PREPROCESS_SRCMODEL = ${SPMSRCMODEL} + PREPROCESS_TRGMODEL = ${SPMTRGMODEL} + POSTPROCESS_SCRIPT = postprocess-spm.sh + PREPROCESS_TYPE = normalization + SentencePiece +endif +else +ifneq ("$(wildcard ${BPESRCMODEL})","") + PREPROCESS_SCRIPT = preprocess-bpe.sh + PREPROCESS_SRCMODEL = ${BPESRCMODEL} + PREPROCESS_TRGMODEL = ${BPETRGMODEL} + POSTPROCESS_SCRIPT = postprocess-bpe.sh + PREPROCESS_TYPE = normalization + tokenization + BPE +else + PREPROCESS_SCRIPT = preprocess-spm.sh + PREPROCESS_SRCMODEL = ${SPMSRCMODEL} + PREPROCESS_TRGMODEL = ${SPMTRGMODEL} + POSTPROCESS_SCRIPT = postprocess-spm.sh + PREPROCESS_TYPE = normalization + SentencePiece +endif +endif + + +ttt: + @echo ${PREPROCESS_SRCMODEL} + @echo ${PREPROCESS_TRGMODEL} + @echo ${PREPROCESS_SCRIPT} + @echo ${POSTPROCESS_SCRIPT} + + ${DIST_PACKAGE}: ${MODEL_FINAL} ifneq (${SKIP_DIST_EVAL},1) @${MAKE} $(TEST_EVALUATION) @@ -102,19 +143,11 @@ endif @echo '' >> ${WORKDIR}/README.md @echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md @echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md - @if [ -e ${BPESRCMODEL} ]; then \ - echo "* pre-processing: normalization + tokenization + BPE" >> ${WORKDIR}/README.md; \ - cp ${BPESRCMODEL} ${WORKDIR}/source.bpe; \ - cp ${BPETRGMODEL} ${WORKDIR}/target.bpe; \ - cp preprocess-bpe.sh ${WORKDIR}/preprocess.sh; \ - cp postprocess-bpe.sh ${WORKDIR}/postprocess.sh; \ - elif [ -e ${SPMSRCMODEL} ]; then \ - echo "* pre-processing: normalization + SentencePiece" >> ${WORKDIR}/README.md; \ - cp ${SPMSRCMODEL} ${WORKDIR}/source.spm; \ - cp ${SPMTRGMODEL} ${WORKDIR}/target.spm; \ - cp preprocess-spm.sh ${WORKDIR}/preprocess.sh; \ - cp postprocess-spm.sh ${WORKDIR}/postprocess.sh; \ - fi + @echo "* pre-processing: ${PREPROCESS_TYPE}" >> ${WORKDIR}/README.md + @cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.bpe + @cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.bpe + @cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh + @cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh @if [ ${words ${TRGLANGS}} -gt 1 ]; then \ echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' \ >> ${WORKDIR}/README.md; \ diff --git a/preprocess-bpe-multi-target.sh b/preprocess-bpe-multi-target.sh new file mode 100644 index 00000000..ae986dc1 --- /dev/null +++ b/preprocess-bpe-multi-target.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# +# USAGE preprocess.sh source-langid target-langid bpecodes < input > output +# +# +# replace MOSESHOME and SNMTPATH with your own setup! + + +if [ `hostname -d` == "bullx" ]; then + APPLHOME=/projappl/project_2001569 + MOSESHOME=${APPLHOME}/mosesdecoder + SNMTPATH=${APPLHOME}/subword-nmt/subword_nmt +elif [ `hostname -d` == "csc.fi" ]; then + APPLHOME=/proj/memad/tools + MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses + SNMTPATH=${APPLHOME}/subword-nmt +else + MOSESHOME=${PWD}/mosesdecoder + SNMTPATH=${PWD}/subword-nmt +fi + +MOSESSCRIPTS=${MOSESHOME}/scripts +TOKENIZER=${MOSESSCRIPTS}/tokenizer + + +THREADS=4 + +${TOKENIZER}/replace-unicode-punctuation.perl | +${TOKENIZER}/remove-non-printing-char.perl | +${TOKENIZER}/normalize-punctuation.perl -l $1 | +${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 | +sed 's/ */ /g;s/^ *//g;s/ *$//g' | +python3 ${SNMTPATH}/apply_bpe.py -c $3 | +sed "s/^/>>$2<< /" + diff --git a/preprocess-bpe.sh b/preprocess-bpe.sh index 0de089c2..a940e955 100755 --- a/preprocess-bpe.sh +++ b/preprocess-bpe.sh @@ -29,5 +29,5 @@ ${TOKENIZER}/replace-unicode-punctuation.perl | ${TOKENIZER}/remove-non-printing-char.perl | ${TOKENIZER}/normalize-punctuation.perl -l $1 | ${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 | -sed 's/ */ /g;s/^ *//g;s/ *$$//g' | +sed 's/ */ /g;s/^ *//g;s/ *$//g' | python3 ${SNMTPATH}/apply_bpe.py -c $2 diff --git a/preprocess-spm-multi-target.sh b/preprocess-spm-multi-target.sh new file mode 100644 index 00000000..920edf47 --- /dev/null +++ b/preprocess-spm-multi-target.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# USAGE preprocess.sh source-langid target-langid bpecodes < input > output +# +# +# replace MOSESHOME and SPMENCODE with your own setup! + +if [ `hostname -d` == "bullx" ]; then + APPLHOME=/projappl/project_2001569 + MOSESHOME=${APPLHOME}/mosesdecoder + SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode +elif [ `hostname -d` == "csc.fi" ]; then + APPLHOME=/proj/memad/tools + MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses + SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode +else + MOSESHOME=${PWD}/mosesdecoder + SPMENCODE=${PWD}/marian-dev/build/spm_encode +fi + +MOSESSCRIPTS=${MOSESHOME}/scripts +TOKENIZER=${MOSESSCRIPTS}/tokenizer + + +${TOKENIZER}/replace-unicode-punctuation.perl | +${TOKENIZER}/remove-non-printing-char.perl | +${TOKENIZER}/normalize-punctuation.perl -l $1 | +sed 's/ */ /g;s/^ *//g;s/ *$//g' | +${SPMENCODE} --model $3 | +sed "s/^/>>$2<< /" diff --git a/preprocess-spm.sh b/preprocess-spm.sh index e934ceb2..33a477de 100755 --- a/preprocess-spm.sh +++ b/preprocess-spm.sh @@ -25,5 +25,5 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer ${TOKENIZER}/replace-unicode-punctuation.perl | ${TOKENIZER}/remove-non-printing-char.perl | ${TOKENIZER}/normalize-punctuation.perl -l $1 | -sed 's/ */ /g;s/^ *//g;s/ *$$//g' | +sed 's/ */ /g;s/^ *//g;s/ *$//g' | ${SPMENCODE} --model $2