mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-11-30 22:14:14 +03:00
pre-processing scripts fixed
This commit is contained in:
parent
831acb1ae7
commit
0185534823
@ -91,6 +91,47 @@ DATE = ${shell date +%F}
|
||||
MODELS_URL = https://object.pouta.csc.fi/OPUS-MT-dev
|
||||
SKIP_DIST_EVAL = 0
|
||||
|
||||
|
||||
## determine pre-processing type
|
||||
|
||||
## Select the subword scheme first (BPE if the BPE source model file exists,
## SentencePiece otherwise), then pick the multi-target variant of the
## pre-processing script when more than one target language is configured.
## All conditionals are evaluated while the makefile is parsed.
ifneq ("$(wildcard ${BPESRCMODEL})","")
  PREPROCESS_TYPE     = normalization + tokenization + BPE
  PREPROCESS_SRCMODEL = ${BPESRCMODEL}
  PREPROCESS_TRGMODEL = ${BPETRGMODEL}
  POSTPROCESS_SCRIPT  = postprocess-bpe.sh
  ifneq (${words ${TRGLANGS}},1)
    PREPROCESS_SCRIPT = preprocess-bpe-multi-target.sh
  else
    PREPROCESS_SCRIPT = preprocess-bpe.sh
  endif
else
  PREPROCESS_TYPE     = normalization + SentencePiece
  PREPROCESS_SRCMODEL = ${SPMSRCMODEL}
  PREPROCESS_TRGMODEL = ${SPMTRGMODEL}
  POSTPROCESS_SCRIPT  = postprocess-spm.sh
  ifneq (${words ${TRGLANGS}},1)
    PREPROCESS_SCRIPT = preprocess-spm-multi-target.sh
  else
    PREPROCESS_SCRIPT = preprocess-spm.sh
  endif
endif
|
||||
|
||||
|
||||
## Debugging aid: print the pre-/post-processing settings selected above,
## one value per line (source model, target model, pre-processing script,
## post-processing script).
## Declared phony because it names a command, not a file; without this a
## file called `ttt` in the working directory would make the rule a no-op.
.PHONY: ttt
ttt:
	@echo ${PREPROCESS_SRCMODEL}
	@echo ${PREPROCESS_TRGMODEL}
	@echo ${PREPROCESS_SCRIPT}
	@echo ${POSTPROCESS_SCRIPT}
|
||||
|
||||
|
||||
${DIST_PACKAGE}: ${MODEL_FINAL}
|
||||
ifneq (${SKIP_DIST_EVAL},1)
|
||||
@${MAKE} $(TEST_EVALUATION)
|
||||
@ -102,19 +143,11 @@ endif
|
||||
@echo '' >> ${WORKDIR}/README.md
|
||||
@echo "* dataset: ${DATASET}" >> ${WORKDIR}/README.md
|
||||
@echo "* model: ${MODELTYPE}" >> ${WORKDIR}/README.md
|
||||
@if [ -e ${BPESRCMODEL} ]; then \
|
||||
echo "* pre-processing: normalization + tokenization + BPE" >> ${WORKDIR}/README.md; \
|
||||
cp ${BPESRCMODEL} ${WORKDIR}/source.bpe; \
|
||||
cp ${BPETRGMODEL} ${WORKDIR}/target.bpe; \
|
||||
cp preprocess-bpe.sh ${WORKDIR}/preprocess.sh; \
|
||||
cp postprocess-bpe.sh ${WORKDIR}/postprocess.sh; \
|
||||
elif [ -e ${SPMSRCMODEL} ]; then \
|
||||
echo "* pre-processing: normalization + SentencePiece" >> ${WORKDIR}/README.md; \
|
||||
cp ${SPMSRCMODEL} ${WORKDIR}/source.spm; \
|
||||
cp ${SPMTRGMODEL} ${WORKDIR}/target.spm; \
|
||||
cp preprocess-spm.sh ${WORKDIR}/preprocess.sh; \
|
||||
cp postprocess-spm.sh ${WORKDIR}/postprocess.sh; \
|
||||
fi
|
||||
@echo "* pre-processing: ${PREPROCESS_TYPE}" >> ${WORKDIR}/README.md
|
||||
@cp ${PREPROCESS_SRCMODEL} ${WORKDIR}/source.bpe
|
||||
@cp ${PREPROCESS_TRGMODEL} ${WORKDIR}/target.bpe
|
||||
@cp ${PREPROCESS_SCRIPT} ${WORKDIR}/preprocess.sh
|
||||
@cp ${POSTPROCESS_SCRIPT} ${WORKDIR}/postprocess.sh
|
||||
@if [ ${words ${TRGLANGS}} -gt 1 ]; then \
|
||||
echo '* a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID)' \
|
||||
>> ${WORKDIR}/README.md; \
|
||||
|
35
preprocess-bpe-multi-target.sh
Normal file
35
preprocess-bpe-multi-target.sh
Normal file
@ -0,0 +1,35 @@
|
||||
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes < input > output
#
# Reads text from stdin, runs it through the Moses punctuation
# normalization and tokenizer scripts (using source-langid), applies the
# given BPE codes with subword-nmt's apply_bpe.py and finally prefixes
# every line with the target language token ">>target-langid<< ".
#
# replace MOSESHOME and SNMTPATH with your own setup!

# Look up the DNS domain once. The result is quoted in the tests below
# because an empty domain would otherwise turn `[ == "bullx" ]` into a
# test error instead of simply falling through to the default paths.
DOMAIN=$(hostname -d 2>/dev/null)

if [ "${DOMAIN}" == "bullx" ]; then
  APPLHOME=/projappl/project_2001569
  MOSESHOME=${APPLHOME}/mosesdecoder
  SNMTPATH=${APPLHOME}/subword-nmt/subword_nmt
elif [ "${DOMAIN}" == "csc.fi" ]; then
  APPLHOME=/proj/memad/tools
  MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
  SNMTPATH=${APPLHOME}/subword-nmt
else
  # default: tools are expected below the current working directory
  MOSESHOME=${PWD}/mosesdecoder
  SNMTPATH=${PWD}/subword-nmt
fi

MOSESSCRIPTS=${MOSESHOME}/scripts
TOKENIZER=${MOSESSCRIPTS}/tokenizer

# number of threads passed to the tokenizer
THREADS=4

# The first sed expression needs TWO spaces before the `*`: with a single
# space the pattern also matches the empty string and sed would insert a
# space between every character instead of collapsing runs of spaces.
"${TOKENIZER}/replace-unicode-punctuation.perl" |
"${TOKENIZER}/remove-non-printing-char.perl" |
"${TOKENIZER}/normalize-punctuation.perl" -l "$1" |
"${TOKENIZER}/tokenizer.perl" -a -threads "${THREADS}" -l "$1" |
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
python3 "${SNMTPATH}/apply_bpe.py" -c "$3" |
sed "s/^/>>$2<< /"
|
||||
|
@ -29,5 +29,5 @@ ${TOKENIZER}/replace-unicode-punctuation.perl |
|
||||
${TOKENIZER}/remove-non-printing-char.perl |
|
||||
${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
||||
${TOKENIZER}/tokenizer.perl -a -threads ${THREADS} -l $1 |
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||
python3 ${SNMTPATH}/apply_bpe.py -c $2
|
||||
|
30
preprocess-spm-multi-target.sh
Normal file
30
preprocess-spm-multi-target.sh
Normal file
@ -0,0 +1,30 @@
|
||||
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid spmodel < input > output
#
# Reads text from stdin, runs it through the Moses punctuation
# normalization scripts (using source-langid), encodes it with spm_encode
# using the given model file (third argument) and finally prefixes every
# line with the target language token ">>target-langid<< ".
#
# replace MOSESHOME and SPMENCODE with your own setup!

# Look up the DNS domain once. The result is quoted in the tests below
# because an empty domain would otherwise turn `[ == "bullx" ]` into a
# test error instead of simply falling through to the default paths.
DOMAIN=$(hostname -d 2>/dev/null)

if [ "${DOMAIN}" == "bullx" ]; then
  APPLHOME=/projappl/project_2001569
  MOSESHOME=${APPLHOME}/mosesdecoder
  SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
elif [ "${DOMAIN}" == "csc.fi" ]; then
  APPLHOME=/proj/memad/tools
  MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
  SPMENCODE=${APPLHOME}/marian-dev/build-spm/spm_encode
else
  # default: tools are expected below the current working directory
  MOSESHOME=${PWD}/mosesdecoder
  SPMENCODE=${PWD}/marian-dev/build/spm_encode
fi

MOSESSCRIPTS=${MOSESHOME}/scripts
TOKENIZER=${MOSESSCRIPTS}/tokenizer

# The first sed expression needs TWO spaces before the `*`: with a single
# space the pattern also matches the empty string and sed would insert a
# space between every character instead of collapsing runs of spaces.
"${TOKENIZER}/replace-unicode-punctuation.perl" |
"${TOKENIZER}/remove-non-printing-char.perl" |
"${TOKENIZER}/normalize-punctuation.perl" -l "$1" |
sed 's/  */ /g;s/^ *//g;s/ *$//g' |
"${SPMENCODE}" --model "$3" |
sed "s/^/>>$2<< /"
|
@ -25,5 +25,5 @@ TOKENIZER=${MOSESSCRIPTS}/tokenizer
|
||||
${TOKENIZER}/replace-unicode-punctuation.perl |
|
||||
${TOKENIZER}/remove-non-printing-char.perl |
|
||||
${TOKENIZER}/normalize-punctuation.perl -l $1 |
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$$//g' |
|
||||
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
|
||||
${SPMENCODE} --model $2
|
||||
|
Loading…
Reference in New Issue
Block a user