OPUS-MT-train/preprocess-bpe-multi-target.sh
Joerg Tiedemann 08c17af2ee sami
2020-03-27 22:30:51 +02:00

45 lines
1.2 KiB
Bash
Executable File

#!/bin/bash
#
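# USAGE preprocess-bpe-multi-target.sh source-langid target-langid bpecodes [noflags] < input > output
#
# Reads text from stdin, normalizes and tokenizes it with the Moses scripts
# (using source-langid), applies the BPE codes from the file bpecodes and writes
# the result to stdout. Unless the fourth argument is "noflags", every output
# line is prefixed with the target-language label >>target-langid<<.
#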
#
# replace MOSESHOME and SNMTPATH with your own setup!
# Pick the tool locations from the DNS domain of the current host.
# The lookup is best-effort: if `hostname -d` fails or prints nothing, the
# error is intentionally discarded and the defaults relative to ${PWD} are used.
# The domain is captured once and quoted so that an empty value no longer
# produces "unary operator expected" errors from the test command.
host_domain=$(hostname -d 2>/dev/null)

case "${host_domain}" in
  "bullx")
    APPLHOME=/projappl/project_2001569
    MOSESHOME=${APPLHOME}/mosesdecoder
    SNMTPATH=${APPLHOME}/subword-nmt/subword_nmt
    ;;
  "csc.fi")
    APPLHOME=/proj/memad/tools
    MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
    SNMTPATH=${APPLHOME}/subword-nmt
    ;;
  *)
    # Unknown (or empty) domain: expect both tools below the current directory.
    MOSESHOME=${PWD}/mosesdecoder
    SNMTPATH=${PWD}/subword-nmt
    ;;
esac

# Paths derived from MOSESHOME.
MOSESSCRIPTS=${MOSESHOME}/scripts
TOKENIZER=${MOSESSCRIPTS}/tokenizer

# Thread count handed to tokenizer.perl via -threads.
THREADS=4
# Collapse every run of two or more spaces into a single space and strip
# leading/trailing spaces. Reads stdin, writes stdout.
# The interval expression \{2,\} requires at least two spaces, so the pattern
# can never match the empty string (a pattern that can match the empty string
# would insert a space between every pair of characters).
squeeze_spaces() {
  sed 's/ \{2,\}/ /g;s/^ *//;s/ *$//'
}

# Shared preprocessing pipeline: punctuation normalization, tokenization,
# whitespace cleanup and BPE segmentation. Reads stdin, writes stdout.
#   $1 - language ID passed to the Moses scripts via -l
#   $2 - BPE codes file passed to apply_bpe.py via -c
# Uses the globals TOKENIZER, THREADS and SNMTPATH.
# The exit status is that of the last stage (apply_bpe.py).
bpe_preprocess() {
  "${TOKENIZER}/replace-unicode-punctuation.perl" |
    "${TOKENIZER}/remove-non-printing-char.perl" |
    "${TOKENIZER}/normalize-punctuation.perl" -l "$1" |
    "${TOKENIZER}/tokenizer.perl" -a -threads "${THREADS}" -l "$1" |
    squeeze_spaces |
    python3 "${SNMTPATH}/apply_bpe.py" -c "$2"
}

if [ "$4" == "noflags" ]; then
  # "noflags": emit the BPE-segmented text without a target-language label.
  bpe_preprocess "$1" "$3"
else
  # Default: prefix every line with the target-language label >>$2<<.
  bpe_preprocess "$1" "$3" |
    sed "s/^/>>$2<< /"
fi