mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2025-01-08 10:48:25 +03:00
45 lines
1.2 KiB
Bash
Executable File
45 lines
1.2 KiB
Bash
Executable File
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid bpecodes [noflags] < input > output
#
# This first part of the script only decides where the external tools live.
# It sets:
#   MOSESHOME    - root of the Moses installation
#   SNMTPATH     - directory that contains apply_bpe.py
#   MOSESSCRIPTS - ${MOSESHOME}/scripts
#   TOKENIZER    - ${MOSESSCRIPTS}/tokenizer
#   THREADS      - value handed to tokenizer.perl via -threads
#
# replace MOSESHOME and SNMTPATH with your own setup!

# Look the DNS domain up once and keep it quoted from here on: an empty or
# multi-word answer (or a hostname that does not support -d) must simply
# select the generic fallback instead of producing a `[` syntax error.
host_domain=$(hostname -d 2>/dev/null)

case "${host_domain}" in
  bullx)
    APPLHOME=/projappl/project_2001569
    MOSESHOME="${APPLHOME}/mosesdecoder"
    SNMTPATH="${APPLHOME}/subword-nmt/subword_nmt"
    ;;
  csc.fi)
    APPLHOME=/proj/memad/tools
    MOSESHOME=/proj/nlpl/software/moses/4.0-65c75ff/moses
    SNMTPATH="${APPLHOME}/subword-nmt"
    ;;
  *)
    # Any other host: expect both tools to be checked out in the current
    # working directory.
    MOSESHOME="${PWD}/mosesdecoder"
    SNMTPATH="${PWD}/subword-nmt"
    ;;
esac

MOSESSCRIPTS="${MOSESHOME}/scripts"
TOKENIZER="${MOSESSCRIPTS}/tokenizer"

THREADS=4

# Report a failure of any pipeline stage (e.g. a missing Moses script), not
# only of the last command in the pipe.
set -o pipefail

#######################################
# Collapse every run of spaces into a single space and strip leading and
# trailing spaces from each line.
# The pattern demands at least one space: a bare ` *` also matches the empty
# string and would insert a space between all characters.
# Outputs: cleaned stdin on stdout
#######################################
normalize_spaces() {
  sed 's/ \{1,\}/ /g;s/^ *//;s/ *$//'
}

#######################################
# Run the shared pipeline on stdin: punctuation replacement, removal of
# non-printing characters, punctuation normalisation, tokenisation,
# whitespace clean-up and BPE segmentation.
# Globals:   TOKENIZER, SNMTPATH, THREADS (read)
# Arguments: $1 - language id passed to the Moses scripts via -l
#            $2 - BPE codes file passed to apply_bpe.py via -c
# Outputs:   segmented text on stdout
#######################################
segment_text() {
  local lang=$1
  local codes=$2
  "${TOKENIZER}/replace-unicode-punctuation.perl" \
    | "${TOKENIZER}/remove-non-printing-char.perl" \
    | "${TOKENIZER}/normalize-punctuation.perl" -l "${lang}" \
    | "${TOKENIZER}/tokenizer.perl" -a -threads "${THREADS}" -l "${lang}" \
    | normalize_spaces \
    | python3 "${SNMTPATH}/apply_bpe.py" -c "${codes}"
}

#######################################
# Prefix every input line with the target-language flag ">>id<< ".
# Arguments: $1 - target language id; it is inserted verbatim into a sed
#                 replacement, so it must not contain '/', '&' or '\'.
#######################################
add_target_flag() {
  sed "s/^/>>$1<< /"
}

#######################################
# Entry point.
# Arguments: $1 - source language id
#            $2 - target language id
#            $3 - BPE codes file
#            $4 - optional literal "noflags": do not add the >>id<< prefix
# Returns:   2 on missing arguments, otherwise the pipeline status
#######################################
main() {
  if [ "$#" -lt 3 ]; then
    printf 'USAGE: %s source-langid target-langid bpecodes [noflags] < input > output\n' \
      "${0##*/}" >&2
    return 2
  fi

  if [ "${4:-}" == "noflags" ]; then
    segment_text "$1" "$3"
  else
    segment_text "$1" "$3" | add_target_flag "$2"
  fi
}

main "$@"