OPUS-MT-train/scripts/preprocess-spm-multi-target.sh

102 lines
2.1 KiB
Bash
Raw Permalink Normal View History

#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid spmodel [noflags] < input > output
#
# replace SPMENCODE with your own setup!
2021-02-25 18:17:21 +03:00
#
# CHANGES
#
# * issue with perl code that removes control characters
# unicode property Other = \p{C}) seems to remove
# newline characters as well --> add negative lookahead
# to avoid removing newline characters!
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
if [ "$4" == "noflags" ]; then
sed -e 's//,/g' \
-e 's/。 */. /g' \
-e 's/、/,/g' \
-e 's/”/"/g' \
-e 's/“/"/g' \
-e 's//:/g' \
-e 's//:/g' \
-e 's//\?/g' \
-e 's/《/"/g' \
-e 's/》/"/g' \
-e 's//\)/g' \
-e 's//\!/g' \
-e 's//\(/g' \
-e 's//;/g' \
-e 's//"/g' \
-e 's/」/"/g' \
-e 's/「/"/g' \
-e 's//0/g' \
-e 's//3/g' \
-e 's//2/g' \
-e 's//5/g' \
-e 's//6/g' \
-e 's//9/g' \
-e 's//7/g' \
-e 's//8/g' \
-e 's//4/g' \
-e 's/ */. /g' \
-e 's//\~/g' \
-e "s//\'/g" \
-e 's/…/\.\.\./g' \
-e 's/━/\-/g' \
-e 's/〈/\</g' \
-e 's/〉/\>/g' \
-e 's/【/\[/g' \
-e 's/】/\]/g' \
-e 's//\%/g' |
perl -C -pe 's/\p{C}/ /g;' |
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3
else
sed -e 's//,/g' \
-e 's/。 */. /g' \
-e 's/、/,/g' \
-e 's/”/"/g' \
-e 's/“/"/g' \
-e 's//:/g' \
-e 's//:/g' \
-e 's//\?/g' \
-e 's/《/"/g' \
-e 's/》/"/g' \
-e 's//\)/g' \
-e 's//\!/g' \
-e 's//\(/g' \
-e 's//;/g' \
-e 's//"/g' \
-e 's/」/"/g' \
-e 's/「/"/g' \
-e 's//0/g' \
-e 's//3/g' \
-e 's//2/g' \
-e 's//5/g' \
-e 's//6/g' \
-e 's//9/g' \
-e 's//7/g' \
-e 's//8/g' \
-e 's//4/g' \
-e 's/ */. /g' \
-e 's//\~/g' \
-e "s//\'/g" \
-e 's/…/\.\.\./g' \
-e 's/━/\-/g' \
-e 's/〈/\</g' \
-e 's/〉/\>/g' \
-e 's/【/\[/g' \
-e 's/】/\]/g' \
-e 's//\%/g' |
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
sed 's/ */ /g;s/^ *//g;s/ *$//g' |
${SPMENCODE} --model $3 |
sed "s/^/>>$2<< /"
fi