mirror of
https://github.com/Helsinki-NLP/OPUS-MT-train.git
synced 2024-10-26 21:19:02 +03:00
102 lines
2.1 KiB
Bash
Executable File
102 lines
2.1 KiB
Bash
Executable File
#!/bin/bash
#
# USAGE preprocess.sh source-langid target-langid spmodel [noflags] < input > output
#
# Reads raw text on stdin, normalises full-width / CJK punctuation and
# digits to ASCII, replaces control characters with spaces, squeezes
# whitespace and segments every line with the SentencePiece model given
# as 3rd argument. Unless the 4th argument is "noflags", every output
# line is prefixed with the target language token ">>target-langid<< ".
#
#   $1  source-langid  (accepted for interface compatibility; not used below)
#   $2  target-langid  (only used for the ">>id<<" prefix)
#   $3  spmodel        (path handed to spm_encode --model)
#   $4  "noflags"      (optional; suppresses the language-token prefix)
#
# replace SPMENCODE with your own setup!
#
# CHANGES
#
#  * issue with perl code that removes control characters
#    unicode property Other = \p{C}) seems to remove
#    newline characters as well --> add negative lookahead
#    to avoid removing newline characters!
#    (the lookahead is now applied in both the "noflags" and the
#    default code path; previously "noflags" still joined all lines)

if (( $# < 3 )); then
  printf 'USAGE: %s source-langid target-langid spmodel [noflags] < input > output\n' \
    "${0##*/}" >&2
  exit 2
fi

# Prefer an spm_encode found on PATH; otherwise fall back to the copy that
# a marian-dev build places below the current working directory.
SPMENCODE=$(command -v spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode")

# Map full-width / CJK punctuation and full-width digits to ASCII.
# The order of the expressions is kept exactly as before.
normalize_punctuation() {
  # NOTE(review): the rule for full-width "1" yields a double quote rather
  # than "1", unlike the other digits. It is kept unchanged because the
  # original rule list had it and existing models may depend on it;
  # confirm before "fixing" it.
  sed -e 's/,/,/g' \
      -e 's/。 */. /g' \
      -e 's/、/,/g' \
      -e 's/”/"/g' \
      -e 's/“/"/g' \
      -e 's/∶/:/g' \
      -e 's/:/:/g' \
      -e 's/?/?/g' \
      -e 's/《/"/g' \
      -e 's/》/"/g' \
      -e 's/)/)/g' \
      -e 's/!/!/g' \
      -e 's/(/(/g' \
      -e 's/;/;/g' \
      -e 's/1/"/g' \
      -e 's/」/"/g' \
      -e 's/「/"/g' \
      -e 's/0/0/g' \
      -e 's/3/3/g' \
      -e 's/2/2/g' \
      -e 's/5/5/g' \
      -e 's/6/6/g' \
      -e 's/9/9/g' \
      -e 's/7/7/g' \
      -e 's/8/8/g' \
      -e 's/4/4/g' \
      -e 's/. */. /g' \
      -e 's/~/~/g' \
      -e "s/’/'/g" \
      -e 's/…/.../g' \
      -e 's/━/-/g' \
      -e 's/〈/</g' \
      -e 's/〉/>/g' \
      -e 's/【/[/g' \
      -e 's/】/]/g' \
      -e 's/%/%/g'
}

# Full text clean-up shared by both output modes: stdin -> stdout.
clean_text() {
  # Stage 2: every character of Unicode category "Other" becomes a space,
  #   except the newline that terminates the line (negative lookahead).
  #   -CSD decodes UTF-8 unconditionally; a bare -C is -CSDL, which only
  #   decodes when the locale variables announce UTF-8 and would otherwise
  #   apply \p{C} to raw bytes.
  # Stage 3: drop word joiner, zero-width space and BOM. These are format
  #   characters and therefore already handled by stage 2; the stage is
  #   kept so the pipeline order stays as it was.
  # Stage 4: squeeze runs of spaces, trim leading and trailing spaces.
  normalize_punctuation |
    perl -CSD -pe 's/(?!\n)\p{C}/ /g;' |
    perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |
    sed 's/  */ /g;s/^ *//g;s/ *$//g'
}

if [[ "${4:-}" == "noflags" ]]; then
  clean_text |
    "${SPMENCODE}" --model "$3"
else
  # Prefix each segmented line with the target language token.
  # Assumes the language id contains no "/", "&" or backslash, since it is
  # interpolated into the sed replacement text.
  clean_text |
    "${SPMENCODE}" --model "$3" |
    sed "s/^/>>$2<< /"
fi
|
||
|