do not add spaces in cjk

This commit is contained in:
Barry Haddow 2019-09-30 15:33:26 +01:00
parent 158d252389
commit 768944d851

View File

@ -145,10 +145,12 @@ sub preprocess {
# spaces here, so that later processing stages can tokenize readily.
# Note that this handles mixed latinate+CJK.
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
$text =~ s/ +/ /g;
# bhaddow - The substitutions below are commented out because they add white-space between
# CJK characters, which is not what we want from the sentence-splitter!
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
#$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
#$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
#$text =~ s/ +/ /g;
}
# Hindi and Gujarati do not capitalise beginning of sentence characters.