diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 90fa6ac90..b7d5c5a6d 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -145,10 +145,12 @@ sub preprocess { # spaces here, so that later processing stages can tokenize readily. # Note that this handles mixed latinate+CJK. # TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ? - $text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g; - $text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g; - $text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g; - $text =~ s/ +/ /g; + # bhaddow - Commented out because it inserts whitespace between Chinese characters, which is not + # what we want from a sentence splitter. + #$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g; + #$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g; + #$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g; + #$text =~ s/ +/ /g; } # Hindi and Gujarati do not capitalise beginning of sentence characters.