From 2e48f83ab4cbf93b4f39eb8a8f91d1662cc9f5e0 Mon Sep 17 00:00:00 2001
From: Linas Vepstas
Date: Sun, 8 Jan 2017 10:08:53 -0600
Subject: [PATCH] Handle punctuation+CJK combinations.

---
 scripts/ems/support/split-sentences.perl | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 160c5d548..c8ff87dde 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -128,13 +128,19 @@ sub preprocess {
 
 	# A normal full-stop or other Western sentence enders followed
 	# by an ideograph is an and-of-sentence, always.
-	$text =~ s/([\.?!]) *(\p{InCJK})/$1\n$2/g;
+	$text =~ s/([\.?!]) *(\p{CJK})/$1\n$2/g;
+
+	# Split adjacent punctuation marks (e.g. close-paren-then-comma) into two.
+	$text =~ s/(\p{Punctuation}) *(\p{Punctuation})/ $1 $2 /g;
 
 	# Chinese does not use any sort of white-space between ideographs.
 	# Nominally, each single ideograph corresponds to one word. Add
 	# spaces here, so that later processing stages can tokenize readily.
 	# Note that this handles mixed latinate+CJK.
-	$text =~ s/(\p{InCJK})/ $1 /g;
+	# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
+	$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
+	$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
+	$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
 	$text =~ s/ +/ /g;
 
 	# Special punctuation cases are covered. Check all remaining periods.