From 2e48f83ab4cbf93b4f39eb8a8f91d1662cc9f5e0 Mon Sep 17 00:00:00 2001
From: Linas Vepstas <linasvepstas@gmail.com>
Date: Sun, 8 Jan 2017 10:08:53 -0600
Subject: [PATCH] Handle punctuation+CJK combinations.

---
 scripts/ems/support/split-sentences.perl | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 160c5d548..c8ff87dde 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -128,13 +128,19 @@ sub preprocess {
 
 	# A normal full-stop or other Western sentence enders followed
 	# by an ideograph is an and-of-sentence, always.
-	$text =~ s/([\.?!]) *(\p{InCJK})/$1\n$2/g;
+	$text =~ s/([\.?!]) *(\p{CJK})/$1\n$2/g;
+
+	# Space-separate any two adjacent punctuation marks (e.g. close-paren then comma).
+	$text =~ s/(\p{Punctuation}) *(\p{Punctuation})/ $1 $2 /g;
 
 	# Chinese does not use any sort of white-space between ideographs.
 	# Nominally, each single ideograph corresponds to one word. Add
 	# spaces here, so that later processing stages can tokenize readily.
 	# Note that this handles mixed latinate+CJK.
-	$text =~ s/(\p{InCJK})/ $1 /g;
+	# TODO: perhaps also cover CJKExtA, CJKExtB, CJK_Radicals_Sup, etc.?
+	$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
+	$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
+	$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
 	$text =~ s/ +/ /g;
 
 	# Special punctuation cases are covered. Check all remaining periods.