From 2affb9b624fe8a4f2d51f4e099fa062b52bda504 Mon Sep 17 00:00:00 2001
From: Barry Haddow <barry.haddow@gmail.com>
Date: Thu, 31 Oct 2019 16:50:17 +0000
Subject: [PATCH] reorganise indic support

---
 scripts/ems/support/split-sentences.perl | 42 ++++++++++++++----------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index e553c3b05..4e2798067 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -119,20 +119,27 @@ sub preprocess {
 
 	##### Add sentence breaks as needed #####
 
+  # Sentences can start with upper-case letters, numbers, or Indic characters
+  my $sentence_start = "\\p{IsUpper}0-9";
+  $sentence_start .= "\\p{Block: Devanagari_Extended}" if $language eq "hi";
+  $sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu";
+
+  # we include danda and double danda (U+0964 and U+0965) as sentence split characters
+
 	# Non-period end of sentence markers (?!) followed by sentence starters.
-	$text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
+	$text =~ s/([?!\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
 
 	# Multi-dots followed by sentence starters.
-	$text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
+	$text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
 
 	# Add breaks for sentences that end with some sort of punctuation
 	# inside a quote or parenthetical and are followed by a possible
 	# sentence starter punctuation and upper case.
-	$text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
+	$text =~ s/([?!\.\x{0964}\x{0965}][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[$sentence_start])/$1\n$2/g;
 
 	# Add breaks for sentences that end with some sort of punctuation,
 	# and are followed by a sentence starter punctuation and upper case.
-	$text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
+	$text =~ s/([?!\.\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
 
 	if ($is_cjk == 1) {
 		# Chinese uses unusual end-of-sentence markers. These are NOT
@@ -161,19 +168,20 @@ sub preprocess {
 		#$text =~ s/ +/ /g;
 	}
 
+  # Indic languages are now dealt with using the regular rules above.
   # Hindi and Gujarati do not capitalise beginning of sentence characters.
   # Also Hindi traditionally uses a danda as a sentence separator (U+0964)
-  if ($language eq 'hi' || $language eq 'gu') {
-    $text =~ s{
-            ( (?: [\.\?!\x{0964}\x{0965}] | \.\.+ )
-              [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
-              )
-            \s+
-            ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
-              [\p{Block: Devanagari_Extended}\p{Block: Gujarati}]
-              )
-        }{$1\n$2}gx;
-  }
+  #if ($language eq 'hi' || $language eq 'gu') {
+  #  $text =~ s{
+  #          ( (?: [\.\?!\x{0964}\x{0965}] | \.\.+ )
+  #            [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
+  #            )
+  #          \s+
+  #          ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
+  #            [\p{Block: Devanagari_Extended}\p{Block: Gujarati}]
+  #            )
+  #      }{$1\n$2}gx;
+  #}
 
   # Urdu support
   # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
@@ -201,9 +209,9 @@ sub preprocess {
 			my $starting_punct = $2;
 			if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
 				# Not breaking;
-			} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
+			} elsif ($words[$i] =~ /(\.)[$sentence_start\-]+(\.+)$/) {
 				# Not breaking - upper case acronym
-			} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
+			} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
 				# The next word has a bunch of initial quotes, maybe a
 				# space, then either upper case or a number
 				$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));