From 7ad5ffa0c0e57308b18b2c705c98ba103902f135 Mon Sep 17 00:00:00 2001
From: Achim Ruopp
Date: Wed, 10 Jul 2019 10:48:32 -0400
Subject: [PATCH] Support for Urdu in sentence splitter

---
 scripts/ems/support/split-sentences.perl | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 90fa6ac90..a1cfb0d37 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -165,6 +165,20 @@ sub preprocess {
         }{$1\n$2}gx;
     }
 
+    # Urdu support
+    # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
+    if ($language eq 'ur') {
+        $text =~ s{
+            ( (?: [\.\?!\x{06d4}] | \.\.+ )
+                [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
+            )
+            \s+
+            ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
+                [\x{0600}-\x{06ff}]
+            )
+        }{$1\n$2}gx;
+    }
+
     # Special punctuation cases are covered. Check all remaining periods.
     my $word;
     my $i;