diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 90fa6ac90..a1cfb0d37 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -165,6 +165,20 @@ sub preprocess { }{$1\n$2}gx; } + # Urdu support + # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode + if ($language eq 'ur') { + $text =~ s{ + ( (?: [\.\?!\x{06d4}] | \.\.+ ) + [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]* + ) + \s+ + ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]* + [\x{0600}-\x{06ff}] + ) + }{$1\n$2}gx; + } + # Special punctuation cases are covered. Check all remaining periods. my $word; my $i;