From 3910cd6c4625eefa57600159e66f9a86122750fa Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 31 Oct 2019 21:28:43 +0000 Subject: [PATCH] devanagari fix --- scripts/ems/support/split-sentences.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 4e2798067..0279a0b88 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -121,7 +121,7 @@ sub preprocess { # Sentences can start with upper-case, numnbers, or Indic characters my $sentence_start = "\\p{IsUpper}0-9"; - $sentence_start .= "\\p{Block: Devanagari_Extended}" if $language eq "hi"; + $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if $language eq "hi"; $sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu"; # we include danda and double danda (U+0964 and U+0965) as sentence split characters