From 78ca5f3cc5aa671a8a5d36c56452e217e6f00828 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 3 Aug 2020 21:51:09 +0100 Subject: [PATCH] Allow Arabic letters to begin a fa sentence --- scripts/ems/support/split-sentences.perl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 206b7ebe9..5df22cdc9 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -141,6 +141,7 @@ sub preprocess { $sentence_start .= "\\p{Block: Tamil}" if $language eq "ta"; $sentence_start .= "\\p{Block: Telugu}" if $language eq "te"; $sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko"; + $sentence_start .= "\\p{Arabic}" if $language eq "fa"; # we include danda and double danda (U+0964 and U+0965) as sentence split characters