Proper spacing

2024-12-25 21:03:22 +03:00 · 2020-01-06 11:43:31 +08:00 · 2020-01-06 11:43:31 +08:00 · d03df21e88
commit d03df21e88
parent f46ee7c5ac
1 changed file with 50 additions and 50 deletions
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -135,7 +135,6 @@ sub preprocess {
 	$sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
 	$sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
 	# we include danda and double danda (U+0964 and U+0965) as sentence split characters
 	# Non-period end of sentence markers (?!) followed by sentence starters.
@@ -154,7 +153,7 @@ sub preprocess {
 	$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
-#NOTE: Korean no longer handled here.
+	#NOTE: Korean no longer handled here, cos Korean has spaces.
 	if ($is_cjk == 1) {
 		# Chinese uses unusual end-of-sentence markers. These are NOT
 		# followed by whitespace.  Nor is there any idea of capitalization.
@@ -175,6 +174,7 @@ sub preprocess {
 		# spaces here, so that later processing stages can tokenize readily.
 		# Note that this handles mixed latinate+CJK.
 		# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
 		# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
    	# what we want from sentence-splitter!
 		#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
@@ -211,7 +211,7 @@ sub preprocess {
 			my $starting_punct = $2;
 			if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
 				# Not breaking;
-   #     print "NBP1 $words[$i] $words[$i+1]\n";
+                ## print "NBP1 $words[$i] $words[$i+1]\n";
 			} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
 				# Not breaking - upper case acronym
                #print "NBP2 $words[$i] $words[$i+1]\n";