Merge pull request #217 from moses-smt/alvations-patch-2

Proper spacing for sent-split perl script
2024-09-11 11:25:40 +03:00 · 2020-01-05 19:46:25 -08:00 · 2020-01-05 19:46:25 -08:00 · e4a52f14e4
commit e4a52f14e4
parent f46ee7c5ac d03df21e88
1 changed file with 50 additions and 50 deletions
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -58,7 +58,7 @@ if ($prefixfile ne "") {

    $prefixfile = "$mydir/nonbreaking_prefix.$language";

-# Default to English, if we don't have a language-specific prefix file.
+    # Default to English, if we don't have a language-specific prefix file.
    if (!(-e $prefixfile)) {
      $prefixfile = "$mydir/nonbreaking_prefix.en";
      print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
@@ -135,7 +135,6 @@ sub preprocess {
 	$sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
 	$sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";

-
 	# we include danda and double danda (U+0964 and U+0965) as sentence split characters

 	# Non-period end of sentence markers (?!) followed by sentence starters.
@@ -154,7 +153,7 @@ sub preprocess {
 	$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;


-#NOTE: Korean no longer handled here.
+	#NOTE: Korean is no longer handled here, because Korean has spaces.
 	if ($is_cjk == 1) {
 		# Chinese uses unusual end-of-sentence markers. These are NOT
 		# followed by whitespace.  Nor is there any idea of capitalization.
@@ -175,6 +174,7 @@ sub preprocess {
 		# spaces here, so that later processing stages can tokenize readily.
 		# Note that this handles mixed latinate+CJK.
 		# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
+
 		# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
    	# what we want from sentence-splitter!
 		#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
@@ -211,19 +211,19 @@ sub preprocess {
 			my $starting_punct = $2;
 			if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
 				# Not breaking;
-   #     print "NBP1 $words[$i] $words[$i+1]\n";
+                ## print "NBP1 $words[$i] $words[$i+1]\n";
 			} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
 				# Not breaking - upper case acronym
-  #      print "NBP2 $words[$i] $words[$i+1]\n";
+                #print "NBP2 $words[$i] $words[$i+1]\n";
      } elsif ($LIST_ITEM
             && ($i == 0 || substr($words[$i-1], -1) eq "\n")
             && $words[$i] =~ /^\(?(([0-9]+)|([ivx]+)|([A-Za-z]))\)?\.$/) {
-        #Maybe list item - non breaking
- #       print "NBP3 $words[$i] $words[$i+1]\n";
+        	 	# Maybe list item - non breaking
+ 				#print "NBP3 $words[$i] $words[$i+1]\n";
 			} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
 				# The next word has a bunch of initial quotes, maybe a
 				# space, then either upper case or a number
-#        print "MAYBE $words[$i] $words[$i+1]\n";
+                #print "MAYBE $words[$i] $words[$i+1]\n";
 				$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
 				# We always add a return for these, unless we have a
 				# numeric non-breaker and a number start.