Merge pull request #217 from moses-smt/alvations-patch-2

Proper spacing for sent-split perl script
2024-08-16 23:10:31 +03:00 · 2020-01-05 19:46:25 -08:00 · 2020-01-05 19:46:25 -08:00 · e4a52f14e4
commit e4a52f14e4
parent f46ee7c5ac d03df21e88
1 changed files with 50 additions and 50 deletions
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -27,10 +27,10 @@ my $LIST_ITEM = 0;
 while (@ARGV) {
 	$_ = shift;
 	/^-l$/ && ($language = shift, next);
-  /^-p$/ && ($prefixfile = shift, next);
+	/^-p$/ && ($prefixfile = shift, next);
 	/^-q$/ && ($QUIET = 1, next);
 	/^-h$/ && ($HELP = 1, next);
-  /^-i$/ && ($LIST_ITEM = 1, next);
+	/^-i$/ && ($LIST_ITEM = 1, next);
 	/^-b$/ && ($|++, next); # no output buffering
 }

@@ -39,7 +39,7 @@ if ($HELP) {
 	print "-q: quiet mode\n";
 	print "-b: no output buffering (for use in bidirectional pipes)\n";
 	print "-p: use a custom prefix file, overriding the installed one\n";
-  print "-i: avoid splitting on list items (e.g. 1. This is the first)\n";
+	print "-i: avoid splitting on list items (e.g. 1. This is the first)\n";
 	exit;
 }
 if (!$QUIET) {
@@ -53,17 +53,17 @@ if ($language eq "yue" || $language eq "zh" || $language eq "ja") {
 }

 if ($prefixfile ne "") {
-  print STDERR "Loading non-breaking prefixes from $prefixfile\n";
+	print STDERR "Loading non-breaking prefixes from $prefixfile\n";
 } else {

-  $prefixfile = "$mydir/nonbreaking_prefix.$language";
+    $prefixfile = "$mydir/nonbreaking_prefix.$language";

-# Default to English, if we don't have a language-specific prefix file.
-  if (!(-e $prefixfile)) {
-    $prefixfile = "$mydir/nonbreaking_prefix.en";
-    print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
-    die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
-  }
+    # Default to English, if we don't have a language-specific prefix file.
+    if (!(-e $prefixfile)) {
+      $prefixfile = "$mydir/nonbreaking_prefix.en";
+      print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+      die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+    }
 }

 if (-e "$prefixfile") {
@@ -122,21 +122,20 @@ sub preprocess {

 	##### Add sentence breaks as needed #####

-  # Sentences can start with upper-case, numnbers,  or Indic characters
-  my $sentence_start = "\\p{IsUpper}0-9";
-  $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr");
-  $sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu";
-  $sentence_start .= "\\p{Block: Bengali}" if ($language eq "as" || $language eq  "bn" || $language eq "mni"); 
-  $sentence_start .= "\\p{Block: Kannada}" if $language eq "kn"; 
-  $sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml"; 
-  $sentence_start .= "\\p{Block: Oriya}" if $language eq "or"; 
-  $sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa"; 
-  $sentence_start .= "\\p{Block: Tamil}" if $language eq "ta"; 
-  $sentence_start .= "\\p{Block: Telugu}" if $language eq "te"; 
-  $sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
+	# Sentences can start with upper-case letters, digits, or (depending on the language) Indic or Hangul characters
+	my $sentence_start = "\\p{IsUpper}0-9";
+	$sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr");
+	$sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu";
+	$sentence_start .= "\\p{Block: Bengali}" if ($language eq "as" || $language eq  "bn" || $language eq "mni");
+	$sentence_start .= "\\p{Block: Kannada}" if $language eq "kn";
+	$sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml";
+	$sentence_start .= "\\p{Block: Oriya}" if $language eq "or";
+	$sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa";
+	$sentence_start .= "\\p{Block: Tamil}" if $language eq "ta";
+	$sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
+	$sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";

-
-  # we include danda and double danda (U+0964 and U+0965) as sentence split characters
+	# we include danda and double danda (U+0964 and U+0965) as sentence split characters

 	# Non-period end of sentence markers (?!) followed by sentence starters.
 	$text =~ s/([?!\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
@@ -153,15 +152,15 @@ sub preprocess {
 	# and are followed by a sentence starter punctuation and upper case.
 	$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;

-  
-#NOTE: Korean no longer handled here.
+
+	# NOTE: Korean is no longer handled here, because Korean text has spaces.
 	if ($is_cjk == 1) {
 		# Chinese uses unusual end-of-sentence markers. These are NOT
 		# followed by whitespace.  Nor is there any idea of capitalization.
 		# There does not appear to be any unicode category for full-stops
 		# in general, so list them here.  U+3002 U+FF0E U+FF1F U+FF01
 		#$text =~ s/([。．？！♪])/$1\n/g;
-    $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
+    	$text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;

 		# A normal full-stop or other Western sentence enders followed
 		# by an ideograph is an end-of-sentence, always.
@@ -175,33 +174,34 @@ sub preprocess {
 		# spaces here, so that later processing stages can tokenize readily.
 		# Note that this handles mixed latinate+CJK.
 		# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
-    # bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
-    # what we want from sentence-splitter!
+
+		# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
+    	# what we want from sentence-splitter!
 		#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
 		#$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
 		#$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
 		#$text =~ s/ +/ /g;
-	} 
+	}

-  # Urdu support
-  # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
-  if ($language eq 'ur') {
-    $text =~ s{
-            ( (?: [\.\?!\x{06d4}] | \.\.+ )
-              [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
-              )
-            \s+
-            ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
-              [\x{0600}-\x{06ff}]
-              )
-        }{$1\n$2}gx;
-  }
+	# Urdu support
+	# https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
+	if ($language eq 'ur') {
+	$text =~ s{
+	        ( (?: [\.\?!\x{06d4}] | \.\.+ )
+	          [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
+	          )
+	        \s+
+	        ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
+	          [\x{0600}-\x{06ff}]
+	          )
+	    }{$1\n$2}gx;
+	}

 	# Special punctuation cases are covered. Check all remaining periods.
 	my $word;
 	my $i;
 	my @words = split(/\h/,$text);
-  #print "NOW $text\n";
+    #print "NOW $text\n";
 	$text = "";
 	for ($i=0;$i<(scalar(@words)-1);$i++) {
    #print "Checking $words[$i] $words[$i+1]\n";
@@ -211,19 +211,19 @@ sub preprocess {
 			my $starting_punct = $2;
 			if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
 				# Not breaking;
-   #     print "NBP1 $words[$i] $words[$i+1]\n";
+                ## print "NBP1 $words[$i] $words[$i+1]\n";
 			} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
 				# Not breaking - upper case acronym
-  #      print "NBP2 $words[$i] $words[$i+1]\n";
+                #print "NBP2 $words[$i] $words[$i+1]\n";
      } elsif ($LIST_ITEM
             && ($i == 0 || substr($words[$i-1], -1) eq "\n")
             && $words[$i] =~ /^\(?(([0-9]+)|([ivx]+)|([A-Za-z]))\)?\.$/) {
-        #Maybe list item - non breaking
- #       print "NBP3 $words[$i] $words[$i+1]\n";
+        	 	# Maybe list item - non breaking
+ 				#print "NBP3 $words[$i] $words[$i+1]\n";
 			} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
 				# The next word has a bunch of initial quotes, maybe a
 				# space, then either upper case or a number
-#        print "MAYBE $words[$i] $words[$i+1]\n";
+                #print "MAYBE $words[$i] $words[$i+1]\n";
 				$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
 				# We always add a return for these, unless we have a
 				# numeric non-breaker and a number start.