diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 03febea63..206b7ebe9 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -24,6 +24,7 @@ my $QUIET = 0; my $HELP = 0; my $LIST_ITEM = 0; my $NOP = 0; +my $KEEP_LINES = 0; while (@ARGV) { $_ = shift; @@ -33,6 +34,7 @@ while (@ARGV) { /^-h$/ && ($HELP = 1, next); /^-i$/ && ($LIST_ITEM = 1, next); /^-n$/ && ($NOP = 1, next); + /^-k$/ && ($KEEP_LINES = 1, next); /^-b$/ && ($|++, next); # no output buffering } @@ -43,6 +45,7 @@ if ($HELP) { print "-p: use a custom prefix file, overriding the installed one\n"; print "-i: avoid splitting on list items (e.g. 1. This is the first)\n"; print "-n: do not emit
after paragraphs\n";
+ print "-k: keep existing line boundaries\n";
exit;
}
if (!$QUIET) {
@@ -89,13 +92,14 @@ if (-e "$prefixfile") {
my $text = "";
while (
&do_it_for($text, $_);
print " \n" if $NOP == 0 && (/^\s*$/ && $text); ## If we have text followed by
$text = "";
- }
- else {
+ } else {
# Append the text, with a space.
$text .= $_. " ";
}
@@ -163,7 +167,7 @@ sub preprocess {
# There does not appear to be any unicode category for full-stops
# in general, so list them here. U+3002 U+FF0E U+FF1F U+FF01
#$text =~ s/([。.?!♪])/$1\n/g;
- $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
+ $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
# A normal full-stop or other Western sentence enders followed
# by an ideograph is an end-of-sentence, always.
@@ -179,7 +183,7 @@ sub preprocess {
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
- # what we want from sentence-splitter!
+ # what we want from sentence-splitter!
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
#$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
#$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
@@ -204,10 +208,10 @@ sub preprocess {
my $word;
my $i;
my @words = split(/\h/,$text);
- #print "NOW $text\n";
+ #print "NOW $text\n";
$text = "";
for ($i=0;$i<(scalar(@words)-1);$i++) {
- #print "Checking $words[$i] $words[$i+1]\n";
+ #print "Checking $words[$i] $words[$i+1]\n";
if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
# Check if $1 is a known honorific and $2 is empty, never break.
my $prefix = $1;