diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 03febea63..206b7ebe9 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -24,6 +24,7 @@ my $QUIET = 0; my $HELP = 0; my $LIST_ITEM = 0; my $NOP = 0; +my $KEEP_LINES = 0; while (@ARGV) { $_ = shift; @@ -33,6 +34,7 @@ while (@ARGV) { /^-h$/ && ($HELP = 1, next); /^-i$/ && ($LIST_ITEM = 1, next); /^-n$/ && ($NOP = 1, next); + /^-k$/ && ($KEEP_LINES = 1, next); /^-b$/ && ($|++, next); # no output buffering } @@ -43,6 +45,7 @@ if ($HELP) { print "-p: use a custom prefix file, overriding the installed one\n"; print "-i: avoid splitting on list items (e.g. 1. This is the first)\n"; print "-n: do not emit <P>
after paragraphs\n"; + print "-k: keep existing line boundaries\n"; exit; } if (!$QUIET) { @@ -89,13 +92,14 @@ if (-e "$prefixfile") { my $text = ""; while (<STDIN>) { chomp; - if (/^<.+>$/ || /^\s*$/) { + if ($KEEP_LINES) { + &do_it_for($_,""); + } elsif (/^<.+>$/ || /^\s*$/) { # Time to process this block; we've hit a blank or <p>
&do_it_for($text, $_); print "<P>
\n" if $NOP == 0 && (/^\s*$/ && $text); ## If we have text followed by <P>
$text = ""; - } - else { + } else { # Append the text, with a space. $text .= $_. " "; } @@ -163,7 +167,7 @@ sub preprocess { # There does not appear to be any unicode category for full-stops # in general, so list them here. U+3002 U+FF0E U+FF1F U+FF01 #$text =~ s/([。.?!♪])/$1\n/g; - $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g; + $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g; # A normal full-stop or other Western sentence enders followed # by an ideograph is an end-of-sentence, always. @@ -179,7 +183,7 @@ sub preprocess { # TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ? # bhaddow - Comment this out since it adds white-space between Chinese characters. This is not - # what we want from sentence-splitter! + # what we want from sentence-splitter! #$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g; #$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g; #$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g; @@ -204,10 +208,10 @@ sub preprocess { my $word; my $i; my @words = split(/\h/,$text); - #print "NOW $text\n"; + #print "NOW $text\n"; $text = ""; for ($i=0;$i<(scalar(@words)-1);$i++) { - #print "Checking $words[$i] $words[$i+1]\n"; + #print "Checking $words[$i] $words[$i+1]\n"; if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) { # Check if $1 is a known honorific and $2 is empty, never break. my $prefix = $1;