sentence splitter -k option to keep line boundaries

This commit is contained in:
Kenneth Heafield 2020-03-19 15:44:41 +00:00
parent 0a892749bc
commit 89b9b4fba2

View File

@ -24,6 +24,7 @@ my $QUIET = 0;
my $HELP = 0;
my $LIST_ITEM = 0;
my $NOP = 0;
my $KEEP_LINES = 0;
while (@ARGV) {
$_ = shift;
@ -33,6 +34,7 @@ while (@ARGV) {
/^-h$/ && ($HELP = 1, next);
/^-i$/ && ($LIST_ITEM = 1, next);
/^-n$/ && ($NOP = 1, next);
/^-k$/ && ($KEEP_LINES = 1, next);
/^-b$/ && ($|++, next); # no output buffering
}
@ -43,6 +45,7 @@ if ($HELP) {
print "-p: use a custom prefix file, overriding the installed one\n";
print "-i: avoid splitting on list items (e.g. 1. This is the first)\n";
print "-n: do not emit <P> after paragraphs\n";
print "-k: keep existing line boundaries\n";
exit;
}
if (!$QUIET) {
@ -89,13 +92,14 @@ if (-e "$prefixfile") {
my $text = "";
while (<STDIN>) {
chomp;
if (/^<.+>$/ || /^\s*$/) {
if ($KEEP_LINES) {
&do_it_for($_,"");
} elsif (/^<.+>$/ || /^\s*$/) {
# Time to process this block; we've hit a blank or <p>
&do_it_for($text, $_);
print "<P>\n" if $NOP == 0 && (/^\s*$/ && $text); ## If we have text followed by <P>
$text = "";
}
else {
} else {
# Append the text, with a space.
$text .= $_. " ";
}
@ -163,7 +167,7 @@ sub preprocess {
# There does not appear to be any unicode category for full-stops
# in general, so list them here. U+3002 U+FF0E U+FF1F U+FF01
#$text =~ s/([。.?!♪])/$1\n/g;
$text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
$text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
# A normal full-stop or other Western sentence enders followed
# by an ideograph is an end-of-sentence, always.
@ -179,7 +183,7 @@ sub preprocess {
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
# what we want from sentence-splitter!
# what we want from sentence-splitter!
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
#$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
#$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
@ -204,10 +208,10 @@ sub preprocess {
my $word;
my $i;
my @words = split(/\h/,$text);
#print "NOW $text\n";
#print "NOW $text\n";
$text = "";
for ($i=0;$i<(scalar(@words)-1);$i++) {
#print "Checking $words[$i] $words[$i+1]\n";
#print "Checking $words[$i] $words[$i+1]\n";
if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
# Check if $1 is a known honorific and $2 is empty, never break.
my $prefix = $1;