Sentence splitter: add -k option to keep existing line boundaries

This commit is contained in:
Kenneth Heafield 2020-03-19 15:44:41 +00:00
parent 0a892749bc
commit 89b9b4fba2

View File

@@ -24,6 +24,7 @@ my $QUIET = 0;
my $HELP = 0;
my $LIST_ITEM = 0;
my $NOP = 0;
my $KEEP_LINES = 0;
while (@ARGV) {
$_ = shift;
@@ -33,6 +34,7 @@ while (@ARGV) {
/^-h$/ && ($HELP = 1, next);
/^-i$/ && ($LIST_ITEM = 1, next);
/^-n$/ && ($NOP = 1, next);
/^-k$/ && ($KEEP_LINES = 1, next);
/^-b$/ && ($|++, next); # no output buffering
}
@@ -43,6 +45,7 @@ if ($HELP) {
print "-p: use a custom prefix file, overriding the installed one\n";
print "-i: avoid splitting on list items (e.g. 1. This is the first)\n";
print "-n: do not emit <P> after paragraphs\n";
print "-k: keep existing line boundaries\n";
exit;
}
if (!$QUIET) {
@@ -89,13 +92,14 @@ if (-e "$prefixfile") {
my $text = "";
while (<STDIN>) {
chomp;
if (/^<.+>$/ || /^\s*$/) {
if ($KEEP_LINES) {
&do_it_for($_,"");
} elsif (/^<.+>$/ || /^\s*$/) {
# Time to process this block; we've hit a blank or <p>
&do_it_for($text, $_);
print "<P>\n" if $NOP == 0 && (/^\s*$/ && $text); ## If we have text followed by <P>
$text = "";
}
else {
} else {
# Append the text, with a space.
$text .= $_. " ";
}