Enable custom non-breaking prefixes

This commit is contained in:
Barry Haddow 2019-09-30 16:52:24 +01:00
parent 01a8ec41e8
commit 257d7e5e66

View File

@ -18,6 +18,7 @@ my $mydir = "$RealBin/../../share/nonbreaking_prefixes";
# File-level state and command-line option defaults.
my %NONBREAKING_PREFIX = ();   # prefix => 1 or 2, filled from the prefix file
my $language = "en";           # language code, set with -l
my $prefixfile = "";           # custom prefix file, set with -p ("" = use installed file)
my $is_cjk = 0;                # set to 1 for "yue" and "zh"
my $QUIET = 0;                 # set with -q
my $HELP = 0;                  # set with -h
@ -25,15 +26,17 @@ my $HELP = 0;
# Parse command-line switches, consuming @ARGV from the front.
#   -l LANG  language code (stored in $language)
#   -p FILE  custom non-breaking prefix file (stored in $prefixfile)
#   -q       quiet mode
#   -h       print usage and exit
#   -b       disable output buffering
# Arguments that match none of the switches are dropped without comment.
while (@ARGV) {
    # Exactly one shift per iteration; the switch's value (if any) is taken
    # by the second shift inside the matching clause.
    $_ = shift;
    /^-l$/ && ($language = shift, next);
    /^-p$/ && ($prefixfile = shift, next);
    /^-q$/ && ($QUIET = 1, next);
    /^-h$/ && ($HELP = 1, next);
    /^-b$/ && ($|++, next); # no output buffering
}
# Print the usage summary to STDOUT and exit when -h was given.
if ($HELP) {
    print "Usage ./split-sentences.perl (-l [en|de|...]) [-p prefix-file] [-q] [-b] < textfile > splitfile\n";
    print "-q: quiet mode\n";
    print "-b: no output buffering (for use in bidirectional pipes)\n";
    print "-p: use a custom prefix file, overriding the installed one\n";
    exit;
}
if (!$QUIET) { if (!$QUIET) {
@ -46,13 +49,18 @@ if ($language eq "yue" || $language eq "zh") {
$is_cjk = 1; $is_cjk = 1;
} }
# Decide which non-breaking prefix file to load.
# A file given with -p takes precedence; otherwise the installed file for
# $language is used, falling back to the English file when that is missing.
if ($prefixfile ne "") {
    print STDERR "Loading non-breaking prefixes from $prefixfile\n";
} else {
    # Assign to the file-level $prefixfile rather than declaring a new one.
    # A "my" here would be scoped to this else-block only, so the outer
    # variable would still be "" when its existence is tested below and the
    # default prefixes would never be read.
    $prefixfile = "$mydir/nonbreaking_prefix.$language";

    # Default to English, if we don't have a language-specific prefix file.
    if (!(-e $prefixfile)) {
        $prefixfile = "$mydir/nonbreaking_prefix.en";
        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
    }
}
if (-e "$prefixfile") { if (-e "$prefixfile") {
@ -65,6 +73,7 @@ if (-e "$prefixfile") {
$NONBREAKING_PREFIX{$1} = 2; $NONBREAKING_PREFIX{$1} = 2;
} else { } else {
$NONBREAKING_PREFIX{$item} = 1; $NONBREAKING_PREFIX{$item} = 1;
print STDERR "nbp: $item\n";
} }
} }
} }