Merge pull request #217 from moses-smt/alvations-patch-2

Proper spacing for sent-split perl script
This commit is contained in:
Hieu Hoang 2020-01-05 19:46:25 -08:00 committed by GitHub
commit e4a52f14e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -58,7 +58,7 @@ if ($prefixfile ne "") {
$prefixfile = "$mydir/nonbreaking_prefix.$language";
# Default to English, if we don't have a language-specific prefix file.
# Default to English, if we don't have a language-specific prefix file.
if (!(-e $prefixfile)) {
$prefixfile = "$mydir/nonbreaking_prefix.en";
print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
@ -135,7 +135,6 @@ sub preprocess {
$sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
$sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
# we include danda and double danda (U+0964 and U+0965) as sentence split characters
# Non-period end of sentence markers (?!) followed by sentence starters.
@ -154,7 +153,7 @@ sub preprocess {
$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
#NOTE: Korean no longer handled here.
#NOTE: Korean is no longer handled here, because Korean text uses spaces.
if ($is_cjk == 1) {
# Chinese uses unusual end-of-sentence markers. These are NOT
# followed by whitespace. Nor is there any idea of capitalization.
@ -175,6 +174,7 @@ sub preprocess {
# spaces here, so that later processing stages can tokenize readily.
# Note that this handles mixed latinate+CJK.
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
# bhaddow - Commented out since it adds whitespace between Chinese characters. This is not
# what we want from a sentence splitter!
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
@ -211,19 +211,19 @@ sub preprocess {
my $starting_punct = $2;
if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
# Not breaking;
# print "NBP1 $words[$i] $words[$i+1]\n";
## print "NBP1 $words[$i] $words[$i+1]\n";
} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
# Not breaking - upper case acronym
# print "NBP2 $words[$i] $words[$i+1]\n";
#print "NBP2 $words[$i] $words[$i+1]\n";
} elsif ($LIST_ITEM
&& ($i == 0 || substr($words[$i-1], -1) eq "\n")
&& $words[$i] =~ /^\(?(([0-9]+)|([ivx]+)|([A-Za-z]))\)?\.$/) {
#Maybe list item - non breaking
# print "NBP3 $words[$i] $words[$i+1]\n";
# Maybe list item - non breaking
#print "NBP3 $words[$i] $words[$i+1]\n";
} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
# The next word has a bunch of initial quotes, maybe a
# space, then either upper case or a number
# print "MAYBE $words[$i] $words[$i+1]\n";
#print "MAYBE $words[$i] $words[$i+1]\n";
$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
# We always add a return for these, unless we have a
# numeric non-breaker and a number start.