Proper spacing

This commit is contained in:
alvations 2020-01-06 11:43:31 +08:00 committed by GitHub
parent f46ee7c5ac
commit d03df21e88
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -135,7 +135,6 @@ sub preprocess {
$sentence_start .= "\\p{Block: Telugu}" if $language eq "te"; $sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
$sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko"; $sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
# we include danda and double danda (U+0964 and U+0965) as sentence split characters # we include danda and double danda (U+0964 and U+0965) as sentence split characters
# Non-period end of sentence markers (?!) followed by sentence starters. # Non-period end of sentence markers (?!) followed by sentence starters.
@@ -154,7 +153,7 @@ sub preprocess {
$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g; $text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
#NOTE: Korean no longer handled here. #NOTE: Korean no longer handled here, cos Korean has spaces.
if ($is_cjk == 1) { if ($is_cjk == 1) {
# Chinese uses unusual end-of-sentence markers. These are NOT # Chinese uses unusual end-of-sentence markers. These are NOT
# followed by whitespace. Nor is there any idea of capitalization. # followed by whitespace. Nor is there any idea of capitalization.
@@ -175,6 +174,7 @@ sub preprocess {
# spaces here, so that later processing stages can tokenize readily. # spaces here, so that later processing stages can tokenize readily.
# Note that this handles mixed latinate+CJK. # Note that this handles mixed latinate+CJK.
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ? # TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not # bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
# what we want from sentence-splitter! # what we want from sentence-splitter!
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g; #$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
@@ -211,7 +211,7 @@ sub preprocess {
my $starting_punct = $2; my $starting_punct = $2;
if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) { if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
# Not breaking; # Not breaking;
# print "NBP1 $words[$i] $words[$i+1]\n"; ## print "NBP1 $words[$i] $words[$i+1]\n";
} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) { } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
# Not breaking - upper case acronym # Not breaking - upper case acronym
#print "NBP2 $words[$i] $words[$i+1]\n"; #print "NBP2 $words[$i] $words[$i+1]\n";