From 555829a771cd897bb807f495a95737953a7ca9a3 Mon Sep 17 00:00:00 2001 From: alvations Date: Tue, 1 Oct 2019 05:27:06 +0800 Subject: [PATCH 1/2] Undoing 05788925812f0d3265e355565cbb1701a0ad7510 Causes abbreviations to not split when ending with a fullstop. E.g. > The restructuring of IBM was essential to enable it organisationally to take up the responsibilities entrusted in the role with the recent changes in the policy and legislations, revised charter of function of IBM and the new activities and initiatives undertaken by IBM. IBM is also engaged in handholding the States for auction of mineral blocks for greater transparency in allocation of mineral concessions. --- scripts/ems/support/split-sentences.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 2c2319a12..f3494bc88 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -193,7 +193,7 @@ sub preprocess { my $starting_punct = $2; if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) { # Not breaking; - } elsif ($words[$i] =~ /(\.?)[\p{IsUpper}\-]+(\.+)$/) { + } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) { # Not breaking - upper case acronym } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) { # The next word has a bunch of initial quotes, maybe a From 5d3331b922d4443b86a74960c7ebb7fea4ce7d50 Mon Sep 17 00:00:00 2001 From: Kevin Canwen Xu Date: Mon, 14 Oct 2019 16:33:58 +0800 Subject: [PATCH 2/2] Update replace-unicode-punctuation.perl --- scripts/tokenizer/replace-unicode-punctuation.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl index b0bc811fe..faed2cd9d 100755 --- a/scripts/tokenizer/replace-unicode-punctuation.perl +++ 
b/scripts/tokenizer/replace-unicode-punctuation.perl @@ -29,7 +29,7 @@ while(<STDIN>) { s/！/\!/g; s/（/\(/g; s/；/;/g; - s/１/"/g; + s/１/1/g; s/」/"/g; s/「/"/g; s/０/0/g;