add fi/sv-specific colon handling in tokenizer.perl

This commit is contained in:
Scherrer Yves 2018-02-14 10:27:46 +02:00
parent 5be48ce9db
commit 4a7f16b366

View File

@ -257,8 +257,17 @@ sub tokenize
$text =~ s/^ //g;
$text =~ s/ $//g;
# seperate out all "other" special characters
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
# separate out all "other" special characters
if (($language eq "fi") or ($language eq "sv")) {
# in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character:
# USA:n, 20:een, EU:ssa, USA:s, S:t
$text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g;
# if a colon is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
}
else {
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
}
# aggressive hyphen splitting
if ($AGGRESSIVE)