From 63dc4b39bc6bc0bf5a95d0c1a8298f5349637a9e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 28 Apr 2022 17:20:56 +0200 Subject: [PATCH] ICU: better letter identification in normalization The Letter class does not include non-spacing marks that can also have a consonant or vowel meaning, especially in Indian languages. Use the alnum property instead which includes them all. Also include the vowel-canceling Virama, which is not a letter by itself but changes the transliteration. --- settings/icu_tokenizer.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index bebd49e9..cd9c0d6d 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -8,8 +8,8 @@ normalization: - "ª > a" - "º > o" - "[[:Punctuation:][:Symbol:]\u02bc] > ' '" - - "ß > 'ss'" # German szet is unimbigiously equal to double ss - - "[^[:Letter:] [:Number:] [:Space:]] >" + - "ß > 'ss'" # German szet is unambiguously equal to double ss + - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >" - "[:Lm:] >" - ":: [[:Number:]] Latin ()" - ":: [[:Number:]] Ascii ();"