diff --git a/nominatim/tokenizer/icu_token_analysis.py b/nominatim/tokenizer/icu_token_analysis.py
index 1d319b32..ee3144a8 100644
--- a/nominatim/tokenizer/icu_token_analysis.py
+++ b/nominatim/tokenizer/icu_token_analysis.py
@@ -25,5 +25,5 @@ class ICUTokenAnalysis:
         self.search = Transliterator.createFromRules("icu_search",
                                                      norm_rules + trans_rules)
 
-        self.analysis = {name: arules.create(self.to_ascii, arules.config)
+        self.analysis = {name: arules.create(self.normalizer, self.to_ascii, arules.config)
                          for name, arules in analysis_rules.items()}
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 9c25b6d7..b89180ae 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -561,7 +561,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         for name in names:
             analyzer_id = name.get_attr('analyzer')
-            norm_name = self._normalized(name.name)
+            analyzer = self.token_analysis.analysis[analyzer_id]
+            norm_name = analyzer.normalize(name.name)
             if analyzer_id is None:
                 token_id = norm_name
             else:
@@ -569,7 +570,7 @@
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
+                variants = analyzer.get_variants_ascii(norm_name)
                 if not variants:
                     continue
 
diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py
index d4eae312..3de915ba 100644
--- a/nominatim/tokenizer/token_analysis/generic.py
+++ b/nominatim/tokenizer/token_analysis/generic.py
@@ -47,10 +47,10 @@ def configure(rules, normalization_rules):
 
 ### Analysis section
 
-def create(transliterator, config):
+def create(normalizer, transliterator, config):
     """ Create a new token analysis instance for this module.
     """
-    return GenericTokenAnalysis(transliterator, config)
+    return GenericTokenAnalysis(normalizer, transliterator, config)
 
 
 class GenericTokenAnalysis:
@@ -58,7 +58,8 @@ class GenericTokenAnalysis:
        and provides the functions to apply the transformations.
    """
 
-    def __init__(self, to_ascii, config):
+    def __init__(self, norm, to_ascii, config):
+        self.norm = norm
         self.to_ascii = to_ascii
         self.variant_only = config['variant_only']
 
@@ -74,6 +75,13 @@
             self.mutations = [MutationVariantGenerator(*cfg)
                               for cfg in config['mutations']]
 
+    def normalize(self, name):
+        """ Return the normalized form of the name. This is the standard form
+            from which possible variants for the name can be derived.
+        """
+        return self.norm.transliterate(name).strip()
+
+
     def get_variants_ascii(self, norm_name):
         """ Compute the spelling variants for the given normalized name
             and transliterate the result.
diff --git a/test/python/tokenizer/token_analysis/test_generic.py b/test/python/tokenizer/token_analysis/test_generic.py
index 9b008cc5..afbd5e9b 100644
--- a/test/python/tokenizer/token_analysis/test_generic.py
+++ b/test/python/tokenizer/token_analysis/test_generic.py
@@ -32,8 +32,9 @@ def make_analyser(*variants, variant_only=False):
         rules['mode'] = 'variant-only'
     config = module.configure(rules, DEFAULT_NORMALIZATION)
     trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
 
-    return module.create(trans, config)
+    return module.create(norm, trans, config)
 
 
 def get_normalized_variants(proc, name):
@@ -45,8 +46,9 @@ def test_no_variants():
     rules = { 'analyzer': 'generic' }
     config = module.configure(rules, DEFAULT_NORMALIZATION)
     trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
 
-    proc = module.create(trans, config)
+    proc = module.create(norm, trans, config)
 
     assert get_normalized_variants(proc, '大德!') == ['dà dé']
 
diff --git a/test/python/tokenizer/token_analysis/test_generic_mutation.py b/test/python/tokenizer/token_analysis/test_generic_mutation.py
index 757f0311..abe31f6d 100644
--- a/test/python/tokenizer/token_analysis/test_generic_mutation.py
+++ b/test/python/tokenizer/token_analysis/test_generic_mutation.py
@@ -33,8 +33,9 @@ class TestMutationNoVariants:
         }
         config = module.configure(rules, DEFAULT_NORMALIZATION)
         trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+        norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
 
-        self.analysis = module.create(trans, config)
+        self.analysis = module.create(norm, trans, config)
 
 
     def variants(self, name):
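For illustration only (not part of the patch): a minimal sketch of how the reworked create() entry point and the new normalize() method fit together after this change. The rule strings below are invented stand-ins for the ICU normalization and transliteration rules that Nominatim assembles from its configuration.

    from icu import Transliterator

    from nominatim.tokenizer.token_analysis import generic as module

    # Invented example rules; the real ones come from the ICU tokenizer config.
    NORM_RULES = ":: lower ();"
    TRANS_RULES = ":: Latin (); :: ASCII ();"

    config = module.configure({'analyzer': 'generic'}, NORM_RULES)
    norm = Transliterator.createFromRules("norm", NORM_RULES)
    trans = Transliterator.createFromRules("trans", TRANS_RULES)

    # create() now receives the normalizer as well, so the analysis module
    # owns the whole pipeline: raw name -> normalized form -> ASCII variants.
    analysis = module.create(norm, trans, config)

    norm_name = analysis.normalize('Bäckerstraße')       # standard form
    variants = analysis.get_variants_ascii(norm_name)    # spelling variants

The point of threading the normalizer through create() is that callers such as LegacyICUNameAnalyzer no longer normalize names themselves; each analysis module now controls both the normalized form and the variants derived from it.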