mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-10-27 19:48:44 +03:00
move generation of normalized token form to analyzer
This gives the analyzer more flexibility in choosing the normalized form. In particular, an analyzer creating different variants can choose the variant that will be used as the canonical form.
This commit is contained in:
parent
691ec08586
commit
837d44391c
@ -25,5 +25,5 @@ class ICUTokenAnalysis:
|
||||
self.search = Transliterator.createFromRules("icu_search",
|
||||
norm_rules + trans_rules)
|
||||
|
||||
self.analysis = {name: arules.create(self.to_ascii, arules.config)
|
||||
self.analysis = {name: arules.create(self.normalizer, self.to_ascii, arules.config)
|
||||
for name, arules in analysis_rules.items()}
|
||||
|
@ -561,7 +561,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
||||
|
||||
for name in names:
|
||||
analyzer_id = name.get_attr('analyzer')
|
||||
norm_name = self._normalized(name.name)
|
||||
analyzer = self.token_analysis.analysis[analyzer_id]
|
||||
norm_name = analyzer.normalize(name.name)
|
||||
if analyzer_id is None:
|
||||
token_id = norm_name
|
||||
else:
|
||||
@ -569,7 +570,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
||||
|
||||
full, part = self._cache.names.get(token_id, (None, None))
|
||||
if full is None:
|
||||
variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
|
||||
variants = analyzer.get_variants_ascii(norm_name)
|
||||
if not variants:
|
||||
continue
|
||||
|
||||
|
@ -47,10 +47,10 @@ def configure(rules, normalization_rules):
|
||||
|
||||
### Analysis section
|
||||
|
||||
def create(transliterator, config):
|
||||
def create(normalizer, transliterator, config):
|
||||
""" Create a new token analysis instance for this module.
|
||||
"""
|
||||
return GenericTokenAnalysis(transliterator, config)
|
||||
return GenericTokenAnalysis(normalizer, transliterator, config)
|
||||
|
||||
|
||||
class GenericTokenAnalysis:
|
||||
@ -58,7 +58,8 @@ class GenericTokenAnalysis:
|
||||
and provides the functions to apply the transformations.
|
||||
"""
|
||||
|
||||
def __init__(self, to_ascii, config):
|
||||
def __init__(self, norm, to_ascii, config):
|
||||
self.norm = norm
|
||||
self.to_ascii = to_ascii
|
||||
self.variant_only = config['variant_only']
|
||||
|
||||
@ -74,6 +75,13 @@ class GenericTokenAnalysis:
|
||||
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
|
||||
|
||||
|
||||
def normalize(self, name):
|
||||
""" Return the normalized form of the name. This is the standard form
|
||||
from which possible variants for the name can be derived.
|
||||
"""
|
||||
return self.norm.transliterate(name).strip()
|
||||
|
||||
|
||||
def get_variants_ascii(self, norm_name):
|
||||
""" Compute the spelling variants for the given normalized name
|
||||
and transliterate the result.
|
||||
|
@ -32,8 +32,9 @@ def make_analyser(*variants, variant_only=False):
|
||||
rules['mode'] = 'variant-only'
|
||||
config = module.configure(rules, DEFAULT_NORMALIZATION)
|
||||
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
||||
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||
|
||||
return module.create(trans, config)
|
||||
return module.create(norm, trans, config)
|
||||
|
||||
|
||||
def get_normalized_variants(proc, name):
|
||||
@ -45,8 +46,9 @@ def test_no_variants():
|
||||
rules = { 'analyzer': 'generic' }
|
||||
config = module.configure(rules, DEFAULT_NORMALIZATION)
|
||||
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
||||
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||
|
||||
proc = module.create(trans, config)
|
||||
proc = module.create(norm, trans, config)
|
||||
|
||||
assert get_normalized_variants(proc, '大德!') == ['dà dé']
|
||||
|
||||
|
@ -33,8 +33,9 @@ class TestMutationNoVariants:
|
||||
}
|
||||
config = module.configure(rules, DEFAULT_NORMALIZATION)
|
||||
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
|
||||
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
|
||||
|
||||
self.analysis = module.create(trans, config)
|
||||
self.analysis = module.create(norm, trans, config)
|
||||
|
||||
|
||||
def variants(self, name):
|
||||
|
Loading…
Reference in New Issue
Block a user