mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-09-20 15:37:49 +03:00
extend ICU config to accommodate multiple analysers
Adds parsing of multiple variant lists from the configuration. Every entry except one must have a unique 'id' parameter to distinguish the entries. The entry without an id is considered the default. Currently only the list without an id is used for analysis.
This commit is contained in:
parent
5a36559834
commit
52847b61a3
@ -43,12 +43,10 @@ class ICURuleLoader:
|
||||
rules = config.load_sub_configuration('icu_tokenizer.yaml',
|
||||
config='TOKENIZER_CONFIG')
|
||||
|
||||
self.variants = set()
|
||||
|
||||
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
||||
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
|
||||
self.analysis_rules = self._get_section(rules, 'variants')
|
||||
self._parse_variant_list()
|
||||
self.analysis_rules = self._get_section(rules, 'token-analysis')
|
||||
self._setup_analysis()
|
||||
|
||||
# Load optional sanitizer rule set.
|
||||
self.sanitizer_rules = rules.get('sanitizers', [])
|
||||
@ -61,7 +59,7 @@ class ICURuleLoader:
|
||||
self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
|
||||
self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
|
||||
self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
|
||||
self._parse_variant_list()
|
||||
self._setup_analysis()
|
||||
|
||||
|
||||
def save_config_to_db(self, conn):
|
||||
@ -82,9 +80,8 @@ class ICURuleLoader:
|
||||
def make_token_analysis(self):
|
||||
""" Create a token analyser from the reviouly loaded rules.
|
||||
"""
|
||||
return ICUNameProcessor(self.normalization_rules,
|
||||
self.transliteration_rules,
|
||||
self.variants)
|
||||
return self.analysis[None].create(self.normalization_rules,
|
||||
self.transliteration_rules)
|
||||
|
||||
|
||||
def get_search_rules(self):
|
||||
@ -99,23 +96,37 @@ class ICURuleLoader:
|
||||
rules.write(self.transliteration_rules)
|
||||
return rules.getvalue()
|
||||
|
||||
|
||||
def get_normalization_rules(self):
|
||||
""" Return rules for normalisation of a term.
|
||||
"""
|
||||
return self.normalization_rules
|
||||
|
||||
|
||||
def get_transliteration_rules(self):
|
||||
""" Return the rules for converting a string into its asciii representation.
|
||||
"""
|
||||
return self.transliteration_rules
|
||||
|
||||
def get_replacement_pairs(self):
|
||||
""" Return the list of possible compound decompositions with
|
||||
application of abbreviations included.
|
||||
The result is a list of pairs: the first item is the sequence to
|
||||
replace, the second is a list of replacements.
|
||||
|
||||
def _setup_analysis(self):
|
||||
""" Process the rules used for creating the various token analyzers.
|
||||
"""
|
||||
return self.variants
|
||||
self.analysis = {}
|
||||
|
||||
if not isinstance(self.analysis_rules, list):
|
||||
raise UsageError("Configuration section 'token-analysis' must be a list.")
|
||||
|
||||
for section in self.analysis_rules:
|
||||
name = section.get('id', None)
|
||||
if name in self.analysis:
|
||||
if name is None:
|
||||
LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
|
||||
else:
|
||||
LOG.fatal("ICU tokenizer configuration has two token "
|
||||
"analyzers with id '%s'.", name)
|
||||
UsageError("Syntax error in ICU tokenizer config.")
|
||||
self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
|
||||
|
||||
|
||||
@staticmethod
|
||||
@ -145,17 +156,32 @@ class ICURuleLoader:
|
||||
return ';'.join(flatten_config_list(content, section)) + ';'
|
||||
|
||||
|
||||
def _parse_variant_list(self):
|
||||
rules = self.analysis_rules
|
||||
class TokenAnalyzerRule:
|
||||
""" Factory for a single analysis module. The class saves the configuration
|
||||
and creates a new token analyzer on request.
|
||||
"""
|
||||
|
||||
self.variants.clear()
|
||||
def __init__(self, rules, normalization_rules):
|
||||
self._parse_variant_list(rules.get('variants'), normalization_rules)
|
||||
|
||||
|
||||
def create(self, normalization_rules, transliteration_rules):
|
||||
""" Create an analyzer from the given rules.
|
||||
"""
|
||||
return ICUNameProcessor(normalization_rules,
|
||||
transliteration_rules,
|
||||
self.variants)
|
||||
|
||||
|
||||
def _parse_variant_list(self, rules, normalization_rules):
|
||||
self.variants = set()
|
||||
|
||||
if not rules:
|
||||
return
|
||||
|
||||
rules = flatten_config_list(rules, 'variants')
|
||||
|
||||
vmaker = _VariantMaker(self.normalization_rules)
|
||||
vmaker = _VariantMaker(normalization_rules)
|
||||
|
||||
properties = []
|
||||
for section in rules:
|
||||
|
@ -27,34 +27,35 @@ transliteration:
|
||||
sanitizers:
|
||||
- step: split-name-list
|
||||
- step: strip-brace-terms
|
||||
variants:
|
||||
- !include icu-rules/variants-bg.yaml
|
||||
- !include icu-rules/variants-ca.yaml
|
||||
- !include icu-rules/variants-cs.yaml
|
||||
- !include icu-rules/variants-da.yaml
|
||||
- !include icu-rules/variants-de.yaml
|
||||
- !include icu-rules/variants-el.yaml
|
||||
- !include icu-rules/variants-en.yaml
|
||||
- !include icu-rules/variants-es.yaml
|
||||
- !include icu-rules/variants-et.yaml
|
||||
- !include icu-rules/variants-eu.yaml
|
||||
- !include icu-rules/variants-fi.yaml
|
||||
- !include icu-rules/variants-fr.yaml
|
||||
- !include icu-rules/variants-gl.yaml
|
||||
- !include icu-rules/variants-hu.yaml
|
||||
- !include icu-rules/variants-it.yaml
|
||||
- !include icu-rules/variants-ja.yaml
|
||||
- !include icu-rules/variants-mg.yaml
|
||||
- !include icu-rules/variants-ms.yaml
|
||||
- !include icu-rules/variants-nl.yaml
|
||||
- !include icu-rules/variants-no.yaml
|
||||
- !include icu-rules/variants-pl.yaml
|
||||
- !include icu-rules/variants-pt.yaml
|
||||
- !include icu-rules/variants-ro.yaml
|
||||
- !include icu-rules/variants-ru.yaml
|
||||
- !include icu-rules/variants-sk.yaml
|
||||
- !include icu-rules/variants-sl.yaml
|
||||
- !include icu-rules/variants-sv.yaml
|
||||
- !include icu-rules/variants-tr.yaml
|
||||
- !include icu-rules/variants-uk.yaml
|
||||
- !include icu-rules/variants-vi.yaml
|
||||
token-analysis:
|
||||
- variants:
|
||||
- !include icu-rules/variants-bg.yaml
|
||||
- !include icu-rules/variants-ca.yaml
|
||||
- !include icu-rules/variants-cs.yaml
|
||||
- !include icu-rules/variants-da.yaml
|
||||
- !include icu-rules/variants-de.yaml
|
||||
- !include icu-rules/variants-el.yaml
|
||||
- !include icu-rules/variants-en.yaml
|
||||
- !include icu-rules/variants-es.yaml
|
||||
- !include icu-rules/variants-et.yaml
|
||||
- !include icu-rules/variants-eu.yaml
|
||||
- !include icu-rules/variants-fi.yaml
|
||||
- !include icu-rules/variants-fr.yaml
|
||||
- !include icu-rules/variants-gl.yaml
|
||||
- !include icu-rules/variants-hu.yaml
|
||||
- !include icu-rules/variants-it.yaml
|
||||
- !include icu-rules/variants-ja.yaml
|
||||
- !include icu-rules/variants-mg.yaml
|
||||
- !include icu-rules/variants-ms.yaml
|
||||
- !include icu-rules/variants-nl.yaml
|
||||
- !include icu-rules/variants-no.yaml
|
||||
- !include icu-rules/variants-pl.yaml
|
||||
- !include icu-rules/variants-pt.yaml
|
||||
- !include icu-rules/variants-ro.yaml
|
||||
- !include icu-rules/variants-ru.yaml
|
||||
- !include icu-rules/variants-sk.yaml
|
||||
- !include icu-rules/variants-sl.yaml
|
||||
- !include icu-rules/variants-sv.yaml
|
||||
- !include icu-rules/variants-tr.yaml
|
||||
- !include icu-rules/variants-uk.yaml
|
||||
- !include icu-rules/variants-vi.yaml
|
||||
|
@ -69,10 +69,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
|
||||
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
|
||||
variants=('~gasse -> gasse', 'street => st', ),
|
||||
sanitizers=[]):
|
||||
cfgstr = {'normalization' : list(norm),
|
||||
'sanitizers' : sanitizers,
|
||||
'transliteration' : list(trans),
|
||||
'variants' : [ {'words': list(variants)}]}
|
||||
cfgstr = {'normalization': list(norm),
|
||||
'sanitizers': sanitizers,
|
||||
'transliteration': list(trans),
|
||||
'token-analysis': [{'variants': [{'words': list(variants)}]}]}
|
||||
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
|
||||
tok.loader = ICURuleLoader(test_config)
|
||||
|
||||
|
@ -28,10 +28,10 @@ def cfgfile(def_config, tmp_path):
|
||||
- ":: Latin ()"
|
||||
- "'🜵' > ' '"
|
||||
""")
|
||||
content += "variants:\n - words:\n"
|
||||
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
||||
content += "token-analysis:\n - variants:\n - words:\n"
|
||||
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
||||
for k, v in kwargs:
|
||||
content += " {}: {}\n".format(k, v)
|
||||
content += " {}: {}\n".format(k, v)
|
||||
(project_dir / 'icu_tokenizer.yaml').write_text(content)
|
||||
|
||||
return def_config
|
||||
|
@ -34,8 +34,8 @@ def cfgrules(test_config):
|
||||
- ":: Latin ()"
|
||||
- "[[:Punctuation:][:Space:]]+ > ' '"
|
||||
""")
|
||||
content += "variants:\n - words:\n"
|
||||
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
||||
content += "token-analysis:\n - variants:\n - words:\n"
|
||||
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
||||
for k, v in kwargs:
|
||||
content += " {}: {}\n".format(k, v)
|
||||
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
|
||||
@ -49,20 +49,20 @@ def test_empty_rule_set(test_config):
|
||||
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
|
||||
normalization:
|
||||
transliteration:
|
||||
variants:
|
||||
token-analysis:
|
||||
- variants:
|
||||
"""))
|
||||
|
||||
rules = ICURuleLoader(test_config)
|
||||
assert rules.get_search_rules() == ''
|
||||
assert rules.get_normalization_rules() == ''
|
||||
assert rules.get_transliteration_rules() == ''
|
||||
assert list(rules.get_replacement_pairs()) == []
|
||||
|
||||
CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
|
||||
CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
|
||||
|
||||
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
|
||||
def test_missing_section(section, test_config):
|
||||
rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
|
||||
rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
|
||||
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
|
||||
|
||||
with pytest.raises(UsageError):
|
||||
@ -107,7 +107,8 @@ def test_transliteration_rules_from_file(test_config):
|
||||
transliteration:
|
||||
- "'ax' > 'b'"
|
||||
- !include transliteration.yaml
|
||||
variants:
|
||||
token-analysis:
|
||||
- variants:
|
||||
"""))
|
||||
transpath = test_config.project_dir / ('transliteration.yaml')
|
||||
transpath.write_text('- "x > y"')
|
||||
@ -127,7 +128,7 @@ class TestGetReplacements:
|
||||
|
||||
def get_replacements(self, *variants):
|
||||
loader = ICURuleLoader(self.cfgrules(*variants))
|
||||
rules = loader.get_replacement_pairs()
|
||||
rules = loader.analysis[None].variants
|
||||
|
||||
return set((v.source, v.replacement) for v in rules)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user