extend ICU config to accommodate multiple analysers

Adds parsing of multiple variant lists from the configuration.
Each entry must have a unique 'id' parameter to distinguish
the entries, except for at most one entry which may omit it.
The entry without an id is considered the default. Currently
only the list without an id is used for analysis.
This commit is contained in:
Sarah Hoffmann 2021-10-04 16:40:28 +02:00
parent 5a36559834
commit 52847b61a3
5 changed files with 92 additions and 64 deletions
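
For illustration, the new section lets a project's icu_tokenizer.yaml carry several analyser entries side by side. A minimal sketch; the 'custom' id and its include file are made-up examples, not part of this commit:

token-analysis:
    # Default analyser: the entry without an 'id'. Only this one
    # is used for analysis at the moment.
    - variants:
          - !include icu-rules/variants-de.yaml
    # Additional analyser, distinguished by a unique 'id' (hypothetical).
    - id: custom
      variants:
          - !include my-custom-variants.yaml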

View File

@@ -43,12 +43,10 @@ class ICURuleLoader:
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
-        self.variants = set()
 
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self.analysis_rules = self._get_section(rules, 'variants')
-        self._parse_variant_list()
+        self.analysis_rules = self._get_section(rules, 'token-analysis')
+        self._setup_analysis()
 
         # Load optional sanitizer rule set.
         self.sanitizer_rules = rules.get('sanitizers', [])
@@ -61,7 +59,7 @@ class ICURuleLoader:
         self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
         self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
         self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
-        self._parse_variant_list()
+        self._setup_analysis()
 
     def save_config_to_db(self, conn):
@@ -82,9 +80,8 @@ class ICURuleLoader:
     def make_token_analysis(self):
        """ Create a token analyser from the previously loaded rules.
        """
-        return ICUNameProcessor(self.normalization_rules,
-                                self.transliteration_rules,
-                                self.variants)
+        return self.analysis[None].create(self.normalization_rules,
+                                          self.transliteration_rules)
 
     def get_search_rules(self):
@@ -99,23 +96,37 @@ class ICURuleLoader:
         rules.write(self.transliteration_rules)
         return rules.getvalue()
 
     def get_normalization_rules(self):
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
     def get_transliteration_rules(self):
         """ Return the rules for converting a string into its ASCII representation.
         """
         return self.transliteration_rules
 
-    def get_replacement_pairs(self):
-        """ Return the list of possible compound decompositions with
-            application of abbreviations included.
-            The result is a list of pairs: the first item is the sequence to
-            replace, the second is a list of replacements.
+    def _setup_analysis(self):
+        """ Process the rules used for creating the various token analyzers.
         """
-        return self.variants
+        self.analysis = {}
+
+        if not isinstance(self.analysis_rules, list):
+            raise UsageError("Configuration section 'token-analysis' must be a list.")
+
+        for section in self.analysis_rules:
+            name = section.get('id', None)
+            if name in self.analysis:
+                if name is None:
+                    LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
+                else:
+                    LOG.fatal("ICU tokenizer configuration has two token "
+                              "analyzers with id '%s'.", name)
+                raise UsageError("Syntax error in ICU tokenizer config.")
+            self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
 
     @staticmethod
@@ -145,17 +156,32 @@ class ICURuleLoader:
         return ';'.join(flatten_config_list(content, section)) + ';'
 
-    def _parse_variant_list(self):
-        rules = self.analysis_rules
 
+class TokenAnalyzerRule:
+    """ Factory for a single analysis module. The class saves the configuration
+        and creates a new token analyzer on request.
+    """
-        self.variants.clear()
 
+    def __init__(self, rules, normalization_rules):
+        self._parse_variant_list(rules.get('variants'), normalization_rules)
+
+    def create(self, normalization_rules, transliteration_rules):
+        """ Create an analyzer from the given rules.
+        """
+        return ICUNameProcessor(normalization_rules,
+                                transliteration_rules,
+                                self.variants)
+
+    def _parse_variant_list(self, rules, normalization_rules):
+        self.variants = set()
 
         if not rules:
             return
 
         rules = flatten_config_list(rules, 'variants')
 
-        vmaker = _VariantMaker(self.normalization_rules)
+        vmaker = _VariantMaker(normalization_rules)
 
         properties = []
         for section in rules:
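
_setup_analysis() above rejects duplicate ids. For example, a hypothetical configuration like the following, where both entries omit the 'id', would log "two default token analyzers" and raise a UsageError:

token-analysis:
    - variants:    # first entry without 'id' becomes the default ...
    - variants:    # ... so a second entry without 'id' is an error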

View File

@@ -27,34 +27,35 @@ transliteration:
 sanitizers:
     - step: split-name-list
     - step: strip-brace-terms
-variants:
-    - !include icu-rules/variants-bg.yaml
-    - !include icu-rules/variants-ca.yaml
-    - !include icu-rules/variants-cs.yaml
-    - !include icu-rules/variants-da.yaml
-    - !include icu-rules/variants-de.yaml
-    - !include icu-rules/variants-el.yaml
-    - !include icu-rules/variants-en.yaml
-    - !include icu-rules/variants-es.yaml
-    - !include icu-rules/variants-et.yaml
-    - !include icu-rules/variants-eu.yaml
-    - !include icu-rules/variants-fi.yaml
-    - !include icu-rules/variants-fr.yaml
-    - !include icu-rules/variants-gl.yaml
-    - !include icu-rules/variants-hu.yaml
-    - !include icu-rules/variants-it.yaml
-    - !include icu-rules/variants-ja.yaml
-    - !include icu-rules/variants-mg.yaml
-    - !include icu-rules/variants-ms.yaml
-    - !include icu-rules/variants-nl.yaml
-    - !include icu-rules/variants-no.yaml
-    - !include icu-rules/variants-pl.yaml
-    - !include icu-rules/variants-pt.yaml
-    - !include icu-rules/variants-ro.yaml
-    - !include icu-rules/variants-ru.yaml
-    - !include icu-rules/variants-sk.yaml
-    - !include icu-rules/variants-sl.yaml
-    - !include icu-rules/variants-sv.yaml
-    - !include icu-rules/variants-tr.yaml
-    - !include icu-rules/variants-uk.yaml
-    - !include icu-rules/variants-vi.yaml
+token-analysis:
+    - variants:
+          - !include icu-rules/variants-bg.yaml
+          - !include icu-rules/variants-ca.yaml
+          - !include icu-rules/variants-cs.yaml
+          - !include icu-rules/variants-da.yaml
+          - !include icu-rules/variants-de.yaml
+          - !include icu-rules/variants-el.yaml
+          - !include icu-rules/variants-en.yaml
+          - !include icu-rules/variants-es.yaml
+          - !include icu-rules/variants-et.yaml
+          - !include icu-rules/variants-eu.yaml
+          - !include icu-rules/variants-fi.yaml
+          - !include icu-rules/variants-fr.yaml
+          - !include icu-rules/variants-gl.yaml
+          - !include icu-rules/variants-hu.yaml
+          - !include icu-rules/variants-it.yaml
+          - !include icu-rules/variants-ja.yaml
+          - !include icu-rules/variants-mg.yaml
+          - !include icu-rules/variants-ms.yaml
+          - !include icu-rules/variants-nl.yaml
+          - !include icu-rules/variants-no.yaml
+          - !include icu-rules/variants-pl.yaml
+          - !include icu-rules/variants-pt.yaml
+          - !include icu-rules/variants-ro.yaml
+          - !include icu-rules/variants-ru.yaml
+          - !include icu-rules/variants-sk.yaml
+          - !include icu-rules/variants-sl.yaml
+          - !include icu-rules/variants-sv.yaml
+          - !include icu-rules/variants-tr.yaml
+          - !include icu-rules/variants-uk.yaml
+          - !include icu-rules/variants-vi.yaml

View File

@@ -69,10 +69,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
                      variants=('~gasse -> gasse', 'street => st', ),
                      sanitizers=[]):
-        cfgstr = {'normalization' : list(norm),
-                  'sanitizers' : sanitizers,
-                  'transliteration' : list(trans),
-                  'variants' : [ {'words': list(variants)}]}
+        cfgstr = {'normalization': list(norm),
+                  'sanitizers': sanitizers,
+                  'transliteration': list(trans),
+                  'token-analysis': [{'variants': [{'words': list(variants)}]}]}
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
         tok.loader = ICURuleLoader(test_config)
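
For reference, with the default arguments above, yaml.dump(cfgstr) serialises to roughly the following (PyYAML sorts keys alphabetically; quoting details may vary):

normalization:
- '[[:Punctuation:][:Space:]]+ > '' '''
sanitizers: []
token-analysis:
- variants:
  - words:
    - '~gasse -> gasse'
    - street => st
transliteration:
- ':: upper()'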

View File

@@ -28,10 +28,10 @@ def cfgfile(def_config, tmp_path):
              - ":: Latin ()"
              - "'🜵' > ' '"
            """)
-        content += "variants:\n  - words:\n"
-        content += '\n'.join(("      - " + s for s in variants)) + '\n'
+        content += "token-analysis:\n  - variants:\n      - words:\n"
+        content += '\n'.join(("          - " + s for s in variants)) + '\n'
         for k, v in kwargs:
-            content += "    {}: {}\n".format(k, v)
+            content += "        {}: {}\n".format(k, v)
         (project_dir / 'icu_tokenizer.yaml').write_text(content)
         return def_config

View File

@@ -34,8 +34,8 @@ def cfgrules(test_config):
              - ":: Latin ()"
              - "[[:Punctuation:][:Space:]]+ > ' '"
            """)
-        content += "variants:\n  - words:\n"
-        content += '\n'.join(("      - " + s for s in variants)) + '\n'
+        content += "token-analysis:\n  - variants:\n      - words:\n"
+        content += '\n'.join(("          - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += "    {}: {}\n".format(k, v)
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
@@ -49,20 +49,20 @@ def test_empty_rule_set(test_config):
     (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
-        variants:
+        token-analysis:
+            - variants:
         """))
 
     rules = ICURuleLoader(test_config)
 
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
-    assert list(rules.get_replacement_pairs()) == []
 
 
-CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
+CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
 
 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
 def test_missing_section(section, test_config):
-    rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+    rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
     (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
 
     with pytest.raises(UsageError):
@@ -107,7 +107,8 @@ def test_transliteration_rules_from_file(test_config):
         transliteration:
             - "'ax' > 'b'"
            - !include transliteration.yaml
-        variants:
+        token-analysis:
+            - variants:
        """))
 
     transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
@@ -127,7 +128,7 @@ class TestGetReplacements:
     def get_replacements(self, *variants):
         loader = ICURuleLoader(self.cfgrules(*variants))
-        rules = loader.get_replacement_pairs()
+        rules = loader.analysis[None].variants
 
         return set((v.source, v.replacement) for v in rules)