use yaml tag syntax to mark include files

This commit is contained in:
Sarah Hoffmann 2021-06-20 23:45:33 +02:00
parent c4f6c06f44
commit a6aa6360e0
5 changed files with 5001 additions and 4953 deletions

View File

@ -5,6 +5,7 @@ import io
import logging
from collections import defaultdict
import itertools
from pathlib import Path
import yaml
from icu import Transliterator
@ -13,6 +14,22 @@ from nominatim.errors import UsageError
LOG = logging.getLogger()
def _flatten_yaml_list(content):
if not content:
return []
if not isinstance(content, list):
raise UsageError("List expected in ICU yaml configuration.")
output = []
for ele in content:
if isinstance(ele, list):
output.extend(_flatten_yaml_list(ele))
else:
output.append(ele)
return output
class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
@ -87,8 +104,20 @@ class ICURuleLoader:
return [(k, list(synonyms[k])) for k in sorted_keys]
def _yaml_include_representer(self, loader, node):
    """ Resolve an `!include` yaml tag by loading the referenced yaml file.

        `loader` is the yaml loader that encountered the tag and `node`
        the scalar node that carries the file name. A relative file name
        is resolved against the directory of `self.configfile`, an
        absolute one is used unchanged.

        Returns the content of the included file as parsed by
        `yaml.safe_load()`.
    """
    # Joining an absolute path onto a directory discards the directory,
    # so one expression covers relative and absolute file names alike.
    include_file = self.configfile.parent / loader.construct_scalar(node)

    # Decode explicitly as UTF-8. Rule files may contain non-ASCII
    # characters and must not depend on the locale's preferred encoding.
    return yaml.safe_load(include_file.read_text(encoding='utf-8'))
def _load_from_yaml(self):
yaml.add_constructor('!include', self._yaml_include_representer,
Loader=yaml.SafeLoader)
rules = yaml.safe_load(self.configfile.read_text())
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
@ -121,10 +150,8 @@ class ICURuleLoader:
if content is None:
return ''
if isinstance(content, str):
return (self.configfile.parent / content).read_text().replace('\n', ' ')
return ';'.join(_flatten_yaml_list(content)) + ';'
return ';'.join(content) + ';'
def _parse_compound_suffix_list(self, rules):

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,16 @@ normalization:
- "ß > 'ss'" # German eszett is unambiguously equal to double ss
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
transliteration: icu_transliteration.rules
transliteration:
- !include icu-rules/extended-unicode-to-asccii.yaml
- ":: Ascii ()"
- ":: NFD ()"
- "'' >"
- "[[:Nonspacing Mark:] [:Cf:]] >"
- "[^[:Ascii:]] >"
- ":: lower ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
compound_suffixes:
# Danish
- hal

View File

@ -121,6 +121,26 @@ def test_get_transliteration_rules(cfgfile):
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
def test_transliteration_rules_from_file(tmp_path):
    """ Check that transliteration rules pulled in through an `!include`
        tag are applied together with the rules listed inline.
    """
    # The file that the configuration below refers to via `!include`.
    (tmp_path / 'transliteration.yaml').write_text('- "x > y"')

    config_file = tmp_path / 'test_config.yaml'
    config_file.write_text(dedent("""\
normalization:
transliteration:
- "'ax' > 'b'"
- !include transliteration.yaml
compound_suffixes:
abbreviations:
"""))

    rule_text = ICURuleLoader(config_file).get_transliteration_rules()
    transliterator = Transliterator.createFromRules("test", rule_text)

    # 'ax' -> 'b' comes from the inline rule, 'x' -> 'y' from the include.
    assert transliterator.transliterate(" axxt ") == " byt "
def test_get_replacement_pairs_multi_to(cfgfile):
fpath = cfgfile(['Pfad', 'Strasse'],
['Strasse => str,st'])