mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-09-20 15:37:49 +03:00
use yaml tag syntax to mark include files
This commit is contained in:
parent
c4f6c06f44
commit
a6aa6360e0
@ -5,6 +5,7 @@ import io
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from icu import Transliterator
|
||||
@ -13,6 +14,22 @@ from nominatim.errors import UsageError
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def _flatten_yaml_list(content):
|
||||
if not content:
|
||||
return []
|
||||
|
||||
if not isinstance(content, list):
|
||||
raise UsageError("List expected in ICU yaml configuration.")
|
||||
|
||||
output = []
|
||||
for ele in content:
|
||||
if isinstance(ele, list):
|
||||
output.extend(_flatten_yaml_list(ele))
|
||||
else:
|
||||
output.append(ele)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class ICURuleLoader:
|
||||
""" Compiler for ICU rules from a tokenizer configuration file.
|
||||
@ -87,8 +104,20 @@ class ICURuleLoader:
|
||||
|
||||
return [(k, list(synonyms[k])) for k in sorted_keys]
|
||||
|
||||
def _yaml_include_representer(self, loader, node):
    """ Handler for the `!include` YAML tag.

        The scalar value of the tagged node is taken as the path of another
        YAML file. Relative paths are resolved against the directory of the
        configuration file, absolute paths are used as they are. Returns
        the parsed content of that file.
    """
    value = loader.construct_scalar(node)

    # Joining a directory with an absolute path yields the absolute path
    # unchanged, so a single join covers the relative and absolute case.
    include_path = self.configfile.parent / value

    # Rule files contain non-ASCII characters. Read them explicitly as
    # UTF-8 instead of relying on the platform's default encoding.
    return yaml.safe_load(include_path.read_text(encoding='utf-8'))
|
||||
|
||||
|
||||
def _load_from_yaml(self):
|
||||
yaml.add_constructor('!include', self._yaml_include_representer,
|
||||
Loader=yaml.SafeLoader)
|
||||
rules = yaml.safe_load(self.configfile.read_text())
|
||||
|
||||
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
||||
@ -121,10 +150,8 @@ class ICURuleLoader:
|
||||
if content is None:
|
||||
return ''
|
||||
|
||||
if isinstance(content, str):
|
||||
return (self.configfile.parent / content).read_text().replace('\n', ' ')
|
||||
return ';'.join(_flatten_yaml_list(content)) + ';'
|
||||
|
||||
return ';'.join(content) + ';'
|
||||
|
||||
|
||||
def _parse_compound_suffix_list(self, rules):
|
||||
|
4941
settings/icu-rules/extended-unicode-to-asccii.yaml
Normal file
4941
settings/icu-rules/extended-unicode-to-asccii.yaml
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,16 @@ normalization:
|
||||
- "ß > 'ss'" # German eszett is unambiguously equal to double ss
|
||||
- "[[:Punctuation:][:Space:]]+ > ' '"
|
||||
- ":: NFC ()"
|
||||
transliteration: icu_transliteration.rules
|
||||
transliteration:
|
||||
- !include icu-rules/extended-unicode-to-asccii.yaml
|
||||
- ":: Ascii ()"
|
||||
- ":: NFD ()"
|
||||
- "'' >"
|
||||
- "[[:Nonspacing Mark:] [:Cf:]] >"
|
||||
- "[^[:Ascii:]] >"
|
||||
- ":: lower ()"
|
||||
- "[[:Punctuation:][:Space:]]+ > ' '"
|
||||
- ":: NFC ()"
|
||||
compound_suffixes:
|
||||
# Danish
|
||||
- hal
|
||||
|
@ -121,6 +121,26 @@ def test_get_transliteration_rules(cfgfile):
|
||||
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
|
||||
|
||||
|
||||
def test_transliteration_rules_from_file(tmp_path):
    """ Transliteration rules pulled in with the `!include` tag are used
        together with the rules listed directly in the configuration.
    """
    include_file = tmp_path / 'transliteration.yaml'
    include_file.write_text('- "x > y"')

    config_file = tmp_path / 'test_config.yaml'
    config_file.write_text(dedent("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        compound_suffixes:
        abbreviations:
        """))

    rules = ICURuleLoader(config_file).get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    # 'ax' is rewritten by the inline rule, the remaining 'x' by the
    # rule from the included file.
    assert trans.transliterate(" axxt ") == " byt "
|
||||
|
||||
|
||||
def test_get_replacement_pairs_multi_to(cfgfile):
|
||||
fpath = cfgfile(['Pfad', 'Strasse'],
|
||||
['Strasse => str,st'])
|
||||
|
Loading…
Reference in New Issue
Block a user