mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-10 11:42:13 +03:00
introduce generic YAML config loader
Adds a function to the Configuration class that loads a YAML file. Searching for the file is thereby generalised and now works the same way for all configuration files. Changes the search logic so that it is always possible to have a custom version of the configuration file in the project directory. Moves the ICU tokenizer over to the new load function.
This commit is contained in:
parent
18554dfed7
commit
1c42780bb5
@ -4,6 +4,7 @@ Nominatim configuration accessor.
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
import yaml
|
||||
|
||||
from dotenv import dotenv_values
|
||||
|
||||
@ -114,3 +115,96 @@ class Configuration:
|
||||
env.update(self.environ)
|
||||
|
||||
return env
|
||||
|
||||
|
||||
def load_sub_configuration(self, filename, config=None):
    """ Load additional configuration from a file. `filename` is the name
        of the configuration file. The file is first searched in the
        project directory and then in the global settings directory.

        If `config` is set, then the name of the configuration file can
        be additionally given through a .env configuration option. When
        the option is set, then the file will be exclusively loaded as set:
        if the name is an absolute path, the file name is taken as is,
        if the name is relative, it is taken to be relative to the
        project directory.

        The format of the file is determined from the filename suffix.
        Currently only files with extension '.yaml' are supported.

        YAML files support a special '!include' construct. When the
        directive is given, the value is taken to be a filename, the file
        is loaded using this function and added at the position in the
        configuration tree.

        Returns the parsed content of the configuration file.
        Raises a UsageError when the file has a suffix other than '.yaml'.
    """
    configfile = self._find_config_file(filename, config)

    if configfile.suffix != '.yaml':
        # Logger objects have no format() method; report the problem
        # through fatal() like the other error paths of this class.
        LOG.fatal("Format error while reading '%s': only YAML format supported.",
                  configfile)
        raise UsageError("Cannot handle config file format.")

    return self._load_from_yaml(configfile)
|
||||
|
||||
|
||||
def _find_config_file(self, filename, config=None):
    """ Determine the path of a configuration file.

        `filename` is the default name of the file. `config` optionally
        names a configuration option that may contain an explicit
        file name. If that option has a non-empty value, only this file
        is considered: a relative name is interpreted relative to the
        project directory. Otherwise `filename` is looked up in the
        project directory first and in the settings directory second.

        Raises a UsageError when the file cannot be found or is not
        a regular file.
    """
    configured = self.__getattr__(config) if config is not None else None

    if configured:
        # An explicitly configured file disables the directory search.
        explicit = Path(configured)
        if not explicit.is_absolute():
            explicit = self.project_dir / explicit
        explicit = explicit.resolve()

        if explicit.is_file():
            return explicit

        LOG.fatal("Cannot find config file '%s'.", explicit)
        raise UsageError("Config file not found.")

    search_paths = [self.project_dir, self.config_dir]
    candidates = (basedir / filename for basedir in search_paths)
    found = next((cand for cand in candidates if cand.is_file()), None)

    if found is None:
        LOG.fatal("Configuration file '%s' not found.\nDirectories searched: %s",
                  filename, search_paths)
        raise UsageError("Config file not found.")

    return found
|
||||
|
||||
|
||||
def _load_from_yaml(self, cfgfile):
    """ Parse the YAML file `cfgfile` and return its content.

        Before parsing, the handler for the '!include' tag is registered
        with yaml.SafeLoader, so that the file may pull in other YAML
        files using the '!include' operator.
    """
    yaml.add_constructor('!include', self._yaml_include_representer,
                         Loader=yaml.SafeLoader)
    content = cfgfile.read_text(encoding='utf-8')

    return yaml.safe_load(content)
|
||||
|
||||
|
||||
def _yaml_include_representer(self, loader, node):
    """ Handler for the '!include' operator in YAML files.

        The scalar value of `node` is taken as the name of the file to
        include. An absolute filename is used as given. When the filename
        is relative, then the file is first searched in the project
        directory and then in the global settings directory.

        Returns the parsed content of the included file.
        Raises a UsageError when the file has a suffix other than '.yaml'.
    """
    fname = loader.construct_scalar(node)

    if Path(fname).is_absolute():
        configfile = Path(fname)
    else:
        # Reuse the already constructed scalar instead of parsing the
        # node a second time.
        configfile = self._find_config_file(fname)

    if configfile.suffix != '.yaml':
        # Logger objects have no format() method; report the problem
        # through fatal() like the other error paths of this class.
        LOG.fatal("Format error while reading '%s': only YAML format supported.",
                  configfile)
        raise UsageError("Cannot handle config file format.")

    return yaml.safe_load(configfile.read_text(encoding='utf-8'))
|
||||
|
@ -4,10 +4,8 @@ Helper class to create ICU rules from a configuration file.
|
||||
import io
|
||||
import logging
|
||||
import itertools
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
import yaml
|
||||
from icu import Transliterator
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
@ -15,17 +13,17 @@ import nominatim.tokenizer.icu_variants as variants
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
def _flatten_yaml_list(content):
|
||||
def _flatten_config_list(content):
|
||||
if not content:
|
||||
return []
|
||||
|
||||
if not isinstance(content, list):
|
||||
raise UsageError("List expected in ICU yaml configuration.")
|
||||
raise UsageError("List expected in ICU configuration.")
|
||||
|
||||
output = []
|
||||
for ele in content:
|
||||
if isinstance(ele, list):
|
||||
output.extend(_flatten_yaml_list(ele))
|
||||
output.extend(_flatten_config_list(ele))
|
||||
else:
|
||||
output.append(ele)
|
||||
|
||||
@ -48,14 +46,12 @@ class ICURuleLoader:
|
||||
""" Compiler for ICU rules from a tokenizer configuration file.
|
||||
"""
|
||||
|
||||
def __init__(self, configfile):
|
||||
self.configfile = configfile
|
||||
def __init__(self, rules):
|
||||
self.variants = set()
|
||||
|
||||
if configfile.suffix == '.yaml':
|
||||
self._load_from_yaml()
|
||||
else:
|
||||
raise UsageError("Unknown format of tokenizer configuration.")
|
||||
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
||||
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
|
||||
self._parse_variant_list(self._get_section(rules, 'variants'))
|
||||
|
||||
|
||||
def get_search_rules(self):
|
||||
@ -88,34 +84,14 @@ class ICURuleLoader:
|
||||
"""
|
||||
return self.variants
|
||||
|
||||
def _yaml_include_representer(self, loader, node):
|
||||
value = loader.construct_scalar(node)
|
||||
|
||||
if Path(value).is_absolute():
|
||||
content = Path(value)
|
||||
else:
|
||||
content = (self.configfile.parent / value)
|
||||
|
||||
return yaml.safe_load(content.read_text(encoding='utf-8'))
|
||||
|
||||
|
||||
def _load_from_yaml(self):
|
||||
yaml.add_constructor('!include', self._yaml_include_representer,
|
||||
Loader=yaml.SafeLoader)
|
||||
rules = yaml.safe_load(self.configfile.read_text(encoding='utf-8'))
|
||||
|
||||
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
||||
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
|
||||
self._parse_variant_list(self._get_section(rules, 'variants'))
|
||||
|
||||
|
||||
def _get_section(self, rules, section):
|
||||
@staticmethod
|
||||
def _get_section(rules, section):
|
||||
""" Get the section named 'section' from the rules. If the section does
|
||||
not exist, raise a usage error with a meaningful message.
|
||||
"""
|
||||
if section not in rules:
|
||||
LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
|
||||
section, str(self.configfile))
|
||||
LOG.fatal("Section '%s' not found in tokenizer config.", section)
|
||||
raise UsageError("Syntax error in tokenizer configuration file.")
|
||||
|
||||
return rules[section]
|
||||
@ -133,7 +109,7 @@ class ICURuleLoader:
|
||||
if content is None:
|
||||
return ''
|
||||
|
||||
return ';'.join(_flatten_yaml_list(content)) + ';'
|
||||
return ';'.join(_flatten_config_list(content)) + ';'
|
||||
|
||||
|
||||
def _parse_variant_list(self, rules):
|
||||
@ -142,7 +118,7 @@ class ICURuleLoader:
|
||||
if not rules:
|
||||
return
|
||||
|
||||
rules = _flatten_yaml_list(rules)
|
||||
rules = _flatten_config_list(rules)
|
||||
|
||||
vmaker = _VariantMaker(self.normalization_rules)
|
||||
|
||||
|
@ -8,7 +8,6 @@ import json
|
||||
import logging
|
||||
import re
|
||||
from textwrap import dedent
|
||||
from pathlib import Path
|
||||
|
||||
from nominatim.db.connection import connect
|
||||
from nominatim.db.properties import set_property, get_property
|
||||
@ -49,12 +48,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
||||
This copies all necessary data in the project directory to make
|
||||
sure the tokenizer remains stable even over updates.
|
||||
"""
|
||||
if config.TOKENIZER_CONFIG:
|
||||
cfgfile = Path(config.TOKENIZER_CONFIG)
|
||||
else:
|
||||
cfgfile = config.config_dir / 'icu_tokenizer.yaml'
|
||||
|
||||
loader = ICURuleLoader(cfgfile)
|
||||
loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
|
||||
config='TOKENIZER_CONFIG'))
|
||||
self.naming_rules = ICUNameProcessorRules(loader=loader)
|
||||
self.term_normalization = config.TERM_NORMALIZATION
|
||||
self.max_word_frequency = config.MAX_WORD_FREQUENCY
|
||||
|
@ -67,13 +67,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
|
||||
|
||||
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
|
||||
variants=('~gasse -> gasse', 'street => st', )):
|
||||
cfgfile = tmp_path / 'analyser_test_config.yaml'
|
||||
with cfgfile.open('w') as stream:
|
||||
cfgstr = {'normalization' : list(norm),
|
||||
'transliteration' : list(trans),
|
||||
'variants' : [ {'words': list(variants)}]}
|
||||
yaml.dump(cfgstr, stream)
|
||||
tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgfile))
|
||||
cfgstr = {'normalization' : list(norm),
|
||||
'transliteration' : list(trans),
|
||||
'variants' : [ {'words': list(variants)}]}
|
||||
tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
|
||||
|
||||
return tok.name_analyzer()
|
||||
|
||||
|
@ -4,6 +4,7 @@ Tests for import name normalisation and variant generation.
|
||||
from textwrap import dedent
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
||||
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
|
||||
@ -11,7 +12,7 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProc
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
@pytest.fixture
|
||||
def cfgfile(tmp_path, suffix='.yaml'):
|
||||
def cfgfile():
|
||||
def _create_config(*variants, **kwargs):
|
||||
content = dedent("""\
|
||||
normalization:
|
||||
@ -29,9 +30,7 @@ def cfgfile(tmp_path, suffix='.yaml'):
|
||||
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
||||
for k, v in kwargs:
|
||||
content += " {}: {}\n".format(k, v)
|
||||
fpath = tmp_path / ('test_config' + suffix)
|
||||
fpath.write_text(dedent(content))
|
||||
return fpath
|
||||
return yaml.safe_load(content)
|
||||
|
||||
return _create_config
|
||||
|
||||
|
@ -1,16 +1,18 @@
|
||||
"""
|
||||
Tests for converting a config file to ICU rules.
|
||||
"""
|
||||
import pytest
|
||||
from textwrap import dedent
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
from icu import Transliterator
|
||||
|
||||
@pytest.fixture
|
||||
def cfgfile(tmp_path, suffix='.yaml'):
|
||||
def cfgrules():
|
||||
def _create_config(*variants, **kwargs):
|
||||
content = dedent("""\
|
||||
normalization:
|
||||
@ -27,22 +29,19 @@ def cfgfile(tmp_path, suffix='.yaml'):
|
||||
content += '\n'.join((" - " + s for s in variants)) + '\n'
|
||||
for k, v in kwargs:
|
||||
content += " {}: {}\n".format(k, v)
|
||||
fpath = tmp_path / ('test_config' + suffix)
|
||||
fpath.write_text(dedent(content))
|
||||
return fpath
|
||||
return yaml.safe_load(content)
|
||||
|
||||
return _create_config
|
||||
|
||||
|
||||
def test_empty_rule_file(tmp_path):
|
||||
fpath = tmp_path / ('test_config.yaml')
|
||||
fpath.write_text(dedent("""\
|
||||
def test_empty_rule_set():
|
||||
rule_cfg = yaml.safe_load(dedent("""\
|
||||
normalization:
|
||||
transliteration:
|
||||
variants:
|
||||
"""))
|
||||
|
||||
rules = ICURuleLoader(fpath)
|
||||
rules = ICURuleLoader(rule_cfg)
|
||||
assert rules.get_search_rules() == ''
|
||||
assert rules.get_normalization_rules() == ''
|
||||
assert rules.get_transliteration_rules() == ''
|
||||
@ -51,19 +50,15 @@ def test_empty_rule_file(tmp_path):
|
||||
CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
|
||||
|
||||
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
|
||||
def test_missing_normalization(tmp_path, section):
|
||||
fpath = tmp_path / ('test_config.yaml')
|
||||
with fpath.open('w') as fd:
|
||||
for name in CONFIG_SECTIONS:
|
||||
if name != section:
|
||||
fd.write(name + ':\n')
|
||||
def test_missing_section(section):
|
||||
rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
|
||||
|
||||
with pytest.raises(UsageError):
|
||||
ICURuleLoader(fpath)
|
||||
ICURuleLoader(rule_cfg)
|
||||
|
||||
|
||||
def test_get_search_rules(cfgfile):
|
||||
loader = ICURuleLoader(cfgfile())
|
||||
def test_get_search_rules(cfgrules):
|
||||
loader = ICURuleLoader(cfgrules())
|
||||
|
||||
rules = loader.get_search_rules()
|
||||
trans = Transliterator.createFromRules("test", rules)
|
||||
@ -77,23 +72,24 @@ def test_get_search_rules(cfgfile):
|
||||
assert trans.transliterate(" проспект ") == " prospekt "
|
||||
|
||||
|
||||
def test_get_normalization_rules(cfgfile):
|
||||
loader = ICURuleLoader(cfgfile())
|
||||
def test_get_normalization_rules(cfgrules):
|
||||
loader = ICURuleLoader(cfgrules())
|
||||
rules = loader.get_normalization_rules()
|
||||
trans = Transliterator.createFromRules("test", rules)
|
||||
|
||||
assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
|
||||
|
||||
|
||||
def test_get_transliteration_rules(cfgfile):
|
||||
loader = ICURuleLoader(cfgfile())
|
||||
def test_get_transliteration_rules(cfgrules):
|
||||
loader = ICURuleLoader(cfgrules())
|
||||
rules = loader.get_transliteration_rules()
|
||||
trans = Transliterator.createFromRules("test", rules)
|
||||
|
||||
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
|
||||
|
||||
|
||||
def test_transliteration_rules_from_file(tmp_path):
|
||||
def test_transliteration_rules_from_file(def_config, tmp_path):
|
||||
def_config.project_dir = tmp_path
|
||||
cfgpath = tmp_path / ('test_config.yaml')
|
||||
cfgpath.write_text(dedent("""\
|
||||
normalization:
|
||||
@ -105,7 +101,7 @@ def test_transliteration_rules_from_file(tmp_path):
|
||||
transpath = tmp_path / ('transliteration.yaml')
|
||||
transpath.write_text('- "x > y"')
|
||||
|
||||
loader = ICURuleLoader(cfgpath)
|
||||
loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
|
||||
rules = loader.get_transliteration_rules()
|
||||
trans = Transliterator.createFromRules("test", rules)
|
||||
|
||||
@ -115,11 +111,11 @@ def test_transliteration_rules_from_file(tmp_path):
|
||||
class TestGetReplacements:
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup_cfg(self, cfgfile):
|
||||
self.cfgfile = cfgfile
|
||||
def setup_cfg(self, cfgrules):
|
||||
self.cfgrules = cfgrules
|
||||
|
||||
def get_replacements(self, *variants):
|
||||
loader = ICURuleLoader(self.cfgfile(*variants))
|
||||
loader = ICURuleLoader(self.cfgrules(*variants))
|
||||
rules = loader.get_replacement_pairs()
|
||||
|
||||
return set((v.source, v.replacement) for v in rules)
|
||||
@ -129,7 +125,7 @@ class TestGetReplacements:
|
||||
'~foo~ -> bar', 'fo~ o -> bar'])
|
||||
def test_invalid_variant_description(self, variant):
|
||||
with pytest.raises(UsageError):
|
||||
ICURuleLoader(self.cfgfile(variant))
|
||||
ICURuleLoader(self.cfgrules(variant))
|
||||
|
||||
def test_add_full(self):
|
||||
repl = self.get_replacements("foo -> bar")
|
||||
|
Loading…
Reference in New Issue
Block a user