add type hints for sanitizers

This commit is contained in:
Sarah Hoffmann 2022-07-12 23:15:19 +02:00
parent 5617bffe2f
commit 62eedbb8f6
10 changed files with 200 additions and 133 deletions

View File

@ -79,7 +79,7 @@ class PostcodeFormatter:
self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
def get_matcher(self, country_code: str) -> Optional[CountryPostcodeMatcher]:
def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
""" Return the CountryPostcodeMatcher for the given country.
Returns None if the country doesn't have a postcode and the
default matcher if there is no specific matcher configured for
@ -88,10 +88,12 @@ class PostcodeFormatter:
if country_code in self.country_without_postcode:
return None
assert country_code is not None
return self.country_matcher.get(country_code, self.default_matcher)
def match(self, country_code: str, postcode: str) -> Optional[Match[str]]:
def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
""" Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the country has a pattern
and the match was successful or None if the match failed.
@ -99,6 +101,8 @@ class PostcodeFormatter:
if country_code in self.country_without_postcode:
return None
assert country_code is not None
return self.country_matcher.get(country_code, self.default_matcher).match(postcode)

View File

@ -8,100 +8,13 @@
Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
import importlib
from nominatim.errors import UsageError
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
class PlaceName:
    """ A searchable name for a place together with properties.

        Every name object saves the name proper and two basic properties:
        * 'kind' describes the name of the OSM key used without any suffixes
          (i.e. the part after the colon removed)
        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
          is the part of the key after the first colon.

        In addition to that, the name may have arbitrary additional attributes.
        Which attributes are used, depends on the token analyser.
    """

    def __init__(self, name, kind, suffix):
        self.name = name
        self.kind = kind
        self.suffix = suffix
        self.attr = {}

    def __repr__(self):
        # Use the repr conversion instead of hand-written quotes so that
        # names containing quotes stay unambiguous and a missing suffix is
        # shown as None rather than as the string 'None'.
        return f"PlaceName(name={self.name!r},kind={self.kind!r},suffix={self.suffix!r})"

    def clone(self, name=None, kind=None, suffix=None, attr=None):
        """ Create a copy of the place name, optionally with the
            given parameters replaced. A parameter that is None or empty
            keeps the value of the current object.

            In the attribute list only the given keys are updated. The list
            is not replaced completely. In particular, the function cannot
            be used to remove an attribute from a place name.
        """
        newobj = PlaceName(name or self.name,
                           kind or self.kind,
                           suffix or self.suffix)

        # The copy gets its own attribute dictionary, so later changes to
        # one object do not show up in the other.
        newobj.attr.update(self.attr)
        if attr:
            newobj.attr.update(attr)

        return newobj

    def set_attr(self, key, value):
        """ Add the given property to the name. If the property was already
            set, then the value is overwritten.
        """
        self.attr[key] = value

    def get_attr(self, key, default=None):
        """ Return the given property or the value of 'default' if it
            is not set.
        """
        return self.attr.get(key, default)

    def has_attr(self, key):
        """ Check if the given attribute is set.
        """
        return key in self.attr
class _ProcessInfo:
    """ Container for the data that is passed to the handler functions.

        'names' and 'address' are mutable lists of PlaceName objects. A
        handler changes them either by modifying the lists in place or by
        replacing them with a new list.
    """

    def __init__(self, place):
        self.place = place
        self.names = self._convert_name_dict(place.name)
        self.address = self._convert_name_dict(place.address)

    @staticmethod
    def _convert_name_dict(names):
        """ Turn a dictionary of names into a list of PlaceNames.

            Each key is split at the first colon into the kind (the part
            before the colon) and the suffix (the part after it). Keys
            without a colon get a suffix of None.
        """
        if not names:
            return []

        converted = []
        for key, value in names.items():
            kind, colon, suffix = key.partition(':')
            converted.append(PlaceName(value.strip(),
                                       kind.strip(),
                                       suffix.strip() if colon else None))

        return converted
from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
from nominatim.data.place_info import PlaceInfo
class PlaceSanitizer:
@ -109,24 +22,24 @@ class PlaceSanitizer:
names and address before they are used by the token analysers.
"""
def __init__(self, rules):
self.handlers = []
def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None:
self.handlers: List[Callable[[ProcessInfo], None]] = []
if rules:
for func in rules:
if 'step' not in func:
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
handler_module = importlib.import_module(module_name)
handler_module: SanitizerHandler = importlib.import_module(module_name)
self.handlers.append(handler_module.create(SanitizerConfig(func)))
def process_names(self, place):
def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
""" Extract a sanitized list of names and address parts from the
given place. The function returns a tuple
(list of names, list of address names)
"""
obj = _ProcessInfo(place)
obj = ProcessInfo(place)
for func in self.handlers:
func(obj)

View File

@ -0,0 +1,119 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for sanitizers.
"""
from typing import Optional, Dict, List, Mapping, Callable
from typing_extensions import Protocol, Final
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
from nominatim.data.place_info import PlaceInfo
class PlaceName:
    """ A searchable name for a place together with properties.

        Every name object saves the name proper and two basic properties:
        * 'kind' describes the name of the OSM key used without any suffixes
          (i.e. the part after the colon removed)
        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
          is the part of the key after the first colon.

        In addition to that, the name may have arbitrary additional attributes.
        Which attributes are used, depends on the token analyser.
    """

    def __init__(self, name: str, kind: str, suffix: Optional[str]):
        self.name = name
        self.kind = kind
        self.suffix = suffix
        self.attr: Dict[str, str] = {}

    def __repr__(self) -> str:
        # Use the repr conversion instead of hand-written quotes so that
        # names containing quotes stay unambiguous and a missing suffix is
        # shown as None rather than as the string 'None'.
        return f"PlaceName(name={self.name!r},kind={self.kind!r},suffix={self.suffix!r})"

    def clone(self, name: Optional[str] = None,
              kind: Optional[str] = None,
              suffix: Optional[str] = None,
              attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
        """ Create a copy of the place name, optionally with the
            given parameters replaced. A parameter that is None or empty
            keeps the value of the current object.

            In the attribute list only the given keys are updated. The list
            is not replaced completely. In particular, the function cannot
            be used to remove an attribute from a place name.
        """
        newobj = PlaceName(name or self.name,
                           kind or self.kind,
                           suffix or self.suffix)

        # The copy gets its own attribute dictionary, so later changes to
        # one object do not show up in the other.
        newobj.attr.update(self.attr)
        if attr:
            newobj.attr.update(attr)

        return newobj

    def set_attr(self, key: str, value: str) -> None:
        """ Add the given property to the name. If the property was already
            set, then the value is overwritten.
        """
        self.attr[key] = value

    def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
        """ Return the given property or the value of 'default' if it
            is not set.
        """
        return self.attr.get(key, default)

    def has_attr(self, key: str) -> bool:
        """ Check if the given attribute is set.
        """
        return key in self.attr
class ProcessInfo:
    """ Container for the data that is passed to the handler functions.

        'names' and 'address' are mutable lists of PlaceName objects. A
        handler changes them either by modifying the lists in place or by
        replacing them with a new list.
    """

    def __init__(self, place: PlaceInfo):
        self.place: Final = place
        self.names = self._convert_name_dict(place.name)
        self.address = self._convert_name_dict(place.address)

    @staticmethod
    def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
        """ Turn a dictionary of names into a list of PlaceNames.

            Each key is split at the first colon into the kind (the part
            before the colon) and the suffix (the part after it). Keys
            without a colon get a suffix of None.
        """
        if not names:
            return []

        converted = []
        for key, value in names.items():
            kind, colon, suffix = key.partition(':')
            converted.append(PlaceName(value.strip(),
                                       kind.strip(),
                                       suffix.strip() if colon else None))

        return converted
class SanitizerHandler(Protocol):
    """ Protocol for sanitizer modules.

        It describes the interface that a module must provide so that it
        can be used as a sanitizer step.
    """

    def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
        """
        A sanitizer must define a single function `create`. It takes the
        dictionary with the configuration information for the sanitizer and
        returns a function that transforms name and address.

        The returned function receives a ProcessInfo object and changes
        its content in place. It has no return value.
        """

View File

@ -24,11 +24,15 @@ Arguments:
or a list of strings, where each string is a regular
expression that must match the full house number value.
"""
from typing import Callable, Iterator, List
import re
from nominatim.tokenizer.sanitizers.base import ProcessInfo, PlaceName
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
class _HousenumberSanitizer:
def __init__(self, config):
def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter_kind('housenumber')
self.split_regexp = config.get_delimiter()
@ -37,13 +41,13 @@ class _HousenumberSanitizer:
def __call__(self, obj):
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
new_address = []
new_address: List[PlaceName] = []
for item in obj.address:
if self.filter_kind(item):
if self.filter_kind(item.kind):
if self._treat_as_name(item.name):
obj.names.append(item.clone(kind='housenumber'))
else:
@ -56,7 +60,7 @@ class _HousenumberSanitizer:
obj.address = new_address
def sanitize(self, value):
def sanitize(self, value: str) -> Iterator[str]:
""" Extract housenumbers in a regularized format from an OSM value.
The function works as a generator that yields all valid housenumbers
@ -67,16 +71,15 @@ class _HousenumberSanitizer:
yield from self._regularize(hnr)
@staticmethod
def _regularize(hnr):
def _regularize(self, hnr: str) -> Iterator[str]:
yield hnr
def _treat_as_name(self, housenumber):
def _treat_as_name(self, housenumber: str) -> bool:
return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
def create(config):
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a housenumber processing function.
"""

View File

@ -20,11 +20,15 @@ Arguments:
objects that have no country assigned. These are always
assumed to have no postcode.
"""
from typing import Callable, Optional, Tuple
from nominatim.data.postcode_format import PostcodeFormatter
from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
class _PostcodeSanitizer:
def __init__(self, config):
def __init__(self, config: SanitizerConfig) -> None:
self.convert_to_address = config.get_bool('convert-to-address', True)
self.matcher = PostcodeFormatter()
@ -33,7 +37,7 @@ class _PostcodeSanitizer:
self.matcher.set_default_pattern(default_pattern)
def __call__(self, obj):
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
@ -52,7 +56,7 @@ class _PostcodeSanitizer:
postcode.set_attr('variant', formatted[1])
def scan(self, postcode, country):
def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
""" Check the postcode for correct formatting and return the
normalized version. Returns None if the postcode does not
correspond to the official format of the given country.
@ -61,13 +65,15 @@ class _PostcodeSanitizer:
if match is None:
return None
assert country is not None
return self.matcher.normalize(country, match),\
' '.join(filter(lambda p: p is not None, match.groups()))
def create(config):
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a postcode processing function.
"""

View File

@ -7,20 +7,28 @@
"""
Configuration for Sanitizers.
"""
from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
from collections import UserDict
import re
from nominatim.errors import UsageError
class SanitizerConfig(UserDict):
# working around missing runtime generics for UserDict in Python < 3.9
# See https://github.com/python/typing/issues/60#issuecomment-869757075
if TYPE_CHECKING:
_BaseUserDict = UserDict[str, Any]
else:
_BaseUserDict = UserDict
class SanitizerConfig(_BaseUserDict):
""" Dictionary with configuration options for a sanitizer.
In addition to the usualy dictionary function, the class provides
In addition to the usual dictionary function, the class provides
accessors to standard sanitizer options that are used by many of the
sanitizers.
"""
def get_string_list(self, param, default=tuple()):
def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
""" Extract a configuration parameter as a string list.
If the parameter value is a simple string, it is returned as a
one-item list. If the parameter value does not exist, the given
@ -44,7 +52,7 @@ class SanitizerConfig(UserDict):
return values
def get_bool(self, param, default=None):
def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
""" Extract a configuration parameter as a boolean.
The parameter must be one of the yaml boolean values or an
user error will be raised. If `default` is given, then the parameter
@ -58,7 +66,7 @@ class SanitizerConfig(UserDict):
return value
def get_delimiter(self, default=',;'):
def get_delimiter(self, default: str = ',;') -> Pattern[str]:
""" Return the 'delimiter' parameter in the configuration as a
compiled regular expression that can be used to split the names on the
delimiters. The regular expression makes sure that the resulting names
@ -76,7 +84,7 @@ class SanitizerConfig(UserDict):
return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
def get_filter_kind(self, *default):
def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
""" Return a filter function for the name kind from the 'filter-kind'
config parameter. The filter functions takes a name item and returns
True when the item passes the filter.
@ -93,4 +101,4 @@ class SanitizerConfig(UserDict):
regexes = [re.compile(regex) for regex in filters]
return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
return lambda name: any(regex.fullmatch(name) for regex in regexes)

View File

@ -11,13 +11,18 @@ Arguments:
delimiters: Define the set of characters to be used for
splitting the list. (default: ',;')
"""
def create(config):
from typing import Callable
from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a name processing function that splits name values with
multiple values into their components.
"""
regexp = config.get_delimiter()
def _process(obj):
def _process(obj: ProcessInfo) -> None:
if not obj.names:
return

View File

@ -9,12 +9,17 @@ This sanitizer creates additional name variants for names that have
addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
only the main name part with the bracket part removed.
"""
from typing import Callable
def create(_):
from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a name processing function that creates additional name variants
for bracket addendums.
"""
def _process(obj):
def _process(obj: ProcessInfo) -> None:
""" Add variants for names that have a bracket extension.
"""
if obj.names:

View File

@ -30,13 +30,17 @@ Arguments:
any analyzer tagged) is retained. (default: replace)
"""
from typing import Callable, Dict, Optional, List
from nominatim.data import country_info
from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
class _AnalyzerByLanguage:
""" Processor for tagging the language of names in a place.
"""
def __init__(self, config):
def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter_kind()
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
@ -44,8 +48,8 @@ class _AnalyzerByLanguage:
self._compute_default_languages(config.get('use-defaults', 'no'))
def _compute_default_languages(self, use_defaults):
self.deflangs = {}
def _compute_default_languages(self, use_defaults: str) -> None:
self.deflangs: Dict[Optional[str], List[str]] = {}
if use_defaults in ('mono', 'all'):
for ccode, clangs in country_info.iterate('languages'):
@ -56,21 +60,21 @@ class _AnalyzerByLanguage:
self.deflangs[ccode] = clangs
def _suffix_matches(self, suffix):
def _suffix_matches(self, suffix: str) -> bool:
if self.whitelist is None:
return len(suffix) in (2, 3) and suffix.islower()
return suffix in self.whitelist
def __call__(self, obj):
def __call__(self, obj: ProcessInfo) -> None:
if not obj.names:
return
more_names = []
for name in (n for n in obj.names
if not n.has_attr('analyzer') and self.filter_kind(n)):
if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
if name.suffix:
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
else:
@ -88,7 +92,7 @@ class _AnalyzerByLanguage:
obj.names.extend(more_names)
def create(config):
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that sets the analyzer property depending on the
language of the tag.
"""

View File

@ -82,32 +82,32 @@ def test_create_split_regex_empty_delimiter():
def test_create_kind_filter_no_params(inp):
filt = SanitizerConfig().get_filter_kind()
assert filt(PlaceName('something', inp, ''))
assert filt(inp)
@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
def test_create_kind_filter_custom_regex_positive(kind):
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
assert filt(PlaceName('something', kind, ''))
assert filt(kind)
@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
def test_create_kind_filter_custom_regex_negative(kind):
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
assert not filt(PlaceName('something', kind, ''))
assert not filt(kind)
@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
def test_create_kind_filter_many_positive(kind):
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
assert filt(PlaceName('something', kind, ''))
assert filt(kind)
@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
def test_create_kind_filter_many_negative(kind):
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
assert not filt(PlaceName('something', kind, ''))
assert not filt(kind)