add type hints for sanitizers

2024-11-22 21:28:10 +03:00 · 2022-07-12 23:15:19 +02:00 · 2022-07-12 23:15:19 +02:00 · 62eedbb8f6
commit 62eedbb8f6
parent 5617bffe2f
10 changed files with 200 additions and 133 deletions
--- a/nominatim/data/postcode_format.py
+++ b/nominatim/data/postcode_format.py
@ -79,7 +79,7 @@ class PostcodeFormatter:
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
-    def get_matcher(self, country_code: str) -> Optional[CountryPostcodeMatcher]:
+    def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
        """ Return the CountryPostcodeMatcher for the given country.
            Returns None if the country doesn't have a postcode and the
            default matcher if there is no specific matcher configured for
@ -88,10 +88,12 @@ class PostcodeFormatter:
        if country_code in self.country_without_postcode:
            return None
        assert country_code is not None
        return self.country_matcher.get(country_code, self.default_matcher)
-    def match(self, country_code: str, postcode: str) -> Optional[Match[str]]:
+    def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
        """ Match the given postcode against the postcode pattern for this
            matcher. Returns a `re.Match` object if the country has a pattern
            and the match was successful or None if the match failed.
@ -99,6 +101,8 @@ class PostcodeFormatter:
        if country_code in self.country_without_postcode:
            return None
        assert country_code is not None
        return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
--- a/nominatim/tokenizer/place_sanitizer.py
+++ b/nominatim/tokenizer/place_sanitizer.py
@ -8,100 +8,13 @@
 Handler for cleaning name and address tags in place information before it
 is handed to the token analysis.
 """
 from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
 import importlib
 from nominatim.errors import UsageError
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
-
+from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
-class PlaceName:
+from nominatim.data.place_info import PlaceInfo
    """ A searchable name for a place together with properties.
        Every name object saves the name proper and two basic properties:
        * 'kind' describes the name of the OSM key used without any suffixes
          (i.e. the part after the colon removed)
        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
          is the part of the key after the first colon.
        In addition to that, the name may have arbitrary additional attributes.
        Which attributes are used, depends on the token analyser.
    """
    def __init__(self, name, kind, suffix):
        self.name = name
        self.kind = kind
        self.suffix = suffix
        self.attr = {}
    def __repr__(self):
        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
    def clone(self, name=None, kind=None, suffix=None, attr=None):
        """ Create a deep copy of the place name, optionally with the
            given parameters replaced. In the attribute list only the given
            keys are updated. The list is not replaced completely.
            In particular, the function cannot to be used to remove an
            attribute from a place name.
        """
        newobj = PlaceName(name or self.name,
                           kind or self.kind,
                           suffix or self.suffix)
        newobj.attr.update(self.attr)
        if attr:
            newobj.attr.update(attr)
        return newobj
    def set_attr(self, key, value):
        """ Add the given property to the name. If the property was already
            set, then the value is overwritten.
        """
        self.attr[key] = value
    def get_attr(self, key, default=None):
        """ Return the given property or the value of 'default' if it
            is not set.
        """
        return self.attr.get(key, default)
    def has_attr(self, key):
        """ Check if the given attribute is set.
        """
        return key in self.attr
 class _ProcessInfo:
    """ Container class for information handed into to handler functions.
        The 'names' and 'address' members are mutable. A handler must change
        them by either modifying the lists place or replacing the old content
        with a new list.
    """
    def __init__(self, place):
        self.place = place
        self.names = self._convert_name_dict(place.name)
        self.address = self._convert_name_dict(place.address)
    @staticmethod
    def _convert_name_dict(names):
        """ Convert a dictionary of names into a list of PlaceNames.
            The dictionary key is split into the primary part of the key
            and the suffix (the part after an optional colon).
        """
        out = []
        if names:
            for key, value in names.items():
                parts = key.split(':', 1)
                out.append(PlaceName(value.strip(),
                                     parts[0].strip(),
                                     parts[1].strip() if len(parts) > 1 else None))
        return out
 class PlaceSanitizer:
@ -109,24 +22,24 @@ class PlaceSanitizer:
        names and address before they are used by the token analysers.
    """
-    def __init__(self, rules):
+    def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None:
-        self.handlers = []
+        self.handlers: List[Callable[[ProcessInfo], None]] = []
        if rules:
            for func in rules:
                if 'step' not in func:
                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
                module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
-                handler_module = importlib.import_module(module_name)
+                handler_module: SanitizerHandler = importlib.import_module(module_name)
                self.handlers.append(handler_module.create(SanitizerConfig(func)))
-    def process_names(self, place):
+    def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
        """ Extract a sanitized list of names and address parts from the
            given place. The function returns a tuple
            (list of names, list of address names)
        """
-        obj = _ProcessInfo(place)
+        obj = ProcessInfo(place)
        for func in self.handlers:
            func(obj)
--- a/nominatim/tokenizer/sanitizers/base.py
+++ b/nominatim/tokenizer/sanitizers/base.py
@ -0,0 +1,119 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Common data types and protocols for sanitizers.
 """
 from typing import Optional, Dict, List, Mapping, Callable
 from typing_extensions import Protocol, Final
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 from nominatim.data.place_info import PlaceInfo
 class PlaceName:
    """ Searchable name of a place, bundled with its properties.
        Each instance stores the name string itself plus two basic
        properties:
        * 'kind' is the OSM key the name came from, without any suffix
          (i.e. with the part after the colon removed)
        * 'suffix' is the suffix of the OSM tag, if there is one, i.e.
          the part of the key following the first colon.
        Beyond these, arbitrary additional attributes may be attached to
        a name. Which of them are used depends on the token analyser.
    """
    def __init__(self, name: str, kind: str, suffix: Optional[str]):
        self.name, self.kind, self.suffix = name, kind, suffix
        # Free-form extra attributes; every new name starts without any.
        self.attr: Dict[str, str] = {}
    def __repr__(self) -> str:
        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
    def clone(self, name: Optional[str] = None,
              kind: Optional[str] = None,
              suffix: Optional[str] = None,
              attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
        """ Return an independent copy of this place name in which the
            given parameters replace the current values. A parameter that
            is None or empty leaves the current value untouched.
            The attributes of the copy are the current attributes updated
            with the keys from 'attr'. The attribute set is never replaced
            as a whole, so this function cannot be used to remove an
            attribute from a place name.
        """
        twin = PlaceName(name if name else self.name,
                         kind if kind else self.kind,
                         suffix if suffix else self.suffix)
        # Own attributes go in first so that the caller's values win.
        for extra in (self.attr, attr):
            if extra:
                twin.attr.update(extra)
        return twin
    def set_attr(self, key: str, value: str) -> None:
        """ Store 'value' under the attribute 'key', overwriting any
            value that was saved for the attribute before.
        """
        self.attr.update({key: value})
    def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
        """ Look up the attribute 'key'. When the attribute is not set,
            'default' is returned instead.
        """
        if key in self.attr:
            return self.attr[key]
        return default
    def has_attr(self, key: str) -> bool:
        """ Tell whether the attribute 'key' is set on this name.
        """
        return key in self.attr
 class ProcessInfo:
    """ Container for the information that is handed to the handler
        functions.
        The members 'names' and 'address' are mutable. A handler changes
        them either by modifying the lists in place or by replacing the
        old content with a new list.
    """
    def __init__(self, place: PlaceInfo):
        self.place: Final = place
        self.names = self._convert_name_dict(place.name)
        self.address = self._convert_name_dict(place.address)
    @staticmethod
    def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
        """ Turn a dictionary of names into a list of PlaceNames.
            Each key is divided at its first colon into the primary part
            and the suffix. Keys without a colon get a suffix of None.
            Surrounding whitespace is stripped from the value and from
            both parts of the key. A missing or empty dictionary yields
            an empty list.
        """
        if not names:
            return []
        converted = []
        for tag, text in names.items():
            kind, colon, suffix = tag.partition(':')
            converted.append(PlaceName(text.strip(),
                                       kind.strip(),
                                       suffix.strip() if colon else None))
        return converted
 class SanitizerHandler(Protocol):
    """ Protocol for sanitizer modules.
        It describes the interface that an object used as a sanitizer
        must offer: a single callable attribute named `create`.
    """
    def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
        """
        A sanitizer must define a single function `create`. It takes the
        dictionary with the configuration information for the sanitizer and
        returns a function that transforms name and address.

        The returned function receives a `ProcessInfo` object and returns
        nothing.
        """
--- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py
+++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@ -24,11 +24,15 @@ Arguments:
                     or a list of strings, where each string is a regular
                     expression that must match the full house number value.
 """
 from typing import Callable, Iterator, List
 import re
 from nominatim.tokenizer.sanitizers.base import ProcessInfo, PlaceName
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 class _HousenumberSanitizer:
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
        self.filter_kind = config.get_filter_kind('housenumber')
        self.split_regexp = config.get_delimiter()
@ -37,13 +41,13 @@ class _HousenumberSanitizer:
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.address:
            return
-        new_address = []
+        new_address: List[PlaceName] = []
        for item in obj.address:
-            if self.filter_kind(item):
+            if self.filter_kind(item.kind):
                if self._treat_as_name(item.name):
                    obj.names.append(item.clone(kind='housenumber'))
                else:
@ -56,7 +60,7 @@ class _HousenumberSanitizer:
        obj.address = new_address
-    def sanitize(self, value):
+    def sanitize(self, value: str) -> Iterator[str]:
        """ Extract housenumbers in a regularized format from an OSM value.
            The function works as a generator that yields all valid housenumbers
@ -67,16 +71,15 @@ class _HousenumberSanitizer:
                yield from self._regularize(hnr)
-    @staticmethod
+    def _regularize(self, hnr: str) -> Iterator[str]:
    def _regularize(hnr):
        yield hnr
-    def _treat_as_name(self, housenumber):
+    def _treat_as_name(self, housenumber: str) -> bool:
        return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a housenumber processing function.
    """
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@ -20,11 +20,15 @@ Arguments:
                        objects that have no country assigned. These are always
                        assumed to have no postcode.
 """
 from typing import Callable, Optional, Tuple
 from nominatim.data.postcode_format import PostcodeFormatter
 from nominatim.tokenizer.sanitizers.base import ProcessInfo
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 class _PostcodeSanitizer:
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
        self.convert_to_address = config.get_bool('convert-to-address', True)
        self.matcher = PostcodeFormatter()
@ -33,7 +37,7 @@ class _PostcodeSanitizer:
            self.matcher.set_default_pattern(default_pattern)
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.address:
            return
@ -52,7 +56,7 @@ class _PostcodeSanitizer:
                postcode.set_attr('variant', formatted[1])
-    def scan(self, postcode, country):
+    def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
        """ Check the postcode for correct formatting and return the
            normalized version. Returns None if the postcode does not
            correspond to the oficial format of the given country.
@ -61,13 +65,15 @@ class _PostcodeSanitizer:
        if match is None:
            return None
        assert country is not None
        return self.matcher.normalize(country, match),\
               ' '.join(filter(lambda p: p is not None, match.groups()))
-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a housenumber processing function.
    """
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@ -7,20 +7,28 @@
 """
 Configuration for Sanitizers.
 """
 from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
 from collections import UserDict
 import re
 from nominatim.errors import UsageError
-class SanitizerConfig(UserDict):
+# working around missing generics in Python < 3.8
 # See https://github.com/python/typing/issues/60#issuecomment-869757075
 if TYPE_CHECKING:
    _BaseUserDict = UserDict[str, Any]
 else:
    _BaseUserDict = UserDict
 class SanitizerConfig(_BaseUserDict):
    """ Dictionary with configuration options for a sanitizer.
-        In addition to the usualy dictionary function, the class provides
+        In addition to the usual dictionary function, the class provides
        accessors to standard sanatizer options that are used by many of the
        sanitizers.
    """
-    def get_string_list(self, param, default=tuple()):
+    def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
        """ Extract a configuration parameter as a string list.
            If the parameter value is a simple string, it is returned as a
            one-item list. If the parameter value does not exist, the given
@ -44,7 +52,7 @@ class SanitizerConfig(UserDict):
        return values
-    def get_bool(self, param, default=None):
+    def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
        """ Extract a configuration parameter as a boolean.
            The parameter must be one of the yaml boolean values or an
            user error will be raised. If `default` is given, then the parameter
@ -58,7 +66,7 @@ class SanitizerConfig(UserDict):
        return value
-    def get_delimiter(self, default=',;'):
+    def get_delimiter(self, default: str = ',;') -> Pattern[str]:
        """ Return the 'delimiter' parameter in the configuration as a
            compiled regular expression that can be used to split the names on the
            delimiters. The regular expression makes sure that the resulting names
@ -76,7 +84,7 @@ class SanitizerConfig(UserDict):
        return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
-    def get_filter_kind(self, *default):
+    def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
        """ Return a filter function for the name kind from the 'filter-kind'
            config parameter. The filter functions takes a name item and returns
            True when the item passes the filter.
@ -93,4 +101,4 @@ class SanitizerConfig(UserDict):
        regexes = [re.compile(regex) for regex in filters]
-        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
+        return lambda name: any(regex.fullmatch(name) for regex in regexes)
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@ -11,13 +11,18 @@ Arguments:
    delimiters: Define the set of characters to be used for
                splitting the list. (default: ',;')
 """
-def create(config):
+from typing import Callable
 from nominatim.tokenizer.sanitizers.base import ProcessInfo
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a name processing function that splits name values with
        multiple values into their components.
    """
    regexp = config.get_delimiter()
-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
        if not obj.names:
            return
--- a/nominatim/tokenizer/sanitizers/strip_brace_terms.py
+++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
@ -9,12 +9,17 @@ This sanitizer creates additional name variants for names that have
 addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
 only the main name part with the bracket part removed.
 """
 from typing import Callable
-def create(_):
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a name processing function that creates additional name variants
        for bracket addendums.
    """
-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
        """ Add variants for names that have a bracket extension.
        """
        if obj.names:
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@ -30,13 +30,17 @@ Arguments:
          any analyzer tagged) is retained. (default: replace)
 """
 from typing import Callable, Dict, Optional, List
 from nominatim.data import country_info
 from nominatim.tokenizer.sanitizers.base import ProcessInfo
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 class _AnalyzerByLanguage:
    """ Processor for tagging the language of names in a place.
    """
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
        self.filter_kind = config.get_filter_kind()
        self.replace = config.get('mode', 'replace') != 'append'
        self.whitelist = config.get('whitelist')
@ -44,8 +48,8 @@ class _AnalyzerByLanguage:
        self._compute_default_languages(config.get('use-defaults', 'no'))
-    def _compute_default_languages(self, use_defaults):
+    def _compute_default_languages(self, use_defaults: str) -> None:
-        self.deflangs = {}
+        self.deflangs: Dict[Optional[str], List[str]] = {}
        if use_defaults in ('mono', 'all'):
            for ccode, clangs in country_info.iterate('languages'):
@ -56,21 +60,21 @@ class _AnalyzerByLanguage:
                        self.deflangs[ccode] = clangs
-    def _suffix_matches(self, suffix):
+    def _suffix_matches(self, suffix: str) -> bool:
        if self.whitelist is None:
            return len(suffix) in (2, 3) and suffix.islower()
        return suffix in self.whitelist
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.names:
            return
        more_names = []
        for name in (n for n in obj.names
-                     if not n.has_attr('analyzer') and self.filter_kind(n)):
+                     if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
            if name.suffix:
                langs = [name.suffix] if self._suffix_matches(name.suffix) else None
            else:
@ -88,7 +92,7 @@ class _AnalyzerByLanguage:
        obj.names.extend(more_names)
-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a function that sets the analyzer property depending on the
        language of the tag.
    """
--- a/test/python/tokenizer/sanitizers/test_sanitizer_config.py
+++ b/test/python/tokenizer/sanitizers/test_sanitizer_config.py
@ -82,32 +82,32 @@ def test_create_split_regex_empty_delimiter():
 def test_create_kind_filter_no_params(inp):
    filt = SanitizerConfig().get_filter_kind()
-    assert filt(PlaceName('something', inp, ''))
+    assert filt(inp)
@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
 def test_create_kind_filter_custom_regex_positive(kind):
    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)
@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
 def test_create_kind_filter_custom_regex_negative(kind):
    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)
@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
 def test_create_kind_filter_many_positive(kind):
    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)
@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
 def test_create_kind_filter_many_negative(kind):
    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)