add type hints for sanitizers

2024-12-24 21:44:45 +03:00 · 2022-07-12 23:15:19 +02:00 · 2022-07-12 23:15:19 +02:00 · 62eedbb8f6
commit 62eedbb8f6
parent 5617bffe2f
10 changed files with 200 additions and 133 deletions
--- a/nominatim/data/postcode_format.py
+++ b/nominatim/data/postcode_format.py
@@ -79,7 +79,7 @@ class PostcodeFormatter:
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})


-    def get_matcher(self, country_code: str) -> Optional[CountryPostcodeMatcher]:
+    def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
        """ Return the CountryPostcodeMatcher for the given country.
            Returns None if the country doesn't have a postcode and the
            default matcher if there is no specific matcher configured for
@@ -88,10 +88,12 @@ class PostcodeFormatter:
        if country_code in self.country_without_postcode:
            return None

+        assert country_code is not None
+
        return self.country_matcher.get(country_code, self.default_matcher)


-    def match(self, country_code: str, postcode: str) -> Optional[Match[str]]:
+    def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
        """ Match the given postcode against the postcode pattern for this
            matcher. Returns a `re.Match` object if the country has a pattern
            and the match was successful or None if the match failed.
@@ -99,6 +101,8 @@ class PostcodeFormatter:
        if country_code in self.country_without_postcode:
            return None

+        assert country_code is not None
+
        return self.country_matcher.get(country_code, self.default_matcher).match(postcode)


--- a/nominatim/tokenizer/place_sanitizer.py
+++ b/nominatim/tokenizer/place_sanitizer.py
@@ -8,100 +8,13 @@
 Handler for cleaning name and address tags in place information before it
 is handed to the token analysis.
 """
+from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
 import importlib

 from nominatim.errors import UsageError
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
-
-class PlaceName:
-    """ A searchable name for a place together with properties.
-        Every name object saves the name proper and two basic properties:
-        * 'kind' describes the name of the OSM key used without any suffixes
-          (i.e. the part after the colon removed)
-        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
-          is the part of the key after the first colon.
-        In addition to that, the name may have arbitrary additional attributes.
-        Which attributes are used, depends on the token analyser.
-    """
-
-    def __init__(self, name, kind, suffix):
-        self.name = name
-        self.kind = kind
-        self.suffix = suffix
-        self.attr = {}
-
-
-    def __repr__(self):
-        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
-
-
-    def clone(self, name=None, kind=None, suffix=None, attr=None):
-        """ Create a deep copy of the place name, optionally with the
-            given parameters replaced. In the attribute list only the given
-            keys are updated. The list is not replaced completely.
-            In particular, the function cannot to be used to remove an
-            attribute from a place name.
-        """
-        newobj = PlaceName(name or self.name,
-                           kind or self.kind,
-                           suffix or self.suffix)
-
-        newobj.attr.update(self.attr)
-        if attr:
-            newobj.attr.update(attr)
-
-        return newobj
-
-
-    def set_attr(self, key, value):
-        """ Add the given property to the name. If the property was already
-            set, then the value is overwritten.
-        """
-        self.attr[key] = value
-
-
-    def get_attr(self, key, default=None):
-        """ Return the given property or the value of 'default' if it
-            is not set.
-        """
-        return self.attr.get(key, default)
-
-
-    def has_attr(self, key):
-        """ Check if the given attribute is set.
-        """
-        return key in self.attr
-
-
-class _ProcessInfo:
-    """ Container class for information handed into to handler functions.
-        The 'names' and 'address' members are mutable. A handler must change
-        them by either modifying the lists place or replacing the old content
-        with a new list.
-    """
-
-    def __init__(self, place):
-        self.place = place
-        self.names = self._convert_name_dict(place.name)
-        self.address = self._convert_name_dict(place.address)
-
-
-    @staticmethod
-    def _convert_name_dict(names):
-        """ Convert a dictionary of names into a list of PlaceNames.
-            The dictionary key is split into the primary part of the key
-            and the suffix (the part after an optional colon).
-        """
-        out = []
-
-        if names:
-            for key, value in names.items():
-                parts = key.split(':', 1)
-                out.append(PlaceName(value.strip(),
-                                     parts[0].strip(),
-                                     parts[1].strip() if len(parts) > 1 else None))
-
-        return out
+from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
+from nominatim.data.place_info import PlaceInfo


 class PlaceSanitizer:
@@ -109,24 +22,24 @@ class PlaceSanitizer:
        names and address before they are used by the token analysers.
    """

-    def __init__(self, rules):
-        self.handlers = []
+    def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None:
+        self.handlers: List[Callable[[ProcessInfo], None]] = []

        if rules:
            for func in rules:
                if 'step' not in func:
                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
                module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
-                handler_module = importlib.import_module(module_name)
+                handler_module: SanitizerHandler = importlib.import_module(module_name)
                self.handlers.append(handler_module.create(SanitizerConfig(func)))


-    def process_names(self, place):
+    def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
        """ Extract a sanitized list of names and address parts from the
            given place. The function returns a tuple
            (list of names, list of address names)
        """
-        obj = _ProcessInfo(place)
+        obj = ProcessInfo(place)

        for func in self.handlers:
            func(obj)
--- a/nominatim/tokenizer/sanitizers/base.py
+++ b/nominatim/tokenizer/sanitizers/base.py
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for sanitizers.
+"""
+from typing import Optional, Dict, List, Mapping, Callable
+
+from typing_extensions import Protocol, Final
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+from nominatim.data.place_info import PlaceInfo
+
+class PlaceName:
+    """ A searchable name for a place together with properties.
+        Every name object saves the name proper and two basic properties:
+        * 'kind' describes the name of the OSM key used without any suffixes
+          (i.e. the part after the colon removed)
+        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+          is the part of the key after the first colon.
+        In addition to that, the name may have arbitrary additional attributes.
+        Which attributes are used, depends on the token analyser.
+    """
+
+    def __init__(self, name: str, kind: str, suffix: Optional[str]):
+        self.name = name
+        self.kind = kind
+        self.suffix = suffix
+        self.attr: Dict[str, str] = {}
+
+
+    def __repr__(self) -> str:
+        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+    def clone(self, name: Optional[str] = None,
+              kind: Optional[str] = None,
+              suffix: Optional[str] = None,
+              attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
+        """ Create a deep copy of the place name, optionally with the
+            given parameters replaced. In the attribute list only the given
+            keys are updated. The list is not replaced completely.
+            In particular, the function cannot be used to remove an
+            attribute from a place name.
+        """
+        newobj = PlaceName(name or self.name,
+                           kind or self.kind,
+                           suffix or self.suffix)
+
+        newobj.attr.update(self.attr)
+        if attr:
+            newobj.attr.update(attr)
+
+        return newobj
+
+
+    def set_attr(self, key: str, value: str) -> None:
+        """ Add the given property to the name. If the property was already
+            set, then the value is overwritten.
+        """
+        self.attr[key] = value
+
+
+    def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
+        """ Return the given property or the value of 'default' if it
+            is not set.
+        """
+        return self.attr.get(key, default)
+
+
+    def has_attr(self, key: str) -> bool:
+        """ Check if the given attribute is set.
+        """
+        return key in self.attr
+
+
+class ProcessInfo:
+    """ Container class for information handed to handler functions.
+        The 'names' and 'address' members are mutable. A handler must change
+        them by either modifying the lists in place or replacing the old content
+        with a new list.
+    """
+
+    def __init__(self, place: PlaceInfo):
+        self.place: Final = place
+        self.names = self._convert_name_dict(place.name)
+        self.address = self._convert_name_dict(place.address)
+
+
+    @staticmethod
+    def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
+        """ Convert a dictionary of names into a list of PlaceNames.
+            The dictionary key is split into the primary part of the key
+            and the suffix (the part after an optional colon).
+        """
+        out = []
+
+        if names:
+            for key, value in names.items():
+                parts = key.split(':', 1)
+                out.append(PlaceName(value.strip(),
+                                     parts[0].strip(),
+                                     parts[1].strip() if len(parts) > 1 else None))
+
+        return out
+
+
+class SanitizerHandler(Protocol):
+    """ Protocol for sanitizer modules.
+    """
+
+    def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
+        """
+        A sanitizer must define a single function `create`. It takes the
+        dictionary with the configuration information for the sanitizer and
+        returns a function that transforms name and address.
+        """
--- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py
+++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@@ -24,11 +24,15 @@ Arguments:
                     or a list of strings, where each string is a regular
                     expression that must match the full house number value.
 """
+from typing import Callable, Iterator, List
 import re

+from nominatim.tokenizer.sanitizers.base import ProcessInfo, PlaceName
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
 class _HousenumberSanitizer:

-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
        self.filter_kind = config.get_filter_kind('housenumber')
        self.split_regexp = config.get_delimiter()

@@ -37,13 +41,13 @@ class _HousenumberSanitizer:



-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.address:
            return

-        new_address = []
+        new_address: List[PlaceName] = []
        for item in obj.address:
-            if self.filter_kind(item):
+            if self.filter_kind(item.kind):
                if self._treat_as_name(item.name):
                    obj.names.append(item.clone(kind='housenumber'))
                else:
@@ -56,7 +60,7 @@ class _HousenumberSanitizer:
        obj.address = new_address


-    def sanitize(self, value):
+    def sanitize(self, value: str) -> Iterator[str]:
        """ Extract housenumbers in a regularized format from an OSM value.

            The function works as a generator that yields all valid housenumbers
@@ -67,16 +71,15 @@ class _HousenumberSanitizer:
                yield from self._regularize(hnr)


-    @staticmethod
-    def _regularize(hnr):
+    def _regularize(self, hnr: str) -> Iterator[str]:
        yield hnr


-    def _treat_as_name(self, housenumber):
+    def _treat_as_name(self, housenumber: str) -> bool:
        return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)


-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a housenumber processing function.
    """

--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -20,11 +20,15 @@ Arguments:
                        objects that have no country assigned. These are always
                        assumed to have no postcode.
 """
+from typing import Callable, Optional, Tuple
+
 from nominatim.data.postcode_format import PostcodeFormatter
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig

 class _PostcodeSanitizer:

-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
        self.convert_to_address = config.get_bool('convert-to-address', True)
        self.matcher = PostcodeFormatter()

@@ -33,7 +37,7 @@ class _PostcodeSanitizer:
            self.matcher.set_default_pattern(default_pattern)


-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.address:
            return

@@ -52,7 +56,7 @@ class _PostcodeSanitizer:
                postcode.set_attr('variant', formatted[1])


-    def scan(self, postcode, country):
+    def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
        """ Check the postcode for correct formatting and return the
            normalized version. Returns None if the postcode does not
            correspond to the oficial format of the given country.
@@ -61,13 +65,15 @@ class _PostcodeSanitizer:
        if match is None:
            return None

+        assert country is not None
+
        return self.matcher.normalize(country, match),\
               ' '.join(filter(lambda p: p is not None, match.groups()))




-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a housenumber processing function.
    """

--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -7,20 +7,28 @@
 """
 Configuration for Sanitizers.
 """
+from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
 from collections import UserDict
 import re

 from nominatim.errors import UsageError

-class SanitizerConfig(UserDict):
+# Work around UserDict not being subscriptable at runtime in Python < 3.9.
+# See https://github.com/python/typing/issues/60#issuecomment-869757075
+if TYPE_CHECKING:
+    _BaseUserDict = UserDict[str, Any]
+else:
+    _BaseUserDict = UserDict
+
+class SanitizerConfig(_BaseUserDict):
    """ Dictionary with configuration options for a sanitizer.

-        In addition to the usualy dictionary function, the class provides
+        In addition to the usual dictionary function, the class provides
        accessors to standard sanatizer options that are used by many of the
        sanitizers.
    """

-    def get_string_list(self, param, default=tuple()):
+    def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
        """ Extract a configuration parameter as a string list.
            If the parameter value is a simple string, it is returned as a
            one-item list. If the parameter value does not exist, the given
@@ -44,7 +52,7 @@ class SanitizerConfig(UserDict):
        return values


-    def get_bool(self, param, default=None):
+    def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
        """ Extract a configuration parameter as a boolean.
            The parameter must be one of the yaml boolean values or an
            user error will be raised. If `default` is given, then the parameter
@@ -58,7 +66,7 @@ class SanitizerConfig(UserDict):
        return value


-    def get_delimiter(self, default=',;'):
+    def get_delimiter(self, default: str = ',;') -> Pattern[str]:
        """ Return the 'delimiter' parameter in the configuration as a
            compiled regular expression that can be used to split the names on the
            delimiters. The regular expression makes sure that the resulting names
@@ -76,7 +84,7 @@ class SanitizerConfig(UserDict):
        return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))


-    def get_filter_kind(self, *default):
+    def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
        """ Return a filter function for the name kind from the 'filter-kind'
            config parameter. The filter functions takes a name item and returns
            True when the item passes the filter.
@@ -93,4 +101,4 @@ class SanitizerConfig(UserDict):

        regexes = [re.compile(regex) for regex in filters]

-        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
+        return lambda name: any(regex.fullmatch(name) for regex in regexes)
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -11,13 +11,18 @@ Arguments:
    delimiters: Define the set of characters to be used for
                splitting the list. (default: ',;')
 """
-def create(config):
+from typing import Callable
+
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a name processing function that splits name values with
        multiple values into their components.
    """
    regexp = config.get_delimiter()

-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
        if not obj.names:
            return

--- a/nominatim/tokenizer/sanitizers/strip_brace_terms.py
+++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
@@ -9,12 +9,17 @@ This sanitizer creates additional name variants for names that have
 addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
 only the main name part with the bracket part removed.
 """
+from typing import Callable

-def create(_):
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+
+def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a name processing function that creates additional name variants
        for bracket addendums.
    """
-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
        """ Add variants for names that have a bracket extension.
        """
        if obj.names:
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -30,13 +30,17 @@ Arguments:
          any analyzer tagged) is retained. (default: replace)

 """
+from typing import Callable, Dict, Optional, List
+
 from nominatim.data import country_info
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig

 class _AnalyzerByLanguage:
    """ Processor for tagging the language of names in a place.
    """

-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
        self.filter_kind = config.get_filter_kind()
        self.replace = config.get('mode', 'replace') != 'append'
        self.whitelist = config.get('whitelist')
@@ -44,8 +48,8 @@ class _AnalyzerByLanguage:
        self._compute_default_languages(config.get('use-defaults', 'no'))


-    def _compute_default_languages(self, use_defaults):
-        self.deflangs = {}
+    def _compute_default_languages(self, use_defaults: str) -> None:
+        self.deflangs: Dict[Optional[str], List[str]] = {}

        if use_defaults in ('mono', 'all'):
            for ccode, clangs in country_info.iterate('languages'):
@@ -56,21 +60,21 @@ class _AnalyzerByLanguage:
                        self.deflangs[ccode] = clangs


-    def _suffix_matches(self, suffix):
+    def _suffix_matches(self, suffix: str) -> bool:
        if self.whitelist is None:
            return len(suffix) in (2, 3) and suffix.islower()

        return suffix in self.whitelist


-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
        if not obj.names:
            return

        more_names = []

        for name in (n for n in obj.names
-                     if not n.has_attr('analyzer') and self.filter_kind(n)):
+                     if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
            if name.suffix:
                langs = [name.suffix] if self._suffix_matches(name.suffix) else None
            else:
@@ -88,7 +92,7 @@ class _AnalyzerByLanguage:
        obj.names.extend(more_names)


-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a function that sets the analyzer property depending on the
        language of the tag.
    """
--- a/test/python/tokenizer/sanitizers/test_sanitizer_config.py
+++ b/test/python/tokenizer/sanitizers/test_sanitizer_config.py
@@ -82,32 +82,32 @@ def test_create_split_regex_empty_delimiter():
 def test_create_kind_filter_no_params(inp):
    filt = SanitizerConfig().get_filter_kind()

-    assert filt(PlaceName('something', inp, ''))
+    assert filt(inp)


 @pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
 def test_create_kind_filter_custom_regex_positive(kind):
    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()

-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)


 @pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
 def test_create_kind_filter_custom_regex_negative(kind):
    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()

-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)


 @pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
 def test_create_kind_filter_many_positive(kind):
    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()

-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)


 @pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
 def test_create_kind_filter_many_negative(kind):
    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()

-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)