mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-22 21:28:10 +03:00
add type hints for sanitizers
This commit is contained in:
parent
5617bffe2f
commit
62eedbb8f6
@ -79,7 +79,7 @@ class PostcodeFormatter:
|
|||||||
self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
|
self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
|
||||||
|
|
||||||
|
|
||||||
def get_matcher(self, country_code: str) -> Optional[CountryPostcodeMatcher]:
|
def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
|
||||||
""" Return the CountryPostcodeMatcher for the given country.
|
""" Return the CountryPostcodeMatcher for the given country.
|
||||||
Returns None if the country doesn't have a postcode and the
|
Returns None if the country doesn't have a postcode and the
|
||||||
default matcher if there is no specific matcher configured for
|
default matcher if there is no specific matcher configured for
|
||||||
@ -88,10 +88,12 @@ class PostcodeFormatter:
|
|||||||
if country_code in self.country_without_postcode:
|
if country_code in self.country_without_postcode:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
assert country_code is not None
|
||||||
|
|
||||||
return self.country_matcher.get(country_code, self.default_matcher)
|
return self.country_matcher.get(country_code, self.default_matcher)
|
||||||
|
|
||||||
|
|
||||||
def match(self, country_code: str, postcode: str) -> Optional[Match[str]]:
|
def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
|
||||||
""" Match the given postcode against the postcode pattern for this
|
""" Match the given postcode against the postcode pattern for this
|
||||||
matcher. Returns a `re.Match` object if the country has a pattern
|
matcher. Returns a `re.Match` object if the country has a pattern
|
||||||
and the match was successful or None if the match failed.
|
and the match was successful or None if the match failed.
|
||||||
@ -99,6 +101,8 @@ class PostcodeFormatter:
|
|||||||
if country_code in self.country_without_postcode:
|
if country_code in self.country_without_postcode:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
assert country_code is not None
|
||||||
|
|
||||||
return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
|
return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
|
||||||
|
|
||||||
|
|
||||||
|
@ -8,100 +8,13 @@
|
|||||||
Handler for cleaning name and address tags in place information before it
|
Handler for cleaning name and address tags in place information before it
|
||||||
is handed to the token analysis.
|
is handed to the token analysis.
|
||||||
"""
|
"""
|
||||||
|
from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
|
||||||
import importlib
|
import importlib
|
||||||
|
|
||||||
from nominatim.errors import UsageError
|
from nominatim.errors import UsageError
|
||||||
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||||
|
from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
|
||||||
class PlaceName:
|
from nominatim.data.place_info import PlaceInfo
|
||||||
""" A searchable name for a place together with properties.
|
|
||||||
Every name object saves the name proper and two basic properties:
|
|
||||||
* 'kind' describes the name of the OSM key used without any suffixes
|
|
||||||
(i.e. the part after the colon removed)
|
|
||||||
* 'suffix' contains the suffix of the OSM tag, if any. The suffix
|
|
||||||
is the part of the key after the first colon.
|
|
||||||
In addition to that, the name may have arbitrary additional attributes.
|
|
||||||
Which attributes are used, depends on the token analyser.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, name, kind, suffix):
|
|
||||||
self.name = name
|
|
||||||
self.kind = kind
|
|
||||||
self.suffix = suffix
|
|
||||||
self.attr = {}
|
|
||||||
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
|
|
||||||
|
|
||||||
|
|
||||||
def clone(self, name=None, kind=None, suffix=None, attr=None):
|
|
||||||
""" Create a deep copy of the place name, optionally with the
|
|
||||||
given parameters replaced. In the attribute list only the given
|
|
||||||
keys are updated. The list is not replaced completely.
|
|
||||||
In particular, the function cannot to be used to remove an
|
|
||||||
attribute from a place name.
|
|
||||||
"""
|
|
||||||
newobj = PlaceName(name or self.name,
|
|
||||||
kind or self.kind,
|
|
||||||
suffix or self.suffix)
|
|
||||||
|
|
||||||
newobj.attr.update(self.attr)
|
|
||||||
if attr:
|
|
||||||
newobj.attr.update(attr)
|
|
||||||
|
|
||||||
return newobj
|
|
||||||
|
|
||||||
|
|
||||||
def set_attr(self, key, value):
|
|
||||||
""" Add the given property to the name. If the property was already
|
|
||||||
set, then the value is overwritten.
|
|
||||||
"""
|
|
||||||
self.attr[key] = value
|
|
||||||
|
|
||||||
|
|
||||||
def get_attr(self, key, default=None):
|
|
||||||
""" Return the given property or the value of 'default' if it
|
|
||||||
is not set.
|
|
||||||
"""
|
|
||||||
return self.attr.get(key, default)
|
|
||||||
|
|
||||||
|
|
||||||
def has_attr(self, key):
|
|
||||||
""" Check if the given attribute is set.
|
|
||||||
"""
|
|
||||||
return key in self.attr
|
|
||||||
|
|
||||||
|
|
||||||
class _ProcessInfo:
|
|
||||||
""" Container class for information handed into to handler functions.
|
|
||||||
The 'names' and 'address' members are mutable. A handler must change
|
|
||||||
them by either modifying the lists place or replacing the old content
|
|
||||||
with a new list.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, place):
|
|
||||||
self.place = place
|
|
||||||
self.names = self._convert_name_dict(place.name)
|
|
||||||
self.address = self._convert_name_dict(place.address)
|
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _convert_name_dict(names):
|
|
||||||
""" Convert a dictionary of names into a list of PlaceNames.
|
|
||||||
The dictionary key is split into the primary part of the key
|
|
||||||
and the suffix (the part after an optional colon).
|
|
||||||
"""
|
|
||||||
out = []
|
|
||||||
|
|
||||||
if names:
|
|
||||||
for key, value in names.items():
|
|
||||||
parts = key.split(':', 1)
|
|
||||||
out.append(PlaceName(value.strip(),
|
|
||||||
parts[0].strip(),
|
|
||||||
parts[1].strip() if len(parts) > 1 else None))
|
|
||||||
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
class PlaceSanitizer:
|
class PlaceSanitizer:
|
||||||
@ -109,24 +22,24 @@ class PlaceSanitizer:
|
|||||||
names and address before they are used by the token analysers.
|
names and address before they are used by the token analysers.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, rules):
|
def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None:
|
||||||
self.handlers = []
|
self.handlers: List[Callable[[ProcessInfo], None]] = []
|
||||||
|
|
||||||
if rules:
|
if rules:
|
||||||
for func in rules:
|
for func in rules:
|
||||||
if 'step' not in func:
|
if 'step' not in func:
|
||||||
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
|
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
|
||||||
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
|
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
|
||||||
handler_module = importlib.import_module(module_name)
|
handler_module: SanitizerHandler = importlib.import_module(module_name)
|
||||||
self.handlers.append(handler_module.create(SanitizerConfig(func)))
|
self.handlers.append(handler_module.create(SanitizerConfig(func)))
|
||||||
|
|
||||||
|
|
||||||
def process_names(self, place):
|
def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
|
||||||
""" Extract a sanitized list of names and address parts from the
|
""" Extract a sanitized list of names and address parts from the
|
||||||
given place. The function returns a tuple
|
given place. The function returns a tuple
|
||||||
(list of names, list of address names)
|
(list of names, list of address names)
|
||||||
"""
|
"""
|
||||||
obj = _ProcessInfo(place)
|
obj = ProcessInfo(place)
|
||||||
|
|
||||||
for func in self.handlers:
|
for func in self.handlers:
|
||||||
func(obj)
|
func(obj)
|
||||||
|
119
nominatim/tokenizer/sanitizers/base.py
Normal file
119
nominatim/tokenizer/sanitizers/base.py
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2022 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Common data types and protocols for sanitizers.
|
||||||
|
"""
|
||||||
|
from typing import Optional, Dict, List, Mapping, Callable
|
||||||
|
|
||||||
|
from typing_extensions import Protocol, Final
|
||||||
|
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||||
|
from nominatim.data.place_info import PlaceInfo
|
||||||
|
|
||||||
|
class PlaceName:
|
||||||
|
""" A searchable name for a place together with properties.
|
||||||
|
Every name object saves the name proper and two basic properties:
|
||||||
|
* 'kind' describes the name of the OSM key used without any suffixes
|
||||||
|
(i.e. the part after the colon removed)
|
||||||
|
* 'suffix' contains the suffix of the OSM tag, if any. The suffix
|
||||||
|
is the part of the key after the first colon.
|
||||||
|
In addition to that, the name may have arbitrary additional attributes.
|
||||||
|
Which attributes are used, depends on the token analyser.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, name: str, kind: str, suffix: Optional[str]):
|
||||||
|
self.name = name
|
||||||
|
self.kind = kind
|
||||||
|
self.suffix = suffix
|
||||||
|
self.attr: Dict[str, str] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
|
||||||
|
|
||||||
|
|
||||||
|
def clone(self, name: Optional[str] = None,
|
||||||
|
kind: Optional[str] = None,
|
||||||
|
suffix: Optional[str] = None,
|
||||||
|
attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
|
||||||
|
""" Create a deep copy of the place name, optionally with the
|
||||||
|
given parameters replaced. In the attribute list only the given
|
||||||
|
keys are updated. The list is not replaced completely.
|
||||||
|
In particular, the function cannot be used to remove an
|
||||||
|
attribute from a place name.
|
||||||
|
"""
|
||||||
|
newobj = PlaceName(name or self.name,
|
||||||
|
kind or self.kind,
|
||||||
|
suffix or self.suffix)
|
||||||
|
|
||||||
|
newobj.attr.update(self.attr)
|
||||||
|
if attr:
|
||||||
|
newobj.attr.update(attr)
|
||||||
|
|
||||||
|
return newobj
|
||||||
|
|
||||||
|
|
||||||
|
def set_attr(self, key: str, value: str) -> None:
|
||||||
|
""" Add the given property to the name. If the property was already
|
||||||
|
set, then the value is overwritten.
|
||||||
|
"""
|
||||||
|
self.attr[key] = value
|
||||||
|
|
||||||
|
|
||||||
|
def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
|
||||||
|
""" Return the given property or the value of 'default' if it
|
||||||
|
is not set.
|
||||||
|
"""
|
||||||
|
return self.attr.get(key, default)
|
||||||
|
|
||||||
|
|
||||||
|
def has_attr(self, key: str) -> bool:
|
||||||
|
""" Check if the given attribute is set.
|
||||||
|
"""
|
||||||
|
return key in self.attr
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessInfo:
|
||||||
|
""" Container class for information handed to handler functions.
|
||||||
|
The 'names' and 'address' members are mutable. A handler must change
|
||||||
|
them by either modifying the lists in place or replacing the old content
|
||||||
|
with a new list.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, place: PlaceInfo):
|
||||||
|
self.place: Final = place
|
||||||
|
self.names = self._convert_name_dict(place.name)
|
||||||
|
self.address = self._convert_name_dict(place.address)
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
|
||||||
|
""" Convert a dictionary of names into a list of PlaceNames.
|
||||||
|
The dictionary key is split into the primary part of the key
|
||||||
|
and the suffix (the part after an optional colon).
|
||||||
|
"""
|
||||||
|
out = []
|
||||||
|
|
||||||
|
if names:
|
||||||
|
for key, value in names.items():
|
||||||
|
parts = key.split(':', 1)
|
||||||
|
out.append(PlaceName(value.strip(),
|
||||||
|
parts[0].strip(),
|
||||||
|
parts[1].strip() if len(parts) > 1 else None))
|
||||||
|
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class SanitizerHandler(Protocol):
|
||||||
|
""" Protocol for sanitizer modules.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||||
|
"""
|
||||||
|
A sanitizer must define a single function `create`. It takes the
|
||||||
|
dictionary with the configuration information for the sanitizer and
|
||||||
|
returns a function that transforms name and address.
|
||||||
|
"""
|
@ -24,11 +24,15 @@ Arguments:
|
|||||||
or a list of strings, where each string is a regular
|
or a list of strings, where each string is a regular
|
||||||
expression that must match the full house number value.
|
expression that must match the full house number value.
|
||||||
"""
|
"""
|
||||||
|
from typing import Callable, Iterator, List
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from nominatim.tokenizer.sanitizers.base import ProcessInfo, PlaceName
|
||||||
|
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||||
|
|
||||||
class _HousenumberSanitizer:
|
class _HousenumberSanitizer:
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config: SanitizerConfig) -> None:
|
||||||
self.filter_kind = config.get_filter_kind('housenumber')
|
self.filter_kind = config.get_filter_kind('housenumber')
|
||||||
self.split_regexp = config.get_delimiter()
|
self.split_regexp = config.get_delimiter()
|
||||||
|
|
||||||
@ -37,13 +41,13 @@ class _HousenumberSanitizer:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, obj):
|
def __call__(self, obj: ProcessInfo) -> None:
|
||||||
if not obj.address:
|
if not obj.address:
|
||||||
return
|
return
|
||||||
|
|
||||||
new_address = []
|
new_address: List[PlaceName] = []
|
||||||
for item in obj.address:
|
for item in obj.address:
|
||||||
if self.filter_kind(item):
|
if self.filter_kind(item.kind):
|
||||||
if self._treat_as_name(item.name):
|
if self._treat_as_name(item.name):
|
||||||
obj.names.append(item.clone(kind='housenumber'))
|
obj.names.append(item.clone(kind='housenumber'))
|
||||||
else:
|
else:
|
||||||
@ -56,7 +60,7 @@ class _HousenumberSanitizer:
|
|||||||
obj.address = new_address
|
obj.address = new_address
|
||||||
|
|
||||||
|
|
||||||
def sanitize(self, value):
|
def sanitize(self, value: str) -> Iterator[str]:
|
||||||
""" Extract housenumbers in a regularized format from an OSM value.
|
""" Extract housenumbers in a regularized format from an OSM value.
|
||||||
|
|
||||||
The function works as a generator that yields all valid housenumbers
|
The function works as a generator that yields all valid housenumbers
|
||||||
@ -67,16 +71,15 @@ class _HousenumberSanitizer:
|
|||||||
yield from self._regularize(hnr)
|
yield from self._regularize(hnr)
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
def _regularize(self, hnr: str) -> Iterator[str]:
|
||||||
def _regularize(hnr):
|
|
||||||
yield hnr
|
yield hnr
|
||||||
|
|
||||||
|
|
||||||
def _treat_as_name(self, housenumber):
|
def _treat_as_name(self, housenumber: str) -> bool:
|
||||||
return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
|
return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
|
||||||
|
|
||||||
|
|
||||||
def create(config):
|
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||||
""" Create a housenumber processing function.
|
""" Create a housenumber processing function.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -20,11 +20,15 @@ Arguments:
|
|||||||
objects that have no country assigned. These are always
|
objects that have no country assigned. These are always
|
||||||
assumed to have no postcode.
|
assumed to have no postcode.
|
||||||
"""
|
"""
|
||||||
|
from typing import Callable, Optional, Tuple
|
||||||
|
|
||||||
from nominatim.data.postcode_format import PostcodeFormatter
|
from nominatim.data.postcode_format import PostcodeFormatter
|
||||||
|
from nominatim.tokenizer.sanitizers.base import ProcessInfo
|
||||||
|
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||||
|
|
||||||
class _PostcodeSanitizer:
|
class _PostcodeSanitizer:
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config: SanitizerConfig) -> None:
|
||||||
self.convert_to_address = config.get_bool('convert-to-address', True)
|
self.convert_to_address = config.get_bool('convert-to-address', True)
|
||||||
self.matcher = PostcodeFormatter()
|
self.matcher = PostcodeFormatter()
|
||||||
|
|
||||||
@ -33,7 +37,7 @@ class _PostcodeSanitizer:
|
|||||||
self.matcher.set_default_pattern(default_pattern)
|
self.matcher.set_default_pattern(default_pattern)
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, obj):
|
def __call__(self, obj: ProcessInfo) -> None:
|
||||||
if not obj.address:
|
if not obj.address:
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -52,7 +56,7 @@ class _PostcodeSanitizer:
|
|||||||
postcode.set_attr('variant', formatted[1])
|
postcode.set_attr('variant', formatted[1])
|
||||||
|
|
||||||
|
|
||||||
def scan(self, postcode, country):
|
def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
|
||||||
""" Check the postcode for correct formatting and return the
|
""" Check the postcode for correct formatting and return the
|
||||||
normalized version. Returns None if the postcode does not
|
normalized version. Returns None if the postcode does not
|
||||||
correspond to the official format of the given country.
|
correspond to the official format of the given country.
|
||||||
@ -61,13 +65,15 @@ class _PostcodeSanitizer:
|
|||||||
if match is None:
|
if match is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
assert country is not None
|
||||||
|
|
||||||
return self.matcher.normalize(country, match),\
|
return self.matcher.normalize(country, match),\
|
||||||
' '.join(filter(lambda p: p is not None, match.groups()))
|
' '.join(filter(lambda p: p is not None, match.groups()))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def create(config):
|
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||||
""" Create a postcode processing function.
|
""" Create a postcode processing function.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -7,20 +7,28 @@
|
|||||||
"""
|
"""
|
||||||
Configuration for Sanitizers.
|
Configuration for Sanitizers.
|
||||||
"""
|
"""
|
||||||
|
from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
|
||||||
from collections import UserDict
|
from collections import UserDict
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from nominatim.errors import UsageError
|
from nominatim.errors import UsageError
|
||||||
|
|
||||||
class SanitizerConfig(UserDict):
|
# working around missing generics in Python < 3.8
|
||||||
|
# See https://github.com/python/typing/issues/60#issuecomment-869757075
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
_BaseUserDict = UserDict[str, Any]
|
||||||
|
else:
|
||||||
|
_BaseUserDict = UserDict
|
||||||
|
|
||||||
|
class SanitizerConfig(_BaseUserDict):
|
||||||
""" Dictionary with configuration options for a sanitizer.
|
""" Dictionary with configuration options for a sanitizer.
|
||||||
|
|
||||||
In addition to the usualy dictionary function, the class provides
|
In addition to the usual dictionary function, the class provides
|
||||||
accessors to standard sanitizer options that are used by many of the
|
accessors to standard sanitizer options that are used by many of the
|
||||||
sanitizers.
|
sanitizers.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def get_string_list(self, param, default=tuple()):
|
def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
|
||||||
""" Extract a configuration parameter as a string list.
|
""" Extract a configuration parameter as a string list.
|
||||||
If the parameter value is a simple string, it is returned as a
|
If the parameter value is a simple string, it is returned as a
|
||||||
one-item list. If the parameter value does not exist, the given
|
one-item list. If the parameter value does not exist, the given
|
||||||
@ -44,7 +52,7 @@ class SanitizerConfig(UserDict):
|
|||||||
return values
|
return values
|
||||||
|
|
||||||
|
|
||||||
def get_bool(self, param, default=None):
|
def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
|
||||||
""" Extract a configuration parameter as a boolean.
|
""" Extract a configuration parameter as a boolean.
|
||||||
The parameter must be one of the yaml boolean values or a
|
The parameter must be one of the yaml boolean values or a
|
||||||
user error will be raised. If `default` is given, then the parameter
|
user error will be raised. If `default` is given, then the parameter
|
||||||
@ -58,7 +66,7 @@ class SanitizerConfig(UserDict):
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
def get_delimiter(self, default=',;'):
|
def get_delimiter(self, default: str = ',;') -> Pattern[str]:
|
||||||
""" Return the 'delimiter' parameter in the configuration as a
|
""" Return the 'delimiter' parameter in the configuration as a
|
||||||
compiled regular expression that can be used to split the names on the
|
compiled regular expression that can be used to split the names on the
|
||||||
delimiters. The regular expression makes sure that the resulting names
|
delimiters. The regular expression makes sure that the resulting names
|
||||||
@ -76,7 +84,7 @@ class SanitizerConfig(UserDict):
|
|||||||
return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
|
return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
|
||||||
|
|
||||||
|
|
||||||
def get_filter_kind(self, *default):
|
def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
|
||||||
""" Return a filter function for the name kind from the 'filter-kind'
|
""" Return a filter function for the name kind from the 'filter-kind'
|
||||||
config parameter. The filter function takes a name item and returns
|
config parameter. The filter function takes the kind of a name item and returns
|
||||||
True when the item passes the filter.
|
True when the item passes the filter.
|
||||||
@ -93,4 +101,4 @@ class SanitizerConfig(UserDict):
|
|||||||
|
|
||||||
regexes = [re.compile(regex) for regex in filters]
|
regexes = [re.compile(regex) for regex in filters]
|
||||||
|
|
||||||
return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
|
return lambda name: any(regex.fullmatch(name) for regex in regexes)
|
||||||
|
@ -11,13 +11,18 @@ Arguments:
|
|||||||
delimiters: Define the set of characters to be used for
|
delimiters: Define the set of characters to be used for
|
||||||
splitting the list. (default: ',;')
|
splitting the list. (default: ',;')
|
||||||
"""
|
"""
|
||||||
def create(config):
|
from typing import Callable
|
||||||
|
|
||||||
|
from nominatim.tokenizer.sanitizers.base import ProcessInfo
|
||||||
|
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||||
|
|
||||||
|
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||||
""" Create a name processing function that splits name values with
|
""" Create a name processing function that splits name values with
|
||||||
multiple values into their components.
|
multiple values into their components.
|
||||||
"""
|
"""
|
||||||
regexp = config.get_delimiter()
|
regexp = config.get_delimiter()
|
||||||
|
|
||||||
def _process(obj):
|
def _process(obj: ProcessInfo) -> None:
|
||||||
if not obj.names:
|
if not obj.names:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -9,12 +9,17 @@ This sanitizer creates additional name variants for names that have
|
|||||||
addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
|
addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
|
||||||
only the main name part with the bracket part removed.
|
only the main name part with the bracket part removed.
|
||||||
"""
|
"""
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
def create(_):
|
from nominatim.tokenizer.sanitizers.base import ProcessInfo
|
||||||
|
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||||
|
|
||||||
|
|
||||||
|
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||||
""" Create a name processing function that creates additional name variants
|
""" Create a name processing function that creates additional name variants
|
||||||
for bracket addendums.
|
for bracket addendums.
|
||||||
"""
|
"""
|
||||||
def _process(obj):
|
def _process(obj: ProcessInfo) -> None:
|
||||||
""" Add variants for names that have a bracket extension.
|
""" Add variants for names that have a bracket extension.
|
||||||
"""
|
"""
|
||||||
if obj.names:
|
if obj.names:
|
||||||
|
@ -30,13 +30,17 @@ Arguments:
|
|||||||
any analyzer tagged) is retained. (default: replace)
|
any analyzer tagged) is retained. (default: replace)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
from typing import Callable, Dict, Optional, List
|
||||||
|
|
||||||
from nominatim.data import country_info
|
from nominatim.data import country_info
|
||||||
|
from nominatim.tokenizer.sanitizers.base import ProcessInfo
|
||||||
|
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||||
|
|
||||||
class _AnalyzerByLanguage:
|
class _AnalyzerByLanguage:
|
||||||
""" Processor for tagging the language of names in a place.
|
""" Processor for tagging the language of names in a place.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config: SanitizerConfig) -> None:
|
||||||
self.filter_kind = config.get_filter_kind()
|
self.filter_kind = config.get_filter_kind()
|
||||||
self.replace = config.get('mode', 'replace') != 'append'
|
self.replace = config.get('mode', 'replace') != 'append'
|
||||||
self.whitelist = config.get('whitelist')
|
self.whitelist = config.get('whitelist')
|
||||||
@ -44,8 +48,8 @@ class _AnalyzerByLanguage:
|
|||||||
self._compute_default_languages(config.get('use-defaults', 'no'))
|
self._compute_default_languages(config.get('use-defaults', 'no'))
|
||||||
|
|
||||||
|
|
||||||
def _compute_default_languages(self, use_defaults):
|
def _compute_default_languages(self, use_defaults: str) -> None:
|
||||||
self.deflangs = {}
|
self.deflangs: Dict[Optional[str], List[str]] = {}
|
||||||
|
|
||||||
if use_defaults in ('mono', 'all'):
|
if use_defaults in ('mono', 'all'):
|
||||||
for ccode, clangs in country_info.iterate('languages'):
|
for ccode, clangs in country_info.iterate('languages'):
|
||||||
@ -56,21 +60,21 @@ class _AnalyzerByLanguage:
|
|||||||
self.deflangs[ccode] = clangs
|
self.deflangs[ccode] = clangs
|
||||||
|
|
||||||
|
|
||||||
def _suffix_matches(self, suffix):
|
def _suffix_matches(self, suffix: str) -> bool:
|
||||||
if self.whitelist is None:
|
if self.whitelist is None:
|
||||||
return len(suffix) in (2, 3) and suffix.islower()
|
return len(suffix) in (2, 3) and suffix.islower()
|
||||||
|
|
||||||
return suffix in self.whitelist
|
return suffix in self.whitelist
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, obj):
|
def __call__(self, obj: ProcessInfo) -> None:
|
||||||
if not obj.names:
|
if not obj.names:
|
||||||
return
|
return
|
||||||
|
|
||||||
more_names = []
|
more_names = []
|
||||||
|
|
||||||
for name in (n for n in obj.names
|
for name in (n for n in obj.names
|
||||||
if not n.has_attr('analyzer') and self.filter_kind(n)):
|
if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
|
||||||
if name.suffix:
|
if name.suffix:
|
||||||
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
|
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
|
||||||
else:
|
else:
|
||||||
@ -88,7 +92,7 @@ class _AnalyzerByLanguage:
|
|||||||
obj.names.extend(more_names)
|
obj.names.extend(more_names)
|
||||||
|
|
||||||
|
|
||||||
def create(config):
|
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
|
||||||
""" Create a function that sets the analyzer property depending on the
|
""" Create a function that sets the analyzer property depending on the
|
||||||
language of the tag.
|
language of the tag.
|
||||||
"""
|
"""
|
||||||
|
@ -82,32 +82,32 @@ def test_create_split_regex_empty_delimiter():
|
|||||||
def test_create_kind_filter_no_params(inp):
|
def test_create_kind_filter_no_params(inp):
|
||||||
filt = SanitizerConfig().get_filter_kind()
|
filt = SanitizerConfig().get_filter_kind()
|
||||||
|
|
||||||
assert filt(PlaceName('something', inp, ''))
|
assert filt(inp)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
|
@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
|
||||||
def test_create_kind_filter_custom_regex_positive(kind):
|
def test_create_kind_filter_custom_regex_positive(kind):
|
||||||
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
|
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
|
||||||
|
|
||||||
assert filt(PlaceName('something', kind, ''))
|
assert filt(kind)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
|
@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
|
||||||
def test_create_kind_filter_custom_regex_negative(kind):
|
def test_create_kind_filter_custom_regex_negative(kind):
|
||||||
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
|
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
|
||||||
|
|
||||||
assert not filt(PlaceName('something', kind, ''))
|
assert not filt(kind)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
|
@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
|
||||||
def test_create_kind_filter_many_positive(kind):
|
def test_create_kind_filter_many_positive(kind):
|
||||||
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
|
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
|
||||||
|
|
||||||
assert filt(PlaceName('something', kind, ''))
|
assert filt(kind)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
|
@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
|
||||||
def test_create_kind_filter_many_negative(kind):
|
def test_create_kind_filter_many_negative(kind):
|
||||||
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
|
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
|
||||||
|
|
||||||
assert not filt(PlaceName('something', kind, ''))
|
assert not filt(kind)
|
||||||
|
Loading…
Reference in New Issue
Block a user