mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-12-27 06:51:42 +03:00
Merge pull request #2602 from lonvia/filter-bad-housenumbers
Handle mistagged housenumbers like names
This commit is contained in:
commit
02894ca4a4
@ -10,6 +10,7 @@ ignored-modules=icu,datrie
|
||||
# closing added here because it sometimes triggers a false positive with
|
||||
# 'with' statements.
|
||||
ignored-classes=NominatimArgs,closing
|
||||
disable=too-few-public-methods,duplicate-code
|
||||
# 'too-many-ancestors' is triggered already by deriving from UserDict
|
||||
disable=too-few-public-methods,duplicate-code,too-many-ancestors
|
||||
|
||||
good-names=i,x,y,fd,db
|
||||
|
@ -45,7 +45,7 @@ class ICURuleLoader:
|
||||
rules = config.load_sub_configuration('icu_tokenizer.yaml',
|
||||
config='TOKENIZER_CONFIG')
|
||||
|
||||
# Make sure country information is available to analyzers and sanatizers.
|
||||
# Make sure country information is available to analyzers and sanitizers.
|
||||
nominatim.tools.country_info.setup_country_config(config)
|
||||
|
||||
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
|
||||
|
@ -11,6 +11,7 @@ is handed to the token analysis.
|
||||
import importlib
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||
|
||||
class PlaceName:
|
||||
""" A searchable name for a place together with properties.
|
||||
@ -117,7 +118,7 @@ class PlaceSanitizer:
|
||||
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
|
||||
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
|
||||
handler_module = importlib.import_module(module_name)
|
||||
self.handlers.append(handler_module.create(func))
|
||||
self.handlers.append(handler_module.create(SanitizerConfig(func)))
|
||||
|
||||
|
||||
def process_names(self, place):
|
||||
|
@ -19,15 +19,22 @@ Arguments:
|
||||
where each string is a regular expression. An address item
|
||||
is considered a house number if the 'kind' fully matches any
|
||||
of the given regular expressions. (default: 'housenumber')
|
||||
|
||||
convert-to-name: Define house numbers that should be treated as a name
|
||||
instead of a house number. Either takes a single string
|
||||
or a list of strings, where each string is a regular
|
||||
expression that must match the full house number value.
|
||||
"""
|
||||
from nominatim.tokenizer.sanitizers.helpers import create_split_regex, create_kind_filter
|
||||
import re
|
||||
|
||||
class _HousenumberSanitizer:
|
||||
|
||||
def __init__(self, config):
|
||||
self.filter_kind = create_kind_filter(config, 'housenumber')
|
||||
self.split_regexp = create_split_regex(config)
|
||||
self.filter_kind = config.get_filter_kind('housenumber')
|
||||
self.split_regexp = config.get_delimiter()
|
||||
|
||||
nameregexps = config.get_string_list('convert-to-name', [])
|
||||
self.is_name_regexp = [re.compile(r) for r in nameregexps]
|
||||
|
||||
|
||||
|
||||
def __call__(self, obj):
|
||||
@ -37,8 +44,11 @@ class _HousenumberSanitizer:
|
||||
new_address = []
|
||||
for item in obj.address:
|
||||
if self.filter_kind(item):
|
||||
new_address.extend(item.clone(kind='housenumber', name=n)
|
||||
for n in self.sanitize(item.name))
|
||||
if self._treat_as_name(item.name):
|
||||
obj.names.append(item.clone(kind='housenumber'))
|
||||
else:
|
||||
new_address.extend(item.clone(kind='housenumber', name=n)
|
||||
for n in self.sanitize(item.name))
|
||||
else:
|
||||
# Don't touch other address items.
|
||||
new_address.append(item)
|
||||
@ -62,6 +72,10 @@ class _HousenumberSanitizer:
|
||||
yield hnr
|
||||
|
||||
|
||||
def _treat_as_name(self, housenumber):
|
||||
return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
|
||||
|
||||
|
||||
def create(config):
|
||||
""" Create a housenumber processing function.
|
||||
"""
|
||||
|
82
nominatim/tokenizer/sanitizers/config.py
Normal file
82
nominatim/tokenizer/sanitizers/config.py
Normal file
@ -0,0 +1,82 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2022 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Configuration for Sanitizers.
|
||||
"""
|
||||
from collections import UserDict
|
||||
import re
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
class SanitizerConfig(UserDict):
    """ Dictionary with configuration options for a sanitizer.

        In addition to the usual dictionary functions, the class provides
        accessors to standard sanitizer options that are used by many of the
        sanitizers.
    """

    def get_string_list(self, param, default=tuple()):
        """ Extract a configuration parameter as a string list.

            If the parameter value does not exist, a list copy of the given
            default is returned (or None when the default is None). If the
            parameter value is a simple string, it is returned as a one-item
            list; an empty string yields an empty list. If the parameter
            value is a list or tuple, it is checked to contain only strings
            and then returned as a new list.

            Raises a UsageError when the value is neither a string nor a
            list/tuple of strings.
        """
        values = self.data.get(param, None)

        if values is None:
            return None if default is None else list(default)

        if isinstance(values, str):
            return [values] if values else []

        if not isinstance(values, (list, tuple)):
            raise UsageError(f"Parameter '{param}' must be string or list of strings.")

        if any(not isinstance(value, str) for value in values):
            raise UsageError(f"Parameter '{param}' must be string or list of strings.")

        # Always hand out a fresh list: the function promises a list (the
        # configured value may be a tuple) and callers must not be able to
        # modify the stored configuration through the return value.
        return list(values)


    def get_delimiter(self, default=',;'):
        """ Return the 'delimiters' parameter in the configuration as a
            compiled regular expression that can be used to split the names on
            the delimiters. The regular expression makes sure that the
            resulting names are stripped and that repeated delimiters
            are ignored but it will still create empty fields on occasion. The
            code needs to filter those.

            The 'default' parameter defines the delimiter set to be used when
            not explicitly configured.

            Raises a UsageError when the set of delimiters is empty.
        """
        delimiter_set = set(self.data.get('delimiters', default))
        if not delimiter_set:
            raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

        # Use re.escape() instead of blindly prefixing a backslash: a prefixed
        # letter or digit would become an escape sequence with a different
        # meaning (e.g. 'd' -> '\d' matches digits) or an invalid pattern.
        # Sorting keeps the generated pattern deterministic.
        char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))

        return re.compile('\\s*[{}]+\\s*'.format(char_class))


    def get_filter_kind(self, *default):
        """ Return a filter function for the name kind from the 'filter-kind'
            config parameter. The filter function takes a name item and returns
            True when the item passes the filter.

            If the parameter is empty, the filter lets all items pass. If the
            parameter is a string, it is interpreted as a single regular
            expression that must match the full kind string. If the parameter
            is a list then any of the regular expressions in the list must
            match to pass.

            The positional arguments define the filter expressions to be used
            when 'filter-kind' is not configured.
        """
        filters = self.get_string_list('filter-kind', default)

        if not filters:
            return lambda _: True

        regexes = [re.compile(regex) for regex in filters]

        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
|
@ -1,52 +0,0 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2022 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Helper functions for sanitizers.
|
||||
"""
|
||||
import re
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
def create_split_regex(config, default=',;'):
|
||||
""" Converts the 'delimiter' parameter in the configuration into a
|
||||
compiled regular expression that can be used to split the names on the
|
||||
delimiters. The regular expression makes sure that the resulting names
|
||||
are stripped and that repeated delimiters
|
||||
are ignored but it will still create empty fields on occasion. The
|
||||
code needs to filter those.
|
||||
|
||||
The 'default' parameter defines the delimiter set to be used when
|
||||
not explicitly configured.
|
||||
"""
|
||||
delimiter_set = set(config.get('delimiters', default))
|
||||
if not delimiter_set:
|
||||
raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.")
|
||||
|
||||
return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
|
||||
|
||||
|
||||
def create_kind_filter(config, default=None):
|
||||
""" Create a filter function for the name kind from the 'filter-kind'
|
||||
config parameter. The filter functions takes a name item and returns
|
||||
True when the item passes the filter.
|
||||
|
||||
If the parameter is empty, the filter lets all items pass. If the
|
||||
paramter is a string, it is interpreted as a single regular expression
|
||||
that must match the full kind string. If the parameter is a list then
|
||||
any of the regular expressions in the list must match to pass.
|
||||
"""
|
||||
filters = config.get('filter-kind', default)
|
||||
|
||||
if not filters:
|
||||
return lambda _: True
|
||||
|
||||
if isinstance(filters, str):
|
||||
regex = re.compile(filters)
|
||||
return lambda name: regex.fullmatch(name.kind)
|
||||
|
||||
regexes = [re.compile(regex) for regex in filters]
|
||||
return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
|
@ -11,13 +11,11 @@ Arguments:
|
||||
delimiters: Define the set of characters to be used for
|
||||
splitting the list. (default: ',;')
|
||||
"""
|
||||
from nominatim.tokenizer.sanitizers.helpers import create_split_regex
|
||||
|
||||
def create(func):
|
||||
def create(config):
|
||||
""" Create a name processing function that splits name values with
|
||||
multiple values into their components.
|
||||
"""
|
||||
regexp = create_split_regex(func)
|
||||
regexp = config.get_delimiter()
|
||||
|
||||
def _process(obj):
|
||||
if not obj.names:
|
||||
|
@ -31,21 +31,20 @@ Arguments:
|
||||
|
||||
"""
|
||||
from nominatim.tools import country_info
|
||||
from nominatim.tokenizer.sanitizers.helpers import create_kind_filter
|
||||
|
||||
class _AnalyzerByLanguage:
|
||||
""" Processor for tagging the language of names in a place.
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
self.filter_kind = create_kind_filter(config)
|
||||
self.filter_kind = config.get_filter_kind()
|
||||
self.replace = config.get('mode', 'replace') != 'append'
|
||||
self.whitelist = config.get('whitelist')
|
||||
|
||||
self.__compute_default_languages(config.get('use-defaults', 'no'))
|
||||
self._compute_default_languages(config.get('use-defaults', 'no'))
|
||||
|
||||
|
||||
def __compute_default_languages(self, use_defaults):
|
||||
def _compute_default_languages(self, use_defaults):
|
||||
self.deflangs = {}
|
||||
|
||||
if use_defaults in ('mono', 'all'):
|
||||
|
@ -25,13 +25,15 @@ transliteration:
|
||||
- "[^a-z0-9[:Space:]] >"
|
||||
- ":: NFC ()"
|
||||
sanitizers:
|
||||
- step: split-name-list
|
||||
- step: strip-brace-terms
|
||||
- step: clean-housenumbers
|
||||
filter-kind:
|
||||
- housenumber
|
||||
- conscriptionnumber
|
||||
- streetnumber
|
||||
convert-to-name:
|
||||
- (\A|.*,)[^\d,]{3,}(,.*|\Z)
|
||||
- step: split-name-list
|
||||
- step: strip-brace-terms
|
||||
- step: tag-analyzer-by-language
|
||||
filter-kind: [".*name.*"]
|
||||
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
|
||||
|
@ -53,3 +53,17 @@ Feature: Searching of house numbers
|
||||
| 2;4;12 |
|
||||
| 2,4,12 |
|
||||
| 2, 4, 12 |
|
||||
|
||||
|
||||
Scenario: A name mapped as a housenumber is found
|
||||
Given the places
|
||||
| osm | class | type | housenr | geometry |
|
||||
| N1 | building | yes | Warring | 9 |
|
||||
And the places
|
||||
| osm | class | type | name | geometry |
|
||||
| W10 | highway | path | Chester St | 1,2,3 |
|
||||
When importing
|
||||
When sending search query "Chester St Warring"
|
||||
Then results contain
|
||||
| osm |
|
||||
| N1 |
|
||||
|
@ -42,3 +42,27 @@ def test_housenumber_lists(sanitize, number):
|
||||
def test_filter_kind(sanitize):
|
||||
assert sanitize(housenumber='34', number='4', badnumber='65') == \
|
||||
[('badnumber', '65'), ('housenumber', '34'), ('housenumber', '4')]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('number', ('6523', 'n/a', '4'))
|
||||
def test_convert_to_name_converted(number):
|
||||
sanitizer_args = {'step': 'clean-housenumbers',
|
||||
'convert-to-name': (r'\d+', 'n/a')}
|
||||
|
||||
place = PlaceInfo({'address': {'housenumber': number}})
|
||||
names, address = PlaceSanitizer([sanitizer_args]).process_names(place)
|
||||
|
||||
assert ('housenumber', number) in set((p.kind, p.name) for p in names)
|
||||
assert 'housenumber' not in set(p.kind for p in address)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('number', ('a54', 'n.a', 'bow'))
|
||||
def test_convert_to_name_unconverted(number):
|
||||
sanitizer_args = {'step': 'clean-housenumbers',
|
||||
'convert-to-name': (r'\d+', 'n/a')}
|
||||
|
||||
place = PlaceInfo({'address': {'housenumber': number}})
|
||||
names, address = PlaceSanitizer([sanitizer_args]).process_names(place)
|
||||
|
||||
assert 'housenumber' not in set(p.kind for p in names)
|
||||
assert ('housenumber', number) in set((p.kind, p.name) for p in address)
|
||||
|
@ -5,17 +5,51 @@
|
||||
# Copyright (C) 2022 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Tests for sanitizer helper functions.
|
||||
Tests for sanitizer configuration helper functions.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
from nominatim.tokenizer.place_sanitizer import PlaceName
|
||||
import nominatim.tokenizer.sanitizers.helpers as helpers
|
||||
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||
|
||||
def test_string_list_default_empty():
|
||||
assert SanitizerConfig().get_string_list('op') == []
|
||||
|
||||
|
||||
def test_string_list_default_none():
|
||||
assert SanitizerConfig().get_string_list('op', default=None) is None
|
||||
|
||||
|
||||
def test_string_list_default_something():
|
||||
assert SanitizerConfig().get_string_list('op', default=['a', 'b']) == ['a', 'b']
|
||||
|
||||
|
||||
def test_string_list_value_string():
|
||||
assert SanitizerConfig({'op': 't'}).get_string_list('op', default=['a', 'b']) == ['t']
|
||||
|
||||
|
||||
def test_string_list_value_list():
|
||||
assert SanitizerConfig({'op': ['1', '2']}).get_string_list('op') == ['1', '2']
|
||||
|
||||
|
||||
def test_string_list_value_empty():
|
||||
assert SanitizerConfig({'op': ''}).get_string_list('op', default=['a', 'b']) == []
|
||||
|
||||
|
||||
def test_string_list_value_dict():
|
||||
with pytest.raises(UsageError):
|
||||
SanitizerConfig({'op': {'1': 'a'}}).get_string_list('op')
|
||||
|
||||
|
||||
def test_string_list_value_int_list():
|
||||
with pytest.raises(UsageError):
|
||||
SanitizerConfig({'op': [1, 2]}).get_string_list('op')
|
||||
|
||||
|
||||
@pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78'))
|
||||
def test_create_split_regex_no_params_unsplit(inp):
|
||||
regex = helpers.create_split_regex({})
|
||||
regex = SanitizerConfig().get_delimiter()
|
||||
|
||||
assert list(regex.split(inp)) == [inp]
|
||||
|
||||
@ -26,14 +60,14 @@ def test_create_split_regex_no_params_unsplit(inp):
|
||||
('1, 3 ,5', ['1', '3', '5'])
|
||||
])
|
||||
def test_create_split_regex_no_params_split(inp, outp):
|
||||
regex = helpers.create_split_regex({})
|
||||
regex = SanitizerConfig().get_delimiter()
|
||||
|
||||
assert list(regex.split(inp)) == outp
|
||||
|
||||
|
||||
@pytest.mark.parametrize('delimiter', ['.', '\\', '[]', ' ', '/.*+'])
|
||||
def test_create_split_regex_custom(delimiter):
|
||||
regex = helpers.create_split_regex({'delimiters': delimiter})
|
||||
regex = SanitizerConfig({'delimiters': delimiter}).get_delimiter()
|
||||
|
||||
assert list(regex.split(f'out{delimiter}house')) == ['out', 'house']
|
||||
assert list(regex.split('out,house')) == ['out,house']
|
||||
@ -41,39 +75,39 @@ def test_create_split_regex_custom(delimiter):
|
||||
|
||||
def test_create_split_regex_empty_delimiter():
|
||||
with pytest.raises(UsageError):
|
||||
regex = helpers.create_split_regex({'delimiters': ''})
|
||||
regex = SanitizerConfig({'delimiters': ''}).get_delimiter()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
|
||||
def test_create_kind_filter_no_params(inp):
|
||||
filt = helpers.create_kind_filter({})
|
||||
filt = SanitizerConfig().get_filter_kind()
|
||||
|
||||
assert filt(PlaceName('something', inp, ''))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
|
||||
def test_create_kind_filter_custom_regex_positive(kind):
|
||||
filt = helpers.create_kind_filter({'filter-kind': '.*de'})
|
||||
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
|
||||
|
||||
assert filt(PlaceName('something', kind, ''))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
|
||||
def test_create_kind_filter_custom_regex_negative(kind):
|
||||
filt = helpers.create_kind_filter({'filter-kind': '.*de'})
|
||||
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
|
||||
|
||||
assert not filt(PlaceName('something', kind, ''))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
|
||||
def test_create_kind_filter_many_positive(kind):
|
||||
filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
|
||||
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
|
||||
|
||||
assert filt(PlaceName('something', kind, ''))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
|
||||
def test_create_kind_filter_many_negative(kind):
|
||||
filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
|
||||
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
|
||||
|
||||
assert not filt(PlaceName('something', kind, ''))
|
Loading…
Reference in New Issue
Block a user