Merge pull request #2602 from lonvia/filter-bad-housenumbers

Handle mistagged housenumbers like names
This commit is contained in:
Sarah Hoffmann 2022-02-07 16:27:04 +01:00 committed by GitHub
commit 02894ca4a4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 199 additions and 82 deletions

View File

@ -10,6 +10,7 @@ ignored-modules=icu,datrie
# closing added here because it sometimes triggers a false positive with
# 'with' statements.
ignored-classes=NominatimArgs,closing
disable=too-few-public-methods,duplicate-code
# 'too-many-ancestors' is triggered already by deriving from UserDict
disable=too-few-public-methods,duplicate-code,too-many-ancestors
good-names=i,x,y,fd,db

View File

@ -45,7 +45,7 @@ class ICURuleLoader:
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
# Make sure country information is available to analyzers and sanatizers.
# Make sure country information is available to analyzers and sanitizers.
nominatim.tools.country_info.setup_country_config(config)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')

View File

@ -11,6 +11,7 @@ is handed to the token analysis.
import importlib
from nominatim.errors import UsageError
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
class PlaceName:
""" A searchable name for a place together with properties.
@ -117,7 +118,7 @@ class PlaceSanitizer:
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
handler_module = importlib.import_module(module_name)
self.handlers.append(handler_module.create(func))
self.handlers.append(handler_module.create(SanitizerConfig(func)))
def process_names(self, place):

View File

@ -19,15 +19,22 @@ Arguments:
where each string is a regular expression. An address item
is considered a house number if the 'kind' fully matches any
of the given regular expressions. (default: 'housenumber')
convert-to-name: Define house numbers that should be treated as a name
instead of a house number. Either takes a single string
or a list of strings, where each string is a regular
expression that must match the full house number value.
"""
from nominatim.tokenizer.sanitizers.helpers import create_split_regex, create_kind_filter
import re
class _HousenumberSanitizer:
def __init__(self, config):
self.filter_kind = create_kind_filter(config, 'housenumber')
self.split_regexp = create_split_regex(config)
self.filter_kind = config.get_filter_kind('housenumber')
self.split_regexp = config.get_delimiter()
nameregexps = config.get_string_list('convert-to-name', [])
self.is_name_regexp = [re.compile(r) for r in nameregexps]
def __call__(self, obj):
@ -37,8 +44,11 @@ class _HousenumberSanitizer:
new_address = []
for item in obj.address:
if self.filter_kind(item):
new_address.extend(item.clone(kind='housenumber', name=n)
for n in self.sanitize(item.name))
if self._treat_as_name(item.name):
obj.names.append(item.clone(kind='housenumber'))
else:
new_address.extend(item.clone(kind='housenumber', name=n)
for n in self.sanitize(item.name))
else:
# Don't touch other address items.
new_address.append(item)
@ -62,6 +72,10 @@ class _HousenumberSanitizer:
yield hnr
def _treat_as_name(self, housenumber):
return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
def create(config):
""" Create a housenumber processing function.
"""

View File

@ -0,0 +1,82 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Configuration for Sanitizers.
"""
from collections import UserDict
import re
from nominatim.errors import UsageError
class SanitizerConfig(UserDict):
    """ Dictionary with configuration options for a sanitizer.

        In addition to the usual dictionary functions, the class provides
        accessors to standard sanitizer options that are used by many of the
        sanitizers.
    """

    def get_string_list(self, param, default=tuple()):
        """ Extract a configuration parameter as a string list.

            If the parameter value is a simple string, it is returned as a
            one-item list (an empty string results in an empty list). If the
            parameter value does not exist, a copy of the given default is
            returned as a list, or None when the default is None. If the
            parameter value is a list or tuple, it is checked to contain
            only strings and its items are returned in a new list.

            Raises a UsageError when the value has any other type or
            contains items that are not strings.
        """
        values = self.data.get(param, None)

        if values is None:
            return None if default is None else list(default)

        if isinstance(values, str):
            return [values] if values else []

        if not isinstance(values, (list, tuple)) \
           or any(not isinstance(value, str) for value in values):
            raise UsageError(f"Parameter '{param}' must be string or list of strings.")

        # Hand out a fresh list: the caller always gets a real list (also
        # for tuples) and cannot change the configuration through it.
        return list(values)


    def get_delimiter(self, default=',;'):
        """ Return the 'delimiters' parameter in the configuration as a
            compiled regular expression that can be used to split the names
            on the delimiters. The regular expression makes sure that the
            resulting names are stripped and that repeated delimiters
            are ignored but it will still create empty fields on occasion.
            The code needs to filter those.

            The 'default' parameter defines the delimiter set to be used when
            not explicitly configured.

            Raises a UsageError when the set of delimiters is empty.
        """
        delimiter_set = set(self.data.get('delimiters', default))
        if not delimiter_set:
            raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

        # re.escape() only quotes characters that are special in a regular
        # expression. Blindly prefixing a backslash would change the meaning
        # of letters and digits (a delimiter 'd' would become '\d').
        char_class = ''.join(re.escape(d) for d in delimiter_set)

        return re.compile('\\s*[{}]+\\s*'.format(char_class))


    def get_filter_kind(self, *default):
        """ Return a filter function for the name kind from the 'filter-kind'
            config parameter. The filter function takes a name item and
            returns True when the item passes the filter.

            If the parameter is empty, the filter lets all items pass. If the
            parameter is a string, it is interpreted as a single regular
            expression that must match the full kind string. If the parameter
            is a list then any of the regular expressions in the list must
            match to pass.

            The positional arguments are the expressions to use when
            'filter-kind' is not configured.
        """
        filters = self.get_string_list('filter-kind', default)

        if not filters:
            return lambda _: True

        regexes = [re.compile(regex) for regex in filters]

        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)

View File

@ -1,52 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for sanitizers.
"""
import re
from nominatim.errors import UsageError
def create_split_regex(config, default=',;'):
    """ Converts the 'delimiters' parameter in the configuration into a
        compiled regular expression that can be used to split the names on
        the delimiters. The regular expression makes sure that the resulting
        names are stripped and that repeated delimiters
        are ignored but it will still create empty fields on occasion. The
        code needs to filter those.

        The 'default' parameter defines the delimiter set to be used when
        not explicitly configured.

        Raises a UsageError when the set of delimiters is empty.
    """
    delimiter_set = set(config.get('delimiters', default))
    if not delimiter_set:
        raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

    # re.escape() only quotes characters that are special in a regular
    # expression. Blindly prefixing a backslash would change the meaning
    # of letters and digits (a delimiter 'd' would become '\d').
    char_class = ''.join(re.escape(d) for d in delimiter_set)

    return re.compile('\\s*[{}]+\\s*'.format(char_class))
def create_kind_filter(config, default=None):
    """ Create a filter function for the name kind from the 'filter-kind'
        config parameter. The filter function takes a name item and returns
        True when the item passes the filter.

        If the parameter is empty, the filter lets all items pass. If the
        parameter is a string, it is interpreted as a single regular
        expression that must match the full kind string. If the parameter is
        a list then any of the regular expressions in the list must match to
        pass.

        The 'default' parameter is used in place of 'filter-kind' when the
        latter is not configured.
    """
    filters = config.get('filter-kind', default)

    if not filters:
        return lambda _: True

    # Treat a single expression like a one-item list, so that the returned
    # filter yields a proper boolean in both cases instead of a match object.
    if isinstance(filters, str):
        filters = [filters]

    regexes = [re.compile(regex) for regex in filters]

    return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)

View File

@ -11,13 +11,11 @@ Arguments:
delimiters: Define the set of characters to be used for
splitting the list. (default: ',;')
"""
from nominatim.tokenizer.sanitizers.helpers import create_split_regex
def create(func):
def create(config):
""" Create a name processing function that splits name values with
multiple values into their components.
"""
regexp = create_split_regex(func)
regexp = config.get_delimiter()
def _process(obj):
if not obj.names:

View File

@ -31,21 +31,20 @@ Arguments:
"""
from nominatim.tools import country_info
from nominatim.tokenizer.sanitizers.helpers import create_kind_filter
class _AnalyzerByLanguage:
""" Processor for tagging the language of names in a place.
"""
def __init__(self, config):
self.filter_kind = create_kind_filter(config)
self.filter_kind = config.get_filter_kind()
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
self.__compute_default_languages(config.get('use-defaults', 'no'))
self._compute_default_languages(config.get('use-defaults', 'no'))
def __compute_default_languages(self, use_defaults):
def _compute_default_languages(self, use_defaults):
self.deflangs = {}
if use_defaults in ('mono', 'all'):

View File

@ -25,13 +25,15 @@ transliteration:
- "[^a-z0-9[:Space:]] >"
- ":: NFC ()"
sanitizers:
- step: split-name-list
- step: strip-brace-terms
- step: clean-housenumbers
filter-kind:
- housenumber
- conscriptionnumber
- streetnumber
convert-to-name:
- (\A|.*,)[^\d,]{3,}(,.*|\Z)
- step: split-name-list
- step: strip-brace-terms
- step: tag-analyzer-by-language
filter-kind: [".*name.*"]
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]

View File

@ -53,3 +53,17 @@ Feature: Searching of house numbers
| 2;4;12 |
| 2,4,12 |
| 2, 4, 12 |
Scenario: A name mapped as a housenumber is found
Given the places
| osm | class | type | housenr | geometry |
| N1 | building | yes | Warring | 9 |
And the places
| osm | class | type | name | geometry |
| W10 | highway | path | Chester St | 1,2,3 |
When importing
When sending search query "Chester St Warring"
Then results contain
| osm |
| N1 |

View File

@ -42,3 +42,27 @@ def test_housenumber_lists(sanitize, number):
def test_filter_kind(sanitize):
assert sanitize(housenumber='34', number='4', badnumber='65') == \
[('badnumber', '65'), ('housenumber', '34'), ('housenumber', '4')]
@pytest.mark.parametrize('number', ('6523', 'n/a', '4'))
def test_convert_to_name_converted(number):
    """ A house number that fully matches one of the 'convert-to-name'
        expressions shows up in the names and not in the address.
    """
    rules = [{'step': 'clean-housenumbers',
              'convert-to-name': (r'\d+', 'n/a')}]
    info = PlaceInfo({'address': {'housenumber': number}})

    names, address = PlaceSanitizer(rules).process_names(info)

    name_items = {(item.kind, item.name) for item in names}
    assert ('housenumber', number) in name_items

    address_kinds = {item.kind for item in address}
    assert 'housenumber' not in address_kinds
@pytest.mark.parametrize('number', ('a54', 'n.a', 'bow'))
def test_convert_to_name_unconverted(number):
    """ A house number that matches none of the 'convert-to-name'
        expressions stays in the address and is not added to the names.
    """
    rules = [{'step': 'clean-housenumbers',
              'convert-to-name': (r'\d+', 'n/a')}]
    info = PlaceInfo({'address': {'housenumber': number}})

    names, address = PlaceSanitizer(rules).process_names(info)

    name_kinds = {item.kind for item in names}
    assert 'housenumber' not in name_kinds

    address_items = {(item.kind, item.name) for item in address}
    assert ('housenumber', number) in address_items

View File

@ -5,17 +5,51 @@
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for sanitizer helper functions.
Tests for sanitizer configuration helper functions.
"""
import pytest
from nominatim.errors import UsageError
from nominatim.tokenizer.place_sanitizer import PlaceName
import nominatim.tokenizer.sanitizers.helpers as helpers
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
def test_string_list_default_empty():
    """ A missing parameter without explicit default yields an empty list.
    """
    config = SanitizerConfig()

    assert config.get_string_list('op') == []
def test_string_list_default_none():
    """ A missing parameter with a default of None yields None.
    """
    config = SanitizerConfig()

    assert config.get_string_list('op', default=None) is None
def test_string_list_default_something():
    """ A missing parameter falls back to the items of the given default.
    """
    config = SanitizerConfig()

    assert config.get_string_list('op', default=['a', 'b']) == ['a', 'b']
def test_string_list_value_string():
    """ A plain string value is wrapped into a one-item list and takes
        precedence over the default.
    """
    config = SanitizerConfig({'op': 't'})

    assert config.get_string_list('op', default=['a', 'b']) == ['t']
def test_string_list_value_list():
    """ A list of strings is handed back with the same items.
    """
    config = SanitizerConfig({'op': ['1', '2']})

    assert config.get_string_list('op') == ['1', '2']
def test_string_list_value_empty():
    """ An empty string value yields an empty list, not the default.
    """
    config = SanitizerConfig({'op': ''})

    assert config.get_string_list('op', default=['a', 'b']) == []
def test_string_list_value_dict():
    """ A dictionary value is rejected with a UsageError.
    """
    config = SanitizerConfig({'op': {'1': 'a'}})

    with pytest.raises(UsageError):
        config.get_string_list('op')
def test_string_list_value_int_list():
    """ A list with non-string items is rejected with a UsageError.
    """
    config = SanitizerConfig({'op': [1, 2]})

    with pytest.raises(UsageError):
        config.get_string_list('op')
@pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78'))
def test_create_split_regex_no_params_unsplit(inp):
regex = helpers.create_split_regex({})
regex = SanitizerConfig().get_delimiter()
assert list(regex.split(inp)) == [inp]
@ -26,14 +60,14 @@ def test_create_split_regex_no_params_unsplit(inp):
('1, 3 ,5', ['1', '3', '5'])
])
def test_create_split_regex_no_params_split(inp, outp):
regex = helpers.create_split_regex({})
regex = SanitizerConfig().get_delimiter()
assert list(regex.split(inp)) == outp
@pytest.mark.parametrize('delimiter', ['.', '\\', '[]', ' ', '/.*+'])
def test_create_split_regex_custom(delimiter):
regex = helpers.create_split_regex({'delimiters': delimiter})
regex = SanitizerConfig({'delimiters': delimiter}).get_delimiter()
assert list(regex.split(f'out{delimiter}house')) == ['out', 'house']
assert list(regex.split('out,house')) == ['out,house']
@ -41,39 +75,39 @@ def test_create_split_regex_custom(delimiter):
def test_create_split_regex_empty_delimiter():
with pytest.raises(UsageError):
regex = helpers.create_split_regex({'delimiters': ''})
regex = SanitizerConfig({'delimiters': ''}).get_delimiter()
@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
def test_create_kind_filter_no_params(inp):
filt = helpers.create_kind_filter({})
filt = SanitizerConfig().get_filter_kind()
assert filt(PlaceName('something', inp, ''))
@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
def test_create_kind_filter_custom_regex_positive(kind):
filt = helpers.create_kind_filter({'filter-kind': '.*de'})
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
assert filt(PlaceName('something', kind, ''))
@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
def test_create_kind_filter_custom_regex_negative(kind):
filt = helpers.create_kind_filter({'filter-kind': '.*de'})
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
assert not filt(PlaceName('something', kind, ''))
@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
def test_create_kind_filter_many_positive(kind):
filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
assert filt(PlaceName('something', kind, ''))
@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
def test_create_kind_filter_many_negative(kind):
filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']})
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
assert not filt(PlaceName('something', kind, ''))