clean_housenumbers: make kinds and delimiters configurable

Also adds unit tests for various options.
This commit is contained in:
Sarah Hoffmann 2022-01-20 12:07:12 +01:00
parent 206ee87188
commit 4774e45218
7 changed files with 133 additions and 20 deletions

View File

@ -6,13 +6,19 @@
# For a full list of authors see the git log.
"""
Sanitizer that cleans and normalizes housenumbers.
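Arguments:
delimiters: Define the set of characters to be used for
splitting a list of housenumbers into parts. (default: ',;')
filter-kind: Define the kinds of address items that are treated as
housenumbers. Expects a list of kind names; an item is processed
when its kind is contained in the list. (default: 'housenumber')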
"""
import re
from nominatim.tokenizer.sanitizers.helpers import create_split_regex
class _HousenumberSanitizer:
def __init__(self, config):
pass
self.kinds = config.get('filter-kind', ('housenumber', ))
self.split_regexp = create_split_regex(config)
def __call__(self, obj):
@ -21,7 +27,7 @@ class _HousenumberSanitizer:
new_address = []
for item in obj.address:
if item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
if item.kind in self.kinds:
new_address.extend(item.clone(kind='housenumber', name=n) for n in self.sanitize(item.name))
else:
# Don't touch other address items.
@ -36,13 +42,9 @@ class _HousenumberSanitizer:
The function works as a generator that yields all valid housenumbers
that can be created from the value.
"""
for hnr in self._split_number(value):
yield from self._regularize(hnr)
def _split_number(self, hnr):
for part in re.split(r'[;,]', hnr):
yield part.strip()
for hnr in self.split_regexp.split(value):
if hnr:
yield from self._regularize(hnr)
def _regularize(self, hnr):

View File

@ -0,0 +1,29 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for sanitizers.
"""
import re
from nominatim.errors import UsageError
def create_split_regex(config, default=',;'):
    """ Convert the 'delimiters' parameter of the configuration into a
        compiled regular expression that can be used to split names on
        the delimiters.

        The expression swallows whitespace around a delimiter and treats
        a run of consecutive delimiters as a single one, so the resulting
        parts are stripped. Splitting may still produce empty fields (for
        example for leading or trailing delimiters); the caller needs to
        filter those.

        Parameters:
            config: Mapping with the sanitizer configuration. Only the
                    key 'delimiters' is read.
            default: Delimiter set to be used when 'delimiters' is not
                     explicitly configured.

        Returns the compiled regular expression.

        Raises a UsageError when the set of delimiters is empty.
    """
    delimiter_set = set(config.get('delimiters', default))
    if not delimiter_set:
        raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

    # Use re.escape() instead of blindly prefixing a backslash: a plain
    # prefix turns letters into regex escapes ('d' would match any digit,
    # 'x' would be an invalid pattern). Sorting keeps the pattern stable.
    char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))

    return re.compile('\\s*[{}]+\\s*'.format(char_class))

View File

@ -9,21 +9,16 @@ Sanitizer that splits lists of names into their components.
Arguments:
delimiters: Define the set of characters to be used for
splitting the list. (default: `,;`)
splitting the list. (default: ',;')
"""
import re
from nominatim.errors import UsageError
from nominatim.tokenizer.sanitizers.helpers import create_split_regex
def create(func):
""" Create a name processing function that splits name values with
multiple values into their components.
"""
delimiter_set = set(func.get('delimiters', ',;'))
if not delimiter_set:
raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
regexp = create_split_regex(func)
def _process(obj):
if not obj.names:

View File

@ -13,7 +13,7 @@ Arguments:
filter-kind: Restrict the names the sanitizer should be applied to
to the given tags. The parameter expects a list of
regular expressions which are matched against `kind`.
regular expressions which are matched against 'kind'.
Note that a match against the full string is expected.
whitelist: Restrict the set of languages that should be tagged.
Expects a list of acceptable suffixes. When unset,

View File

@ -0,0 +1,44 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for the sanitizer that normalizes housenumbers.
"""
import pytest
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.indexer.place_info import PlaceInfo
@pytest.fixture
def sanitize(request):
    """ Provide a function that runs the 'clean-housenumbers' sanitizer
        step over the address tags given as keyword arguments and returns
        the sorted list of (kind, name) tuples of the resulting address.

        Extra parameters for the step are taken from 'sanitizer_params'
        markers on the test; underscores in the parameter names are
        replaced with dashes.
    """
    step_config = {'step': 'clean-housenumbers'}
    for marker in request.node.iter_markers(name="sanitizer_params"):
        for key, value in marker.kwargs.items():
            step_config[key.replace('_', '-')] = value

    def _run(**kwargs):
        info = PlaceInfo({'address': kwargs})
        _, address = PlaceSanitizer([step_config]).process_names(info)
        return sorted((item.kind, item.name) for item in address)

    return _run
def test_simple_number(sanitize):
    """ A single plain housenumber is returned as it was given. """
    result = sanitize(housenumber='34')

    assert result == [('housenumber', '34')]
@pytest.mark.parametrize('number', ['1;2;3', '1,2,3', '1; 3 ,2',
                                    '2,,3,1', '1;2;3;;', ';3;2;1'])
def test_housenumber_lists(sanitize, number):
    """ Lists of housenumbers are split into single housenumbers; empty
        parts and surrounding whitespace do not show up in the result.
    """
    expected = [('housenumber', hnr) for hnr in ('1', '2', '3')]

    assert sanitize(housenumber=number) == expected
@pytest.mark.sanitizer_params(filter_kind=('number', 'streetnumber'))
def test_filter_kind(sanitize):
    """ Items whose kind is listed in 'filter-kind' come out with kind
        'housenumber'; all other items keep the kind they came with.
    """
    result = sanitize(housenumber='34', number='4', badnumber='65')
    expected = [('badnumber', '65'), ('housenumber', '34'), ('housenumber', '4')]

    assert result == expected

View File

@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for sanitizer helper functions.
"""
import pytest
from nominatim.errors import UsageError
import nominatim.tokenizer.sanitizers.helpers as helpers
@pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78'))
def test_create_split_regex_no_params_unsplit(inp):
    """ With the default delimiters, input without ',' or ';' stays in
        one piece.
    """
    parts = helpers.create_split_regex({}).split(inp)

    assert list(parts) == [inp]
@pytest.mark.parametrize('inp,outp', [('here,there', ['here', 'there']),
                                      ('ying;;yang', ['ying', 'yang']),
                                      (';a; ;c;d,', ['', 'a', '', 'c', 'd', '']),
                                      ('1, 3 ,5', ['1', '3', '5'])
                                     ])
def test_create_split_regex_no_params_split(inp, outp):
    """ With the default delimiters, input is split on ',' and ';'. """
    parts = helpers.create_split_regex({}).split(inp)

    assert list(parts) == outp
@pytest.mark.parametrize('delimiter', ['.', '\\', '[]', ' ', '/.*+'])
def test_create_split_regex_custom(delimiter):
    """ Custom delimiters replace the default ones: the input is split
        on the configured characters and no longer on ','.
    """
    splitter = helpers.create_split_regex({'delimiters': delimiter})
    joined = f'out{delimiter}house'

    assert list(splitter.split(joined)) == ['out', 'house']
    assert list(splitter.split('out,house')) == ['out,house']
def test_create_split_regex_empty_delimiter():
    """ An empty set of delimiters is rejected with a UsageError. """
    # The call is expected to raise, so its result is never bound.
    with pytest.raises(UsageError):
        helpers.create_split_regex({'delimiters': ''})

View File

@ -5,7 +5,7 @@
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for the sanitizer that splitts multivalue lists.
Tests for the sanitizer that splits multivalue lists.
"""
import pytest