add unit tests for new sanitizer functions

This commit is contained in:
Sarah Hoffmann 2021-10-01 09:50:17 +02:00
parent 8171fe4571
commit 732cd27d2e
5 changed files with 191 additions and 4 deletions

View File

@ -3,13 +3,19 @@ Name processor that splits name values with multiple values into their component
"""
import re
from nominatim.errors import UsageError
def create(func):
""" Create a name processing function that splits name values with
multiple values into their components. The optional parameter
'delimiters' can be used to define the characters that should be used
for splitting. The default is ',;'.
"""
regexp = re.compile('[{}]'.format(func.get('delimiters', ',;')))
delimiter_set = set(func.get('delimiters', ',;'))
if not delimiter_set:
raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
def _process(obj):
if not obj.names:
@ -18,10 +24,11 @@ def create(func):
new_names = []
for name in obj.names:
split_names = regexp.split(name.name)
print(split_names)
if len(split_names) == 1:
new_names.append(name)
else:
new_names.extend(name.clone(name=n) for n in split_names)
new_names.extend(name.clone(name=n) for n in split_names if n)
obj.names = new_names

View File

@ -10,8 +10,8 @@ def create(_):
def _process(obj):
""" Add variants for names that have a bracket extension.
"""
new_names = []
if obj.names:
new_names = []
for name in (n for n in obj.names if '(' in n.name):
new_name = name.name.split('(')[0].strip()
if new_name:

View File

@ -0,0 +1,65 @@
"""
Tests for the sanitizer that splits multi-value lists.
"""
import pytest
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.indexer.place_info import PlaceInfo
from nominatim.errors import UsageError
def run_sanitizer_on(**kwargs):
    """ Run the 'split-name-list' sanitizer over a place whose names are
        given as keyword arguments. Returns the resulting names as a sorted
        list of (name, kind, suffix) tuples.
    """
    place = PlaceInfo({'name': kwargs})
    sanitizer = PlaceSanitizer([{'step': 'split-name-list'}])
    names, _ = sanitizer.process_names(place)

    return sorted((item.name, item.kind, item.suffix) for item in names)
def sanitize_with_delimiter(delimiter, name):
    """ Run the 'split-name-list' sanitizer with the given 'delimiters'
        setting on a place that has the single name `name`. Returns the
        sorted list of the resulting name strings.
    """
    place = PlaceInfo({'name': {'name': name}})
    step = {'step': 'split-name-list', 'delimiters': delimiter}
    names, _ = PlaceSanitizer([step]).process_names(place)

    return sorted(item.name for item in names)
def test_simple():
    """ Names without any delimiter come back unchanged, including the
        empty name.
    """
    for value in ('ABC', ''):
        assert run_sanitizer_on(name=value) == [(value, 'name', None)]
def test_splits():
    """ Multi-value names are split at the default delimiters and the
        parts are freed from surrounding whitespace.
    """
    by_semicolon = run_sanitizer_on(name='A;B;C')
    assert by_semicolon == [(part, 'name', None) for part in ('A', 'B', 'C')]

    by_comma = run_sanitizer_on(short_name=' House, boat ')
    assert by_comma == [('House', 'short_name', None),
                        ('boat', 'short_name', None)]
def test_empty_fields():
    """ Parts that are empty or consist of whitespace only are dropped
        from the result.
    """
    only_b = [('B', 'name', None)]
    a_and_b = [('A', 'name', None)] + only_b

    assert run_sanitizer_on(name='A;;B') == a_and_b
    assert run_sanitizer_on(name='A; ,B') == a_and_b
    assert run_sanitizer_on(name=' ;B') == only_b
    assert run_sanitizer_on(name='B,') == only_b
def test_custom_delimiters():
    """ Only the characters from the 'delimiters' setting split a name,
        also when they have a special meaning in regular expressions.
    """
    cases = (
        (':', '12:45,3', ['12', '45,3']),
        ('\\', 'a;\\b!#@ \\', ['a;', 'b!#@']),
        ('[]', 'foo[to]be', ['be', 'foo', 'to']),
        (' ', 'morning sun', ['morning', 'sun']),
    )

    for delimiters, value, expected in cases:
        assert sanitize_with_delimiter(delimiters, value) == expected
def test_empty_delimiter_set():
    """ An empty 'delimiters' setting must be refused with a UsageError.
    """
    with pytest.raises(UsageError):
        sanitize_with_delimiter(delimiter='', name='abc')
def test_no_name_list():
    """ A place without names produces no names while its address part
        is still returned.
    """
    place = PlaceInfo({'address': {'housenumber': '3'}})
    sanitizer = PlaceSanitizer([{'step': 'split-name-list'}])
    names, address = sanitizer.process_names(place)

    assert not names
    assert len(address) == 1

View File

@ -0,0 +1,44 @@
"""
Tests for the sanitizer that handles braced suffixes.
"""
import pytest
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.indexer.place_info import PlaceInfo
def run_sanitizer_on(**kwargs):
    """ Run the 'strip-brace-terms' sanitizer over a place whose names are
        given as keyword arguments. Returns the resulting names as a sorted
        list of (name, kind, suffix) tuples.
    """
    place = PlaceInfo({'name': kwargs})
    sanitizer = PlaceSanitizer([{'step': 'strip-brace-terms'}])
    names, _ = sanitizer.process_names(place)

    return sorted((item.name, item.kind, item.suffix) for item in names)
def test_no_braces():
    """ Names without a bracket are returned as they are.
    """
    expected = [('23', 'ref', None), ('foo', 'name', None)]

    assert run_sanitizer_on(name='foo', ref='23') == expected
def test_simple_braces():
    """ A name with a bracket term gets an additional variant without the
        bracketed part, even when the bracket is never closed.
    """
    closed = run_sanitizer_on(name='Halle (Saale)', ref='3')
    assert closed == [('3', 'ref', None),
                      ('Halle', 'name', None),
                      ('Halle (Saale)', 'name', None)]

    unclosed = run_sanitizer_on(name='ack ( bar')
    assert unclosed == [('ack', 'name', None), ('ack ( bar', 'name', None)]
def test_only_braces():
    """ A name that consists of a bracket term only gets no extra variant.
    """
    names = run_sanitizer_on(name='(maybe)')

    assert names == [('(maybe)', 'name', None)]
def test_double_braces():
    """ With nested or repeated bracket terms the variant is cut at the
        first opening bracket.
    """
    for value in ('a((b))', 'a (b) (c)'):
        assert run_sanitizer_on(name=value) == [('a', 'name', None),
                                                (value, 'name', None)]
def test_no_names():
    """ A place without names produces no names while its address part
        is still returned.
    """
    place = PlaceInfo({'address': {'housenumber': '3'}})
    sanitizer = PlaceSanitizer([{'step': 'strip-brace-terms'}])
    names, address = sanitizer.process_names(place)

    assert not names
    assert len(address) == 1

View File

@ -0,0 +1,71 @@
"""
Tests for execution of the sanitization step.
"""
import pytest
from nominatim.errors import UsageError
import nominatim.tokenizer.place_sanitizer as sanitizer
from nominatim.indexer.place_info import PlaceInfo
def test_placeinfo_clone_new_name():
    """ Cloning with a new name keeps kind and suffix and leaves the
        source object untouched.
    """
    original = sanitizer.PlaceName('foo', 'ki', 'su')
    cloned = original.clone(name='bar')

    assert original.name == 'foo'
    assert (cloned.name, cloned.kind, cloned.suffix) == ('bar', 'ki', 'su')
def test_placeinfo_clone_merge_attr():
    """ Attributes handed to clone() are merged over the existing ones
        in the copy only.
    """
    original = sanitizer.PlaceName('foo', 'ki', 'su')
    for key, value in (('a1', 'v1'), ('a2', 'v2')):
        original.set_attr(key, value)

    cloned = original.clone(attr={'a2': 'new', 'b2': 'foo'})

    # The source object must not see the changes made for the clone.
    assert original.get_attr('a2') == 'v2'
    assert original.get_attr('b2') is None

    # The clone has inherited, overwritten and newly added attributes.
    expected = {'a1': 'v1', 'a2': 'new', 'b2': 'foo'}
    for key, value in expected.items():
        assert cloned.get_attr(key) == value
def test_placeinfo_has_attr():
    """ has_attr() reports attributes that were set and no others.
    """
    name = sanitizer.PlaceName('foo', 'ki', 'su')
    name.set_attr('a1', 'v1')

    assert name.has_attr('a1')
    assert not name.has_attr('whatever')
def test_sanitizer_default():
    """ Names and addresses come back as PlaceName objects where the key
        is split into kind and suffix.
    """
    san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}])
    place = PlaceInfo({'name': {'name:de:de': '1;2;3'},
                       'address': {'street': 'Bald'}})
    names, address = san.process_names(place)

    assert len(names) == 3
    assert all(isinstance(item, sanitizer.PlaceName) for item in names)
    assert all(item.kind == 'name' for item in names)
    assert all(item.suffix == 'de:de' for item in names)

    assert len(address) == 1
    assert all(isinstance(item, sanitizer.PlaceName) for item in address)
@pytest.mark.parametrize('rules', [None, []])
def test_sanitizer_empty_list(rules):
    """ Without any sanitizer rules the name is handed through unsplit.
    """
    san = sanitizer.PlaceSanitizer(rules)
    place = PlaceInfo({'name': {'name:de:de': '1;2;3'}})
    names, _ = san.process_names(place)

    assert len(names) == 1
    assert all(isinstance(item, sanitizer.PlaceName) for item in names)
def test_sanitizer_missing_step_definition():
    """ A rule without a 'step' key must be refused with a UsageError.
    """
    with pytest.raises(UsageError):
        # The constructor is expected to raise, so its result is not bound.
        sanitizer.PlaceSanitizer([{'id': 'split-name-list'}])