From 206ee8718864d623507a0ae69070478dec411e84 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 7 Jan 2022 22:41:09 +0100 Subject: [PATCH] factor out housenumber splitting into sanitizer --- nominatim/tokenizer/icu_tokenizer.py | 29 ++-------- .../sanitizers/clean_housenumbers.py | 56 +++++++++++++++++++ settings/icu_tokenizer.yaml | 1 + test/bdd/db/query/housenumbers.feature | 55 ++++++++++++++++++ 4 files changed, 118 insertions(+), 23 deletions(-) create mode 100644 nominatim/tokenizer/sanitizers/clean_housenumbers.py create mode 100644 test/bdd/db/query/housenumbers.feature diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 7b820c9d..cfbb44e3 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -413,14 +413,16 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): - hnrs = [] + hnrs = set() addr_terms = [] streets = [] for item in address: if item.kind == 'postcode': self._add_postcode(item.name) - elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'): - hnrs.append(item.name) + elif item.kind == 'housenumber': + norm_name = self._make_standard_hnr(item.name) + if norm_name: + hnrs.add(norm_name) elif item.kind == 'street': streets.extend(self._retrieve_full_tokens(item.name)) elif item.kind == 'place': @@ -431,8 +433,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): addr_terms.append((item.kind, self._compute_partial_tokens(item.name))) if hnrs: - hnrs = self._split_housenumbers(hnrs) - token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs]) + token_info.add_housenumbers(self.conn, hnrs) if addr_terms: token_info.add_address_terms(addr_terms) @@ -545,24 +546,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): self._cache.postcodes.add(postcode) - @staticmethod - def _split_housenumbers(hnrs): - if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]: - # split numbers if 
necessary - simple_list = [] - for hnr in hnrs: - simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr))) - - if len(simple_list) > 1: - hnrs = list(set(simple_list)) - else: - hnrs = simple_list - - return hnrs - - - - class _TokenInfo: """ Collect token information to be sent back to the database. """ diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py new file mode 100644 index 00000000..5b592bcf --- /dev/null +++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Sanitizer that cleans and normalizes housenumbers. +""" +import re + +class _HousenumberSanitizer: + + def __init__(self, config): + pass + + + def __call__(self, obj): + if not obj.address: + return + + new_address = [] + for item in obj.address: + if item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'): + new_address.extend(item.clone(kind='housenumber', name=n) for n in self.sanitize(item.name)) + else: + # Don't touch other address items. + new_address.append(item) + + obj.address = new_address + + + def sanitize(self, value): + """ Extract housenumbers in a regularized format from an OSM value. + + The function works as a generator that yields all valid housenumbers + that can be created from the value. + """ + for hnr in self._split_number(value): + yield from self._regularize(hnr) + + + def _split_number(self, hnr): + for part in re.split(r'[;,]', hnr): + yield part.strip() + + + def _regularize(self, hnr): + yield hnr + + +def create(config): + """ Create a housenumber processing function. 
+ """ + + return _HousenumberSanitizer(config) diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index c6601faf..d00cffb9 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -27,6 +27,7 @@ transliteration: sanitizers: - step: split-name-list - step: strip-brace-terms + - step: clean-housenumbers - step: tag-analyzer-by-language filter-kind: [".*name.*"] whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi] diff --git a/test/bdd/db/query/housenumbers.feature b/test/bdd/db/query/housenumbers.feature new file mode 100644 index 00000000..63bd8984 --- /dev/null +++ b/test/bdd/db/query/housenumbers.feature @@ -0,0 +1,55 @@ +@DB +Feature: Searching of house numbers + Test for specialised treatment of housenumbers + + Background: + Given the grid + | 1 | | 2 | | 3 | + | | 9 | | | | + | | | | | 4 | + + + Scenario: A simple numeral housenumber is found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | 45 | 9 | + And the places + | osm | class | type | name | geometry | + | W10 | highway | path | North Road | 1,2,3 | + When importing + And sending search query "45, North Road" + Then results contain + | osm | + | N1 | + When sending search query "North Road 45" + Then results contain + | osm | + | N1 | + + + Scenario Outline: Each housenumber in a list is found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | <hnrs> | 9 | + And the places + | osm | class | type | name | geometry | + | W10 | highway | path | Multistr | 1,2,3 | + When importing + When sending search query "2 Multistr" + Then results contain + | osm | + | N1 | + When sending search query "4 Multistr" + Then results contain + | osm | + | N1 | + When sending search query "12 Multistr" + Then results contain + | osm | + | N1 | + + Examples: + | hnrs | + | 2;4;12 | + | 2,4,12 | + | 2, 4, 12 |