From 206ee8718864d623507a0ae69070478dec411e84 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Fri, 7 Jan 2022 22:41:09 +0100
Subject: [PATCH] factor out housenumber splitting into sanitizer

---
 nominatim/tokenizer/icu_tokenizer.py          | 29 ++--------
 .../sanitizers/clean_housenumbers.py          | 56 +++++++++++++++++++
 settings/icu_tokenizer.yaml                   |  1 +
 test/bdd/db/query/housenumbers.feature        | 55 ++++++++++++++++++
 4 files changed, 118 insertions(+), 23 deletions(-)
 create mode 100644 nominatim/tokenizer/sanitizers/clean_housenumbers.py
 create mode 100644 test/bdd/db/query/housenumbers.feature

diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 7b820c9d..cfbb44e3 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -413,14 +413,16 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
 
     def _process_place_address(self, token_info, address):
-        hnrs = []
+        hnrs = set()
         addr_terms = []
         streets = []
         for item in address:
             if item.kind == 'postcode':
                 self._add_postcode(item.name)
-            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(item.name)
+            elif item.kind == 'housenumber':
+                norm_name = self._make_standard_hnr(item.name)
+                if norm_name:
+                    hnrs.add(norm_name)
             elif item.kind == 'street':
                 streets.extend(self._retrieve_full_tokens(item.name))
             elif item.kind == 'place':
@@ -431,8 +433,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
-            hnrs = self._split_housenumbers(hnrs)
-            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+            token_info.add_housenumbers(self.conn, hnrs)
 
         if addr_terms:
             token_info.add_address_terms(addr_terms)
@@ -545,24 +546,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 self._cache.postcodes.add(postcode)
 
 
-    @staticmethod
-    def _split_housenumbers(hnrs):
-        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
-            # split numbers if necessary
-            simple_list = []
-            for hnr in hnrs:
-                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
-
-            if len(simple_list) > 1:
-                hnrs = list(set(simple_list))
-            else:
-                hnrs = simple_list
-
-        return hnrs
-
-
-
-
 class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
new file mode 100644
index 00000000..5b592bcf
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that cleans and normalizes housenumbers.
+"""
+import re
+
+class _HousenumberSanitizer:
+
+    def __init__(self, config):
+        pass
+
+
+    def __call__(self, obj):
+        if not obj.address:
+            return
+
+        new_address = []
+        for item in obj.address:
+            if item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                new_address.extend(item.clone(kind='housenumber', name=n) for n in self.sanitize(item.name))
+            else:
+                # Don't touch other address items.
+                new_address.append(item)
+
+        obj.address = new_address
+
+
+    def sanitize(self, value):
+        """ Extract housenumbers in a regularized format from an OSM value.
+
+            The function works as a generator that yields all valid housenumbers
+            that can be created from the value.
+        """
+        for hnr in self._split_number(value):
+            yield from self._regularize(hnr)
+
+
+    def _split_number(self, hnr):
+        for part in re.split(r'[;,]', hnr):
+            yield part.strip()
+
+
+    def _regularize(self, hnr):
+        yield hnr
+
+
+def create(config):
+    """ Create a housenumber processing function.
+    """
+
+    return _HousenumberSanitizer(config)
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index c6601faf..d00cffb9 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -27,6 +27,7 @@ transliteration:
 sanitizers:
     - step: split-name-list
     - step: strip-brace-terms
+    - step: clean-housenumbers
     - step: tag-analyzer-by-language
       filter-kind: [".*name.*"]
       whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
diff --git a/test/bdd/db/query/housenumbers.feature b/test/bdd/db/query/housenumbers.feature
new file mode 100644
index 00000000..63bd8984
--- /dev/null
+++ b/test/bdd/db/query/housenumbers.feature
@@ -0,0 +1,55 @@
+@DB
+Feature: Searching of house numbers
+    Test for specialised treatment of housenumbers
+
+    Background:
+        Given the grid
+         | 1 |   | 2 |   | 3 |
+         |   | 9 |   |   |   |
+         |   |   |   |   | 4 |
+
+
+    Scenario: A simple numeral housenumber is found
+        Given the places
+         | osm | class    | type | housenr | geometry |
+         | N1  | building | yes  | 45      | 9        |
+        And the places
+         | osm | class   | type | name       | geometry |
+         | W10 | highway | path | North Road | 1,2,3    |
+        When importing
+        And sending search query "45, North Road"
+        Then results contain
+         | osm |
+         | N1  |
+        When sending search query "North Road 45"
+        Then results contain
+         | osm |
+         | N1  |
+
+
+    Scenario Outline: Each housenumber in a list is found
+        Given the places
+         | osm | class    | type | housenr | geometry |
+         | N1  | building | yes  | <hnrs>  | 9        |
+        And the places
+         | osm | class   | type | name     | geometry |
+         | W10 | highway | path | Multistr | 1,2,3    |
+        When importing
+        When sending search query "2 Multistr"
+        Then results contain
+         | osm |
+         | N1  |
+        When sending search query "4 Multistr"
+        Then results contain
+         | osm |
+         | N1  |
+        When sending search query "12 Multistr"
+        Then results contain
+         | osm |
+         | N1  |
+
+     Examples:
+        | hnrs |
+        | 2;4;12 |
+        | 2,4,12 |
+        | 2, 4, 12 |