From 2813bf18e67a6a4e947c7eb53f9d57058e27610d Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sat, 27 Jan 2024 20:48:40 +0100
Subject: [PATCH 1/6] avoid duplicates in the list of partial tokens for a
 query

This messes with the estimates for expected results.
---
 nominatim/api/search/db_search_builder.py | 29 +++++++++++------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py
index fd8cc7af..6d5fa41a 100644
--- a/nominatim/api/search/db_search_builder.py
+++ b/nominatim/api/search/db_search_builder.py
@@ -166,15 +166,15 @@ class SearchBuilder:
         sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
         expected_count = sum(t.count for t in hnrs)
 
-        partials = [t for trange in address
-                       for t in self.query.get_partials_list(trange)]
+        partials = {t.token: t.count for trange in address
+                       for t in self.query.get_partials_list(trange)}
 
         if expected_count < 8000:
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 [t.token for t in partials], lookups.Restrict))
-        elif len(partials) != 1 or partials[0].count < 10000:
+                                                 list(partials), lookups.Restrict))
+        elif len(partials) != 1 or list(partials.values())[0] < 10000:
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 [t.token for t in partials], lookups.LookupAll))
+                                                 list(partials), lookups.LookupAll))
         else:
             sdata.lookups.append(
                 dbf.FieldLookup('nameaddress_vector',
@@ -208,18 +208,17 @@ class SearchBuilder:
             are and tries to find a lookup that optimizes index use.
         """
         penalty = 0.0 # extra penalty
-        name_partials = self.query.get_partials_list(name)
-        name_tokens = [t.token for t in name_partials]
+        name_partials = {t.token: t for t in self.query.get_partials_list(name)}
 
         addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
-        addr_tokens = [t.token for t in addr_partials]
+        addr_tokens = list({t.token for t in addr_partials})
 
-        partials_indexed = all(t.is_indexed for t in name_partials) \
+        partials_indexed = all(t.is_indexed for t in name_partials.values()) \
                            and all(t.is_indexed for t in addr_partials)
-        exp_count = min(t.count for t in name_partials) / (2**(len(name_partials) - 1))
+        exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
 
         if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
-            yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
+            yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
             return
 
         # Partial term to frequent. Try looking up by rare full names first.
@@ -232,15 +231,15 @@ class SearchBuilder:
                 addr_tokens = [t.token for t in addr_partials if t.is_indexed]
                 penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
             # Any of the full names applies with all of the partials from the address
-            yield penalty, fulls_count / (2**len(addr_partials)),\
+            yield penalty, fulls_count / (2**len(addr_tokens)),\
                   dbf.lookup_by_any_name([t.token for t in name_fulls],
                                          addr_tokens, fulls_count > 10000)
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.
-        exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
-        if exp_count < 10000 and all(t.is_indexed for t in name_partials):
-            lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
+        exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
+        if exp_count < 10000 and all(t.is_indexed for t in name_partials.values()):
+            lookup = [dbf.FieldLookup('name_vector', list(name_partials.keys()), lookups.LookupAll)]
             if addr_tokens:
                 lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
             penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))

From 2703442fd2b470a986db0b8b4fbe3f3e6905dcb2 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sat, 27 Jan 2024 21:26:03 +0100
Subject: [PATCH 2/6] protect against very frequent bad partials

---
 nominatim/api/search/db_search_builder.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py
index 6d5fa41a..94c492c2 100644
--- a/nominatim/api/search/db_search_builder.py
+++ b/nominatim/api/search/db_search_builder.py
@@ -233,7 +233,8 @@ class SearchBuilder:
             # Any of the full names applies with all of the partials from the address
             yield penalty, fulls_count / (2**len(addr_tokens)),\
                   dbf.lookup_by_any_name([t.token for t in name_fulls],
-                                         addr_tokens, fulls_count > 10000)
+                                         addr_tokens,
+                                         fulls_count > 30000 / max(1, len(addr_tokens)))
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.

From fed46240d5af3a2543e7cac930a862d6a0abde7b Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sun, 28 Jan 2024 11:35:30 +0100
Subject: [PATCH 3/6] disallow category tokens in the middle of a query string

This already worked for left-to-right readings and now is also
implemented for right-to-left reading. A qualifier must always be
before or after the name.
---
 nominatim/api/search/token_assignment.py        |  5 +++++
 test/python/api/search/test_token_assignment.py | 11 +++++++++++
 2 files changed, 16 insertions(+)

diff --git a/nominatim/api/search/token_assignment.py b/nominatim/api/search/token_assignment.py
index d94d6903..b874dcd7 100644
--- a/nominatim/api/search/token_assignment.py
+++ b/nominatim/api/search/token_assignment.py
@@ -132,6 +132,11 @@ class _TokenSequence:
 
         # Name tokens are always acceptable and don't change direction
         if ttype == qmod.TokenType.PARTIAL:
+            # qualifiers cannot appear in the middle of the qeury. They need
+            # to be near the next phrase.
+            if self.direction == -1 \
+               and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]):
+                return None
             return self.direction
 
         # Other tokens may only appear once
diff --git a/test/python/api/search/test_token_assignment.py b/test/python/api/search/test_token_assignment.py
index 2ed55a0f..54e8af14 100644
--- a/test/python/api/search/test_token_assignment.py
+++ b/test/python/api/search/test_token_assignment.py
@@ -337,3 +337,14 @@ def test_qualifier_after_housenumber():
                    (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))
 
     check_assignments(yield_token_assignments(q))
+
+
+def test_qualifier_in_middle_of_phrase():
+    q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
+                   (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
+                   (BreakType.WORD, PhraseType.NONE, [(3, TokenType.QUALIFIER)]),
+                   (BreakType.WORD, PhraseType.NONE, [(4, TokenType.PARTIAL)]),
+                   (BreakType.PHRASE, PhraseType.NONE, [(5, TokenType.PARTIAL)]))
+
+    check_assignments(yield_token_assignments(q))
+

From f9ba7a465a8192f89b4150a43b247d8dd29087cd Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sun, 28 Jan 2024 11:48:03 +0100
Subject: [PATCH 4/6] always add a penalty for name + address search fallback

If there already was a search by full names, the search is likely
a repeatition that yields the same results, only running slower.
---
 nominatim/api/search/db_search_builder.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py
index 94c492c2..62b4e1b5 100644
--- a/nominatim/api/search/db_search_builder.py
+++ b/nominatim/api/search/db_search_builder.py
@@ -243,7 +243,8 @@ class SearchBuilder:
             lookup = [dbf.FieldLookup('name_vector', list(name_partials.keys()), lookups.LookupAll)]
             if addr_tokens:
                 lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
-            penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
+            penalty += 0.35 * max(1 if name_fulls else 0.1,
+                                  5 - len(name_partials) - len(addr_tokens))
             yield penalty, exp_count, lookup
 
 

From 103800a732759d044bcc767d70a1ef943db53f09 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sun, 28 Jan 2024 17:54:22 +0100
Subject: [PATCH 5/6] adjust rankings for housenumber-only searches

A normal address search with housenumber will use name rankings for
the street name. This is slightly different than weighing for
address parts. Use the same ranking for the first part of the
address for housenumber-only searches to make sure that penalties
remain comparable.
---
 nominatim/api/search/db_search_builder.py | 55 +++++++++++++++++------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py
index 62b4e1b5..3d5796c5 100644
--- a/nominatim/api/search/db_search_builder.py
+++ b/nominatim/api/search/db_search_builder.py
@@ -248,7 +248,8 @@ class SearchBuilder:
             yield penalty, exp_count, lookup
 
 
-    def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
+    def get_name_ranking(self, trange: TokenRange,
+                         db_field: str = 'name_vector') -> dbf.FieldRanking:
         """ Create a ranking expression for a name term in the given range.
         """
         name_fulls = self.query.get_tokens(trange, TokenType.WORD)
@@ -257,7 +258,7 @@ class SearchBuilder:
         # Fallback, sum of penalty for partials
         name_partials = self.query.get_partials_list(trange)
         default = sum(t.penalty for t in name_partials) + 0.2
-        return dbf.FieldRanking('name_vector', default, ranks)
+        return dbf.FieldRanking(db_field, default, ranks)
 
 
     def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
@@ -315,11 +316,9 @@ class SearchBuilder:
         sdata = dbf.SearchData()
         sdata.penalty = assignment.penalty
         if assignment.country:
-            tokens = self.query.get_tokens(assignment.country, TokenType.COUNTRY)
-            if self.details.countries:
-                tokens = [t for t in tokens if t.lookup_word in self.details.countries]
-                if not tokens:
-                    return None
+            tokens = self.get_country_tokens(assignment.country)
+            if not tokens:
+                return None
             sdata.set_strings('countries', tokens)
         elif self.details.countries:
             sdata.countries = dbf.WeightedStrings(self.details.countries,
@@ -333,24 +332,54 @@ class SearchBuilder:
                               self.query.get_tokens(assignment.postcode,
                                                     TokenType.POSTCODE))
         if assignment.qualifier:
-            tokens = self.query.get_tokens(assignment.qualifier, TokenType.QUALIFIER)
-            if self.details.categories:
-                tokens = [t for t in tokens if t.get_category() in self.details.categories]
-                if not tokens:
-                    return None
+            tokens = self.get_qualifier_tokens(assignment.qualifier)
+            if not tokens:
+                return None
             sdata.set_qualifiers(tokens)
         elif self.details.categories:
             sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
                                                       [0.0] * len(self.details.categories))
 
         if assignment.address:
-            sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
+            if not assignment.name and assignment.housenumber:
+                # housenumber search: the first item needs to be handled like
+                # a name in ranking or penalties are not comparable with
+                # normal searches.
+                sdata.set_ranking([self.get_name_ranking(assignment.address[0],
+                                                         db_field='nameaddress_vector')]
+                                  + [self.get_addr_ranking(r) for r in assignment.address[1:]])
+            else:
+                sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
         else:
             sdata.rankings = []
 
         return sdata
 
 
+    def get_country_tokens(self, trange: TokenRange) -> List[Token]:
+        """ Return the list of country tokens for the given range,
+            optionally filtered by the country list from the details
+            parameters.
+        """
+        tokens = self.query.get_tokens(trange, TokenType.COUNTRY)
+        if self.details.countries:
+            tokens = [t for t in tokens if t.lookup_word in self.details.countries]
+
+        return tokens
+
+
+    def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
+        """ Return the list of qualifier tokens for the given range,
+            optionally filtered by the qualifier list from the details
+            parameters.
+        """
+        tokens = self.query.get_tokens(trange, TokenType.QUALIFIER)
+        if self.details.categories:
+            tokens = [t for t in tokens if t.get_category() in self.details.categories]
+
+        return tokens
+
+
     def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
         """ Collect tokens for near items search or use the categories
             requested per parameter.

From f07f8530a824ed7b4d26e8d6697c57a69426919e Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sun, 28 Jan 2024 18:09:48 +0100
Subject: [PATCH 6/6] housenumber-only searches cannot be combined with
 qualifiers

---
 nominatim/api/search/token_assignment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nominatim/api/search/token_assignment.py b/nominatim/api/search/token_assignment.py
index b874dcd7..7a53a20e 100644
--- a/nominatim/api/search/token_assignment.py
+++ b/nominatim/api/search/token_assignment.py
@@ -390,7 +390,7 @@ class _TokenSequence:
                 yield from self._get_assignments_address_backward(base, query)
 
             # variant for special housenumber searches
-            if base.housenumber:
+            if base.housenumber and not base.qualifier:
                 yield dataclasses.replace(base, penalty=self.penalty)