From 10a5424a71022a787902b86ddcefedb8688bb2b5 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 6 Jan 2024 17:49:58 +0100 Subject: [PATCH] do not run near queries on qualifier words There is too much potential for confusion (e.g. 'Rio Grande' read as 'river near Grande') for too little gain. Use near phrases instead. --- nominatim/api/search/icu_tokenizer.py | 5 ----- test/python/api/search/test_icu_query_analyzer.py | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py index eabd329d..72e0f547 100644 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@ -8,7 +8,6 @@ Implementation of query analysis for the ICU tokenizer. """ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast -from copy import copy from collections import defaultdict import dataclasses import difflib @@ -188,10 +187,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) else: query.add_token(trange, qmod.TokenType.QUALIFIER, token) - if trange.start == 0 or trange.end == query.num_token_slots(): - token = copy(token) - token.penalty += 0.1 * (query.num_token_slots()) - query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) else: query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token) diff --git a/test/python/api/search/test_icu_query_analyzer.py b/test/python/api/search/test_icu_query_analyzer.py index a88ca8b8..6a17e32a 100644 --- a/test/python/api/search/test_icu_query_analyzer.py +++ b/test/python/api/search/test_icu_query_analyzer.py @@ -148,9 +148,9 @@ async def test_qualifier_words(conn): query = await ana.analyze_query(make_phrase('foo BAR foo BAR foo')) assert query.num_token_slots() == 5 - assert set(t.ttype for t in query.nodes[0].starting) == {TokenType.NEAR_ITEM, TokenType.QUALIFIER} + assert set(t.ttype for t in query.nodes[0].starting) == {TokenType.QUALIFIER} 
assert set(t.ttype for t in query.nodes[2].starting) == {TokenType.QUALIFIER} - assert set(t.ttype for t in query.nodes[4].starting) == {TokenType.NEAR_ITEM, TokenType.QUALIFIER} + assert set(t.ttype for t in query.nodes[4].starting) == {TokenType.QUALIFIER} @pytest.mark.asyncio