Merge pull request #3206 from lonvia/rerank-by-locale

Reintroduce result reranking
This commit is contained in:
Sarah Hoffmann 2023-09-21 09:24:18 +02:00 committed by GitHub
commit 87c91ec5c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 145 additions and 73 deletions

View File

@ -292,12 +292,6 @@ class SearchResults(List[SearchResult]):
May be empty when no result was found.
"""
def localize(self, locales: Locales) -> None:
""" Apply the given locales to all results.
"""
for result in self:
result.localize(locales)
def _filter_geometries(row: SaRow) -> Dict[str, str]:
return {k[9:]: v for k, v in row._mapping.items() # pylint: disable=W0212
@ -459,6 +453,8 @@ async def add_result_details(conn: SearchConnection, results: List[BaseResultT],
log().comment('Query keywords')
for result in results:
await complete_keywords(conn, result)
for result in results:
result.localize(details.locales)
def _result_row_to_address_row(row: SaRow) -> AddressLine:

View File

@ -7,13 +7,15 @@
"""
Public interface to the search code.
"""
from typing import List, Any, Optional, Iterator, Tuple
from typing import List, Any, Optional, Iterator, Tuple, Dict
import itertools
import re
import datetime as dt
import difflib
from nominatim.api.connection import SearchConnection
from nominatim.api.types import SearchDetails
from nominatim.api.results import SearchResults, add_result_details
from nominatim.api.results import SearchResult, SearchResults, add_result_details
from nominatim.api.search.token_assignment import yield_token_assignments
from nominatim.api.search.db_search_builder import SearchBuilder, build_poi_search, wrap_near_search
from nominatim.api.search.db_searches import AbstractSearch
@ -73,42 +75,86 @@ class ForwardGeocoder:
is found.
"""
log().section('Execute database searches')
results = SearchResults()
results: Dict[Any, SearchResult] = {}
end_time = dt.datetime.now() + self.timeout
num_results = 0
min_ranking = 1000.0
prev_penalty = 0.0
for i, search in enumerate(searches):
if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
break
log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
for result in await search.lookup(self.conn, self.params):
results.append(result)
lookup_results = await search.lookup(self.conn, self.params)
for result in lookup_results:
rhash = (result.source_table, result.place_id,
result.housenumber, result.country_code)
prevresult = results.get(rhash)
if prevresult:
prevresult.accuracy = min(prevresult.accuracy, result.accuracy)
else:
results[rhash] = result
min_ranking = min(min_ranking, result.ranking + 0.5, search.penalty + 0.3)
log().result_dump('Results', ((r.accuracy, r) for r in results[num_results:]))
num_results = len(results)
log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
prev_penalty = search.penalty
if dt.datetime.now() >= end_time:
break
return SearchResults(results.values())
def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
""" Remove badly matching results, sort by ranking and
limit to the configured number of results.
"""
if results:
min_ranking = min(r.ranking for r in results)
results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5)
results.sort(key=lambda r: r.ranking)
if results:
min_rank = min(r.rank_search for r in results)
min_rank = results[0].rank_search
results = SearchResults(r for r in results
if r.ranking + 0.05 * (r.rank_search - min_rank)
< min_ranking + 0.5)
results.sort(key=lambda r: r.accuracy - r.calculated_importance())
results = SearchResults(results[:self.limit])
return results
def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
""" Adjust the accuracy of the localized result according to how well
they match the original query.
"""
assert self.query_analyzer is not None
qwords = [word for phrase in query.source
for word in re.split('[, ]+', phrase.text) if word]
if not qwords:
return
for result in results:
if not result.display_name:
continue
distance = 0.0
norm = self.query_analyzer.normalize_text(result.display_name)
words = set((w for w in norm.split(' ') if w))
if not words:
continue
for qword in qwords:
wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
if wdist < 0.5:
distance += len(qword)
else:
distance += (1.0 - wdist) * len(qword)
# Compensate for the fact that country names do not get a
# match penalty yet by the tokenizer.
# Temporary hack that needs to be removed!
if result.rank_address == 4:
distance *= 2
result.accuracy += distance * 0.4 / sum(len(w) for w in qwords)
async def lookup_pois(self, categories: List[Tuple[str, str]],
phrases: List[Phrase]) -> SearchResults:
""" Look up places by category. If phrase is given, a place search
@ -123,13 +169,16 @@ class ForwardGeocoder:
if query:
searches = [wrap_near_search(categories, s) for s in searches[:50]]
results = await self.execute_searches(query, searches)
await add_result_details(self.conn, results, self.params)
log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
results = self.sort_and_cut_results(results)
else:
results = SearchResults()
else:
search = build_poi_search(categories, self.params.countries)
results = await search.lookup(self.conn, self.params)
await add_result_details(self.conn, results, self.params)
await add_result_details(self.conn, results, self.params)
log().result_dump('Final Results', ((r.accuracy, r) for r in results))
return results
@ -150,6 +199,10 @@ class ForwardGeocoder:
# Execute SQL until an appropriate result is found.
results = await self.execute_searches(query, searches[:50])
await add_result_details(self.conn, results, self.params)
log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
self.rerank_by_query(query, results)
log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
results = self.sort_and_cut_results(results)
log().result_dump('Final Results', ((r.accuracy, r) for r in results))
return results

View File

@ -127,6 +127,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
return query
def normalize_text(self, text: str) -> str:
""" Bring the given text into a normalized form.
This only removes case, so some difference with the normalization
in the phrase remains.
"""
return text.lower()
def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
Dict[str, List[qmod.TokenRange]]]:
""" Transliterate the phrases and split them into tokens.

View File

@ -30,6 +30,15 @@ class AbstractQueryAnalyzer(ABC):
"""
@abstractmethod
def normalize_text(self, text: str) -> str:
""" Bring the given text into a normalized form. That is the
standardized form search will work with. All information removed
at this stage is inevitably lost.
"""
async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
""" Create a query analyzer for the tokenizer used by the database.
"""

View File

@ -17,6 +17,7 @@ from struct import unpack
from binascii import unhexlify
from nominatim.errors import UsageError
from nominatim.api.localization import Locales
# pylint: disable=no-member,too-many-boolean-expressions,too-many-instance-attributes
@ -386,7 +387,7 @@ TParam = TypeVar('TParam', bound='LookupDetails') # pylint: disable=invalid-name
@dataclasses.dataclass
class LookupDetails:
""" Collection of parameters that define the amount of details
""" Collection of parameters that define which kind of details are
returned with a lookup or details result.
"""
geometry_output: GeometryFormat = GeometryFormat.NONE
@ -413,6 +414,9 @@ class LookupDetails:
0.0 means the original geometry is kept. The higher the value, the
more the geometry gets simplified.
"""
locales: Locales = Locales()
""" Prefered languages for localization of results.
"""
@classmethod
def from_kwargs(cls: Type[TParam], kwargs: Dict[str, Any]) -> TParam:

View File

@ -308,7 +308,8 @@ async def details_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) ->
keywords=params.get_bool('keywords', False),
geometry_output = napi.GeometryFormat.GEOJSON
if params.get_bool('polygon_geojson', False)
else napi.GeometryFormat.NONE
else napi.GeometryFormat.NONE,
locales=locales
)
if debug:
@ -317,8 +318,6 @@ async def details_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) ->
if result is None:
params.raise_error('No place with that OSM ID found.', status=404)
result.localize(locales)
output = formatting.format_result(result, fmt,
{'locales': locales,
'group_hierarchy': params.get_bool('group_hierarchy', False),
@ -337,6 +336,7 @@ async def reverse_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) ->
details = params.parse_geometry_details(fmt)
details['max_rank'] = helpers.zoom_to_rank(params.get_int('zoom', 18))
details['layers'] = params.get_layers()
details['locales'] = napi.Locales.from_accept_languages(params.get_accepted_languages())
result = await api.reverse(coord, **details)
@ -357,9 +357,6 @@ async def reverse_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) ->
'namedetails': params.get_bool('namedetails', False),
'addressdetails': params.get_bool('addressdetails', True)}
if result:
result.localize(napi.Locales.from_accept_languages(params.get_accepted_languages()))
output = formatting.format_result(napi.ReverseResults([result] if result else []),
fmt, fmt_options)
@ -372,6 +369,7 @@ async def lookup_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> A
fmt = params.parse_format(napi.SearchResults, 'xml')
debug = params.setup_debugging()
details = params.parse_geometry_details(fmt)
details['locales'] = napi.Locales.from_accept_languages(params.get_accepted_languages())
places = []
for oid in (params.get('osm_ids') or '').split(','):
@ -394,8 +392,6 @@ async def lookup_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> A
'namedetails': params.get_bool('namedetails', False),
'addressdetails': params.get_bool('addressdetails', True)}
results.localize(napi.Locales.from_accept_languages(params.get_accepted_languages()))
output = formatting.format_result(results, fmt, fmt_options)
return params.build_response(output, num_results=len(results))
@ -456,6 +452,8 @@ async def search_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> A
else:
details['layers'] = params.get_layers()
details['locales'] = napi.Locales.from_accept_languages(params.get_accepted_languages())
# unstructured query parameters
query = params.get('q', None)
# structured query parameters
@ -480,8 +478,6 @@ async def search_endpoint(api: napi.NominatimAPIAsync, params: ASGIAdaptor) -> A
except UsageError as err:
params.raise_error(str(err))
results.localize(napi.Locales.from_accept_languages(params.get_accepted_languages()))
if details['dedupe'] and len(results) > 1:
results = helpers.deduplicate_results(results, max_results)

View File

@ -109,7 +109,8 @@ class APISearch:
'countries': args.countrycodes,
'excluded': args.exclude_place_ids,
'viewbox': args.viewbox,
'bounded_viewbox': args.bounded
'bounded_viewbox': args.bounded,
'locales': args.get_locales(api.config.DEFAULT_LANGUAGE)
}
if args.query:
@ -124,9 +125,6 @@ class APISearch:
country=args.country,
**params)
for result in results:
result.localize(args.get_locales(api.config.DEFAULT_LANGUAGE))
if args.dedupe and len(results) > 1:
results = deduplicate_results(results, args.limit)
@ -187,14 +185,14 @@ class APIReverse:
layers=args.get_layers(napi.DataLayer.ADDRESS | napi.DataLayer.POI),
address_details=True, # needed for display name
geometry_output=args.get_geometry_output(),
geometry_simplification=args.polygon_threshold)
geometry_simplification=args.polygon_threshold,
locales=args.get_locales(api.config.DEFAULT_LANGUAGE))
if args.format == 'debug':
print(loglib.get_and_disable())
return 0
if result:
result.localize(args.get_locales(api.config.DEFAULT_LANGUAGE))
output = api_output.format_result(
napi.ReverseResults([result]),
args.format,
@ -249,10 +247,8 @@ class APILookup:
results = api.lookup(places,
address_details=True, # needed for display name
geometry_output=args.get_geometry_output(),
geometry_simplification=args.polygon_threshold or 0.0)
for result in results:
result.localize(args.get_locales(api.config.DEFAULT_LANGUAGE))
geometry_simplification=args.polygon_threshold or 0.0,
locales=args.get_locales(api.config.DEFAULT_LANGUAGE))
output = api_output.format_result(
results,
@ -326,6 +322,7 @@ class APIDetails:
api = napi.NominatimAPI(args.project_dir)
locales = args.get_locales(api.config.DEFAULT_LANGUAGE)
result = api.details(place,
address_details=args.addressdetails,
linked_places=args.linkedplaces,
@ -333,13 +330,11 @@ class APIDetails:
keywords=args.keywords,
geometry_output=napi.GeometryFormat.GEOJSON
if args.polygon_geojson
else napi.GeometryFormat.NONE)
else napi.GeometryFormat.NONE,
locales=locales)
if result:
locales = args.get_locales(api.config.DEFAULT_LANGUAGE)
result.localize(locales)
output = api_output.format_result(
result,
'json',

View File

@ -56,7 +56,7 @@ Feature: Structured search queries
| Liechtenstein |
And results contain
| class | type |
| amenity | ^(pub)\|(bar) |
| amenity | ^(pub)\|(bar)\|(restaurant) |
#176
Scenario: Structured search restricts rank

View File

@ -150,17 +150,20 @@ def test_lookup_placex_with_address_details(apiobj):
category=('highway', 'residential'),
names={'name': 'Street'}, extratags={},
admin_level=15, fromarea=True, isaddress=True,
rank_address=26, distance=0.0),
rank_address=26, distance=0.0,
local_name='Street'),
napi.AddressLine(place_id=1000, osm_object=('N', 3333),
category=('place', 'suburb'),
names={'name': 'Smallplace'}, extratags={},
admin_level=13, fromarea=False, isaddress=True,
rank_address=23, distance=0.0034),
rank_address=23, distance=0.0034,
local_name='Smallplace'),
napi.AddressLine(place_id=1001, osm_object=('N', 3334),
category=('place', 'city'),
names={'name': 'Bigplace'}, extratags={},
admin_level=15, fromarea=True, isaddress=True,
rank_address=16, distance=0.0),
rank_address=16, distance=0.0,
local_name='Bigplace'),
napi.AddressLine(place_id=None, osm_object=None,
category=('place', 'country_code'),
names={'ref': 'pl'}, extratags={},
@ -341,22 +344,26 @@ def test_lookup_osmline_with_address_details(apiobj):
category=('place', 'house_number'),
names={'ref': '2'}, extratags={},
admin_level=None, fromarea=True, isaddress=True,
rank_address=28, distance=0.0),
rank_address=28, distance=0.0,
local_name='2'),
napi.AddressLine(place_id=332, osm_object=('W', 4),
category=('highway', 'residential'),
names={'name': 'Street'}, extratags={},
admin_level=15, fromarea=True, isaddress=True,
rank_address=26, distance=0.0),
rank_address=26, distance=0.0,
local_name='Street'),
napi.AddressLine(place_id=1000, osm_object=('N', 3333),
category=('place', 'suburb'),
names={'name': 'Smallplace'}, extratags={},
admin_level=13, fromarea=False, isaddress=True,
rank_address=23, distance=0.0034),
rank_address=23, distance=0.0034,
local_name='Smallplace'),
napi.AddressLine(place_id=1001, osm_object=('N', 3334),
category=('place', 'city'),
names={'name': 'Bigplace'}, extratags={},
admin_level=15, fromarea=True, isaddress=True,
rank_address=16, distance=0.0),
rank_address=16, distance=0.0,
local_name='Bigplace'),
napi.AddressLine(place_id=None, osm_object=None,
category=('place', 'country_code'),
names={'ref': 'pl'}, extratags={},
@ -441,22 +448,26 @@ def test_lookup_tiger_with_address_details(apiobj):
category=('place', 'house_number'),
names={'ref': '2'}, extratags={},
admin_level=None, fromarea=True, isaddress=True,
rank_address=28, distance=0.0),
rank_address=28, distance=0.0,
local_name='2'),
napi.AddressLine(place_id=332, osm_object=('W', 4),
category=('highway', 'residential'),
names={'name': 'Street'}, extratags={},
admin_level=15, fromarea=True, isaddress=True,
rank_address=26, distance=0.0),
rank_address=26, distance=0.0,
local_name='Street'),
napi.AddressLine(place_id=1000, osm_object=('N', 3333),
category=('place', 'suburb'),
names={'name': 'Smallplace'}, extratags={},
admin_level=13, fromarea=False, isaddress=True,
rank_address=23, distance=0.0034),
rank_address=23, distance=0.0034,
local_name='Smallplace'),
napi.AddressLine(place_id=1001, osm_object=('N', 3334),
category=('place', 'city'),
names={'name': 'Bigplace'}, extratags={},
admin_level=15, fromarea=True, isaddress=True,
rank_address=16, distance=0.0),
rank_address=16, distance=0.0,
local_name='Bigplace'),
napi.AddressLine(place_id=None, osm_object=None,
category=('place', 'country_code'),
names={'ref': 'us'}, extratags={},
@ -536,17 +547,20 @@ def test_lookup_postcode_with_address_details(apiobj):
category=('place', 'suburb'),
names={'name': 'Smallplace'}, extratags={},
admin_level=13, fromarea=True, isaddress=True,
rank_address=23, distance=0.0),
rank_address=23, distance=0.0,
local_name='Smallplace'),
napi.AddressLine(place_id=1001, osm_object=('N', 3334),
category=('place', 'city'),
names={'name': 'Bigplace'}, extratags={},
admin_level=15, fromarea=True, isaddress=True,
rank_address=16, distance=0.0),
rank_address=16, distance=0.0,
local_name='Bigplace'),
napi.AddressLine(place_id=None, osm_object=None,
category=('place', 'postcode'),
names={'ref': '34 425'}, extratags={},
admin_level=None, fromarea=False, isaddress=True,
rank_address=5, distance=0.0),
rank_address=5, distance=0.0,
local_name='34 425'),
napi.AddressLine(place_id=None, osm_object=None,
category=('place', 'country_code'),
names={'ref': 'gb'}, extratags={},

View File

@ -67,7 +67,9 @@ class TestCliReverseCall:
result = napi.ReverseResult(napi.SourceTable.PLACEX, ('place', 'thing'),
napi.Point(1.0, -3.0),
names={'name':'Name', 'name:fr': 'Nom'},
extratags={'extra':'Extra'})
extratags={'extra':'Extra'},
locale_name='Name',
display_name='Name')
monkeypatch.setattr(napi.NominatimAPI, 'reverse',
lambda *args, **kwargs: result)
@ -109,16 +111,6 @@ class TestCliReverseCall:
assert out['type'] == 'FeatureCollection'
def test_reverse_language(self, cli_call, tmp_path, capsys):
result = cli_call('reverse', '--project-dir', str(tmp_path),
'--lat', '34', '--lon', '34', '--lang', 'fr')
assert result == 0
out = json.loads(capsys.readouterr().out)
assert out['name'] == 'Nom'
class TestCliLookupCall:
@pytest.fixture(autouse=True)
@ -126,7 +118,9 @@ class TestCliLookupCall:
result = napi.SearchResult(napi.SourceTable.PLACEX, ('place', 'thing'),
napi.Point(1.0, -3.0),
names={'name':'Name', 'name:fr': 'Nom'},
extratags={'extra':'Extra'})
extratags={'extra':'Extra'},
locale_name='Name',
display_name='Name')
monkeypatch.setattr(napi.NominatimAPI, 'lookup',
lambda *args, **kwargs: napi.SearchResults([result]))
@ -150,9 +144,11 @@ class TestCliLookupCall:
])
def test_search(cli_call, tmp_path, capsys, monkeypatch, endpoint, params):
result = napi.SearchResult(napi.SourceTable.PLACEX, ('place', 'thing'),
napi.Point(1.0, -3.0),
names={'name':'Name', 'name:fr': 'Nom'},
extratags={'extra':'Extra'})
napi.Point(1.0, -3.0),
names={'name':'Name', 'name:fr': 'Nom'},
extratags={'extra':'Extra'},
locale_name='Name',
display_name='Name')
monkeypatch.setattr(napi.NominatimAPI, endpoint,
lambda *args, **kwargs: napi.SearchResults([result]))