move get_addressdata() implementation to Python

The pgsql function get_addressdata() does a lookup of a lot of data
that is already available in Python.
This commit is contained in:
Sarah Hoffmann 2023-09-23 10:44:37 +02:00
parent 87c91ec5c4
commit 7fcbe13669
10 changed files with 261 additions and 116 deletions

View File

@ -187,6 +187,7 @@ BEGIN
-- --- Return the record for the base entry.
current_rank_address := 1000;
FOR location IN
SELECT placex.place_id, osm_type, osm_id, name,
coalesce(extratags->'linked_place', extratags->'place') as place_type,

View File

@ -183,9 +183,6 @@ async def get_detailed_place(conn: SearchConnection, place: ntyp.PlaceRef,
# add missing details
assert result is not None
result.parent_place_id = row.parent_place_id
result.linked_place_id = getattr(row, 'linked_place_id', None)
result.admin_level = getattr(row, 'admin_level', 15)
indexed_date = getattr(row, 'indexed_date', None)
if indexed_date is not None:
result.indexed_date = indexed_date.replace(tzinfo=dt.timezone.utc)

View File

@ -11,14 +11,15 @@ Data classes are part of the public API while the functions are for
internal use only. That's why they are implemented as free-standing functions
instead of member functions.
"""
from typing import Optional, Tuple, Dict, Sequence, TypeVar, Type, List, Any, Union
from typing import Optional, Tuple, Dict, Sequence, TypeVar, Type, List, cast
import enum
import dataclasses
import datetime as dt
import sqlalchemy as sa
from nominatim.typing import SaSelect, SaRow, SaColumn
from nominatim.typing import SaSelect, SaRow
from nominatim.db.sqlalchemy_functions import CrosscheckNames
from nominatim.api.types import Point, Bbox, LookupDetails
from nominatim.api.connection import SearchConnection
from nominatim.api.logging import log
@ -81,12 +82,6 @@ class AddressLine:
and its function as an address object. Most fields are optional.
Their presence depends on the kind and function of the address part.
"""
place_id: Optional[int]
""" Internal ID of the place.
"""
osm_object: Optional[Tuple[str, int]]
""" OSM type and ID of the place, if such an object exists.
"""
category: Tuple[str, str]
""" Main category of the place, described by a key-value pair.
"""
@ -94,16 +89,6 @@ class AddressLine:
""" All available names for the place including references, alternative
names and translations.
"""
extratags: Optional[Dict[str, str]]
""" Any extra information available about the place. This is a dictionary
that usually contains OSM tag key-value pairs.
"""
admin_level: Optional[int]
""" The administrative level of a boundary as tagged in the input data.
This field is only meaningful for places of the category
(boundary, administrative).
"""
fromarea: bool
""" If true, then the exact area of the place is known. Without area
information, Nominatim has to make an educated guess if an address
@ -123,6 +108,22 @@ class AddressLine:
distance: float
""" Distance in degrees between the result place and this address part.
"""
place_id: Optional[int] = None
""" Internal ID of the place.
"""
osm_object: Optional[Tuple[str, int]] = None
""" OSM type and ID of the place, if such an object exists.
"""
extratags: Optional[Dict[str, str]] = None
""" Any extra information available about the place. This is a dictionary
that usually contains OSM tag key-value pairs.
"""
admin_level: Optional[int] = None
""" The administrative level of a boundary as tagged in the input data.
This field is only meaningful for places of the category
(boundary, administrative).
"""
local_name: Optional[str] = None
""" Place holder for localization of this address part. See
@ -184,6 +185,9 @@ class BaseResult:
place_id : Optional[int] = None
osm_object: Optional[Tuple[str, int]] = None
parent_place_id: Optional[int] = None
linked_place_id: Optional[int] = None
admin_level: int = 15
locale_name: Optional[str] = None
display_name: Optional[str] = None
@ -251,9 +255,6 @@ class DetailedResult(BaseResult):
""" A search result with more internal information from the database
added.
"""
parent_place_id: Optional[int] = None
linked_place_id: Optional[int] = None
admin_level: int = 15
indexed_date: Optional[dt.datetime] = None
@ -311,6 +312,9 @@ def create_from_placex_row(row: Optional[SaRow],
place_id=row.place_id,
osm_object=(row.osm_type, row.osm_id),
category=(row.class_, row.type),
parent_place_id = row.parent_place_id,
linked_place_id = getattr(row, 'linked_place_id', None),
admin_level = getattr(row, 'admin_level', 15),
names=_mingle_name_tags(row.name),
address=row.address,
extratags=row.extratags,
@ -341,6 +345,7 @@ def create_from_osmline_row(row: Optional[SaRow],
res = class_type(source_table=SourceTable.OSMLINE,
place_id=row.place_id,
parent_place_id = row.parent_place_id,
osm_object=('W', row.osm_id),
category=('place', 'houses' if hnr is None else 'house'),
address=row.address,
@ -377,6 +382,7 @@ def create_from_tiger_row(row: Optional[SaRow],
res = class_type(source_table=SourceTable.TIGER,
place_id=row.place_id,
parent_place_id = row.parent_place_id,
osm_object=(osm_type or row.osm_type, osm_id or row.osm_id),
category=('place', 'houses' if hnr is None else 'house'),
postcode=row.postcode,
@ -405,6 +411,7 @@ def create_from_postcode_row(row: Optional[SaRow],
return class_type(source_table=SourceTable.POSTCODE,
place_id=row.place_id,
parent_place_id = row.parent_place_id,
category=('place', 'postcode'),
names={'ref': row.postcode},
rank_search=row.rank_search,
@ -457,17 +464,20 @@ async def add_result_details(conn: SearchConnection, results: List[BaseResultT],
result.localize(details.locales)
def _result_row_to_address_row(row: SaRow) -> AddressLine:
def _result_row_to_address_row(row: SaRow, isaddress: Optional[bool] = None) -> AddressLine:
""" Create a new AddressLine from the results of a datbase query.
"""
extratags: Dict[str, str] = getattr(row, 'extratags', {})
if hasattr(row, 'place_type') and row.place_type:
extratags['place'] = row.place_type
extratags: Dict[str, str] = getattr(row, 'extratags', {}) or {}
if 'linked_place' in extratags:
extratags['place'] = extratags['linked_place']
names = _mingle_name_tags(row.name) or {}
if getattr(row, 'housenumber', None) is not None:
names['housenumber'] = row.housenumber
if isaddress is None:
isaddress = getattr(row, 'isaddress', True)
return AddressLine(place_id=row.place_id,
osm_object=None if row.osm_type is None else (row.osm_type, row.osm_id),
category=(getattr(row, 'class'), row.type),
@ -475,7 +485,7 @@ def _result_row_to_address_row(row: SaRow) -> AddressLine:
extratags=extratags,
admin_level=row.admin_level,
fromarea=row.fromarea,
isaddress=getattr(row, 'isaddress', True),
isaddress=isaddress,
rank_address=row.rank_address,
distance=row.distance)
@ -498,73 +508,196 @@ def _get_housenumber_details(results: List[BaseResultT]) -> Tuple[List[int], Lis
return places, hnrs
def _get_address_lookup_id(result: BaseResultT) -> int:
assert result.place_id
if result.source_table != SourceTable.PLACEX or result.rank_search > 27:
return result.parent_place_id or result.place_id
return result.linked_place_id or result.place_id
async def _finalize_entry(conn: SearchConnection, result: BaseResultT) -> None:
assert result.address_rows
postcode = result.postcode
if not postcode and result.address:
postcode = result.address.get('postcode')
if postcode and ',' not in postcode and ';' not in postcode:
result.address_rows.append(AddressLine(
category=('place', 'postcode'),
names={'ref': postcode},
fromarea=False, isaddress=True, rank_address=5,
distance=0.0))
if result.country_code:
async def _get_country_names() -> Optional[Dict[str, str]]:
t = conn.t.country_name
sql = sa.select(t.c.name, t.c.derived_name)\
.where(t.c.country_code == result.country_code)
for cres in await conn.execute(sql):
names = cast(Dict[str, str], cres[0])
if cres[1]:
names.update(cast(Dict[str, str], cres[1]))
return names
return None
country_names = await conn.get_cached_value('COUNTRY_NAME',
result.country_code,
_get_country_names)
if country_names:
result.address_rows.append(AddressLine(
category=('place', 'country'),
names=country_names,
fromarea=False, isaddress=True, rank_address=4,
distance=0.0))
result.address_rows.append(AddressLine(
category=('place', 'country_code'),
names={'ref': result.country_code}, extratags = {},
fromarea=True, isaddress=False, rank_address=4,
distance=0.0))
def _setup_address_details(result: BaseResultT) -> None:
""" Retrieve information about places that make up the address of the result.
"""
result.address_rows = AddressLines()
if result.names:
result.address_rows.append(AddressLine(
place_id=result.place_id,
osm_object=result.osm_object,
category=result.category,
names=result.names,
extratags=result.extratags or {},
admin_level=result.admin_level,
fromarea=True, isaddress=True,
rank_address=result.rank_address, distance=0.0))
if result.source_table == SourceTable.PLACEX and result.address:
housenumber = result.address.get('housenumber')\
or result.address.get('streetnumber')\
or result.address.get('conscriptionnumber')
elif result.housenumber:
housenumber = result.housenumber
else:
housenumber = None
if housenumber:
result.address_rows.append(AddressLine(
category=('place', 'house_number'),
names={'ref': housenumber},
fromarea=True, isaddress=True, rank_address=28, distance=0))
if result.address and '_unlisted_place' in result.address:
result.address_rows.append(AddressLine(
category=('place', 'locality'),
names={'name': result.address['_unlisted_place']},
fromarea=False, isaddress=True, rank_address=25, distance=0))
async def complete_address_details(conn: SearchConnection, results: List[BaseResultT]) -> None:
""" Retrieve information about places that make up the address of the result.
"""
places, hnrs = _get_housenumber_details(results)
for result in results:
_setup_address_details(result)
if not places:
### Lookup entries from place_address line
lookup_ids = [{'pid': r.place_id,
'lid': _get_address_lookup_id(r),
'names': list(r.address.values()) if r.address else [],
'c': ('SRID=4326;' + r.centroid.to_wkt()) if r.centroid else '' }
for r in results if r.place_id]
if not lookup_ids:
return
def _get_addressdata(place_id: Union[int, SaColumn], hnr: Union[int, SaColumn]) -> Any:
return sa.func.get_addressdata(place_id, hnr)\
.table_valued( # type: ignore[no-untyped-call]
sa.column('place_id', type_=sa.Integer),
'osm_type',
sa.column('osm_id', type_=sa.BigInteger),
sa.column('name', type_=conn.t.types.Composite),
'class', 'type', 'place_type',
sa.column('admin_level', type_=sa.Integer),
sa.column('fromarea', type_=sa.Boolean),
sa.column('isaddress', type_=sa.Boolean),
sa.column('rank_address', type_=sa.SmallInteger),
sa.column('distance', type_=sa.Float),
joins_implicitly=True)
ltab = sa.func.json_array_elements(sa.type_coerce(lookup_ids, sa.JSON))\
.table_valued(sa.column('value', type_=sa.JSON)) # type: ignore[no-untyped-call]
t = conn.t.placex
taddr = conn.t.addressline
if len(places) == 1:
# Optimized case for exactly one result (reverse)
sql = sa.select(_get_addressdata(places[0], hnrs[0]))\
.order_by(sa.column('rank_address').desc(),
sa.column('isaddress').desc())
sql = sa.select(ltab.c.value['pid'].as_integer().label('src_place_id'),
t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.extratags,
t.c.admin_level, taddr.c.fromarea,
sa.case((t.c.rank_address == 11, 5),
else_=t.c.rank_address).label('rank_address'),
taddr.c.distance, t.c.country_code, t.c.postcode)\
.join(taddr, sa.or_(taddr.c.place_id == ltab.c.value['pid'].as_integer(),
taddr.c.place_id == ltab.c.value['lid'].as_integer()))\
.join(t, taddr.c.address_place_id == t.c.place_id)\
.order_by('src_place_id')\
.order_by(sa.column('rank_address').desc())\
.order_by((taddr.c.place_id == ltab.c.value['pid'].as_integer()).desc())\
.order_by(sa.case((CrosscheckNames(t.c.name, ltab.c.value['names']), 2),
(taddr.c.isaddress, 0),
(sa.and_(taddr.c.fromarea,
t.c.geometry.ST_Contains(
sa.func.ST_GeomFromEWKT(
ltab.c.value['c'].as_string()))), 1),
else_=-1).desc())\
.order_by(taddr.c.fromarea.desc())\
.order_by(taddr.c.distance.desc())\
.order_by(t.c.rank_search.desc())
alines = AddressLines()
for row in await conn.execute(sql):
alines.append(_result_row_to_address_row(row))
for result in results:
if result.place_id == places[0]:
result.address_rows = alines
return
darray = sa.func.unnest(conn.t.types.to_array(places), conn.t.types.to_array(hnrs))\
.table_valued( # type: ignore[no-untyped-call]
sa.column('place_id', type_= sa.Integer),
sa.column('housenumber', type_= sa.Integer)
).render_derived()
sfn = _get_addressdata(darray.c.place_id, darray.c.housenumber)
sql = sa.select(darray.c.place_id.label('result_place_id'), sfn)\
.order_by(darray.c.place_id,
sa.column('rank_address').desc(),
sa.column('isaddress').desc())
current_result = None
current_rank_address = -1
for row in await conn.execute(sql):
if current_result is None or row.result_place_id != current_result.place_id:
for result in results:
if result.place_id == row.result_place_id:
current_result = result
break
if current_result is None or row.src_place_id != current_result.place_id:
current_result = next((r for r in results if r.place_id == row.src_place_id), None)
assert current_result is not None
current_rank_address = -1
location_isaddress = row.rank_address != current_rank_address
if current_result.country_code is None and row.country_code:
current_result.country_code = row.country_code
if row.type in ('postcode', 'postal_code') and location_isaddress:
if not row.fromarea or \
(current_result.address and 'postcode' in current_result.address):
location_isaddress = False
else:
assert False
current_result.address_rows = AddressLines()
current_result.address_rows.append(_result_row_to_address_row(row))
current_result.postcode = None
assert current_result.address_rows is not None
current_result.address_rows.append(_result_row_to_address_row(row, location_isaddress))
current_rank_address = row.rank_address
for result in results:
await _finalize_entry(conn, result)
### Finally add the record for the parent entry where necessary.
parent_lookup_ids = list(filter(lambda e: e['pid'] != e['lid'], lookup_ids))
if parent_lookup_ids:
ltab = sa.func.json_array_elements(sa.type_coerce(parent_lookup_ids, sa.JSON))\
.table_valued(sa.column('value', type_=sa.JSON)) # type: ignore[no-untyped-call]
sql = sa.select(ltab.c.value['pid'].as_integer().label('src_place_id'),
t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.extratags,
t.c.admin_level,
t.c.rank_address)\
.where(t.c.place_id == ltab.c.value['lid'].as_integer())
for row in await conn.execute(sql):
current_result = next((r for r in results if r.place_id == row.src_place_id), None)
assert current_result is not None
assert current_result.address_rows is not None
current_result.address_rows.append(AddressLine(
place_id=row.place_id,
osm_object=(row.osm_type, row.osm_id),
category=(row.class_, row.type),
names=row.name, extratags=row.extratags or {},
admin_level=row.admin_level,
fromarea=True, isaddress=True,
rank_address=row.rank_address, distance=0.0))
### Now sort everything
for result in results:
assert result.address_rows is not None
result.address_rows.sort(key=lambda a: (-a.rank_address, a.isaddress))
# pylint: disable=consider-using-f-string
def _placex_select_address_row(conn: SearchConnection,
centroid: Point) -> SaSelect:
t = conn.t.placex
@ -575,9 +708,10 @@ def _placex_select_address_row(conn: SearchConnection,
('ST_Polygon','ST_MultiPolygon')""").label('fromarea'),
t.c.rank_address,
sa.literal_column(
"""ST_DistanceSpheroid(geometry, 'SRID=4326;POINT(%f %f)'::geometry,
f"""ST_DistanceSpheroid(geometry,
'SRID=4326;{centroid.to_wkt()}'::geometry,
'SPHEROID["WGS 84",6378137,298.257223563, AUTHORITY["EPSG","7030"]]')
""" % centroid).label('distance'))
""").label('distance'))
async def complete_linked_places(conn: SearchConnection, result: BaseResult) -> None:

View File

@ -57,6 +57,7 @@ def _select_from_placex(t: SaFromClause, use_wkt: bool = True) -> SaSelect:
t.c.importance, t.c.wikipedia,
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
centroid,
t.c.linked_place_id, t.c.admin_level,
distance.label('distance'),
t.c.geometry.ST_Expand(0).label('bbox'))

View File

@ -61,6 +61,7 @@ def _select_placex(t: SaFromClause) -> SaSelect:
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.importance, t.c.wikipedia,
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
t.c.linked_place_id, t.c.admin_level,
t.c.centroid,
t.c.geometry.ST_Expand(0).label('bbox'))
@ -580,7 +581,7 @@ class PlaceSearch(AbstractSearch):
sql: SaLambdaSelect = sa.lambda_stmt(lambda:
sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type,
t.c.address, t.c.extratags,
t.c.address, t.c.extratags, t.c.admin_level,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.wikipedia,
t.c.parent_place_id, t.c.rank_address, t.c.rank_search,

View File

@ -102,7 +102,8 @@ async def export(args: NominatimArgs) -> int:
async with api.begin() as conn, api.begin() as detail_conn:
t = conn.t.placex
sql = sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
sql = sa.select(t.c.place_id, t.c.parent_place_id,
t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.admin_level,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
@ -153,17 +154,15 @@ async def dump_results(conn: napi.SearchConnection,
results: List[ReverseResult],
writer: 'csv.DictWriter[str]',
lang: Optional[str]) -> None:
await add_result_details(conn, results,
LookupDetails(address_details=True))
locale = napi.Locales([lang] if lang else None)
await add_result_details(conn, results,
LookupDetails(address_details=True, locales=locale))
for result in results:
data = {'placeid': result.place_id,
'postcode': result.postcode}
result.localize(locale)
for line in (result.address_rows or []):
if line.isaddress and line.local_name:
if line.category[1] == 'postcode':

View File

@ -7,8 +7,15 @@
"""
Custom functions and expressions for SQLAlchemy.
"""
from typing import Any
import sqlalchemy as sa
from sqlalchemy.sql.expression import FunctionElement
from sqlalchemy.ext.compiler import compiles
from nominatim.typing import SaColumn
# pylint: disable=abstract-method,missing-function-docstring,consider-using-f-string
def select_index_placex_geometry_reverse_lookuppolygon(table: str) -> 'sa.TextClause':
""" Create an expression with the necessary conditions over a placex
@ -32,3 +39,18 @@ def select_index_placex_geometry_reverse_lookupplacenode(table: str) -> 'sa.Text
f" AND {table}.name is not null"
f" AND {table}.linked_place_id is null"
f" AND {table}.osm_type = 'N'")
class CrosscheckNames(FunctionElement[Any]):
""" Check if in the given list of names in parameters 1 any of the names
from the JSON array in parameter 2 are contained.
"""
name = 'CrosscheckNames'
inherit_cache = True
@compiles(CrosscheckNames) # type: ignore[no-untyped-call, misc]
def compile_crosscheck_names(element: SaColumn,
compiler: 'sa.Compiled', **kw: Any) -> str:
arg1, arg2 = list(element.clauses)
return "coalesce(avals(%s) && ARRAY(SELECT * FROM json_array_elements_text(%s)), false)" % (
compiler.process(arg1, **kw), compiler.process(arg2, **kw))

View File

@ -11,7 +11,7 @@ Feature: Querying fo postcode variants
When sending search query "399174"
Then results contain
| ID | type | display_name |
| 0 | postcode | 399174 |
| 0 | postcode | 399174, Singapore |
@fail-legacy
@ -25,11 +25,11 @@ Feature: Querying fo postcode variants
When sending search query "3993 DX"
Then results contain
| ID | type | display_name |
| 0 | postcode | 3993 DX |
| 0 | postcode | 3993 DX, Nederland |
When sending search query "3993dx"
Then results contain
| ID | type | display_name |
| 0 | postcode | 3993 DX |
| 0 | postcode | 3993 DX, Nederland |
Examples:
| postcode |
@ -49,7 +49,7 @@ Feature: Querying fo postcode variants
When sending search query "399174"
Then results contain
| ID | type | display_name |
| 0 | postcode | 399174 |
| 0 | postcode | 399174, Singapore |
@fail-legacy
@ -63,11 +63,11 @@ Feature: Querying fo postcode variants
When sending search query "675"
Then results contain
| ID | type | display_name |
| 0 | postcode | AD675 |
| 0 | postcode | AD675, Andorra |
When sending search query "AD675"
Then results contain
| ID | type | display_name |
| 0 | postcode | AD675 |
| 0 | postcode | AD675, Andorra |
Examples:
| postcode |
@ -89,9 +89,9 @@ Feature: Querying fo postcode variants
When sending search query "EH4 7EA"
Then results contain
| type | display_name |
| postcode | EH4 7EA |
| postcode | EH4 7EA, United Kingdom |
When sending search query "E4 7EA"
Then results contain
| type | display_name |
| postcode | E4 7EA |
| postcode | E4 7EA, United Kingdom |

View File

@ -340,12 +340,6 @@ def test_lookup_osmline_with_address_details(apiobj):
result = apiobj.api.details(napi.PlaceID(9000), address_details=True)
assert result.address_rows == [
napi.AddressLine(place_id=None, osm_object=None,
category=('place', 'house_number'),
names={'ref': '2'}, extratags={},
admin_level=None, fromarea=True, isaddress=True,
rank_address=28, distance=0.0,
local_name='2'),
napi.AddressLine(place_id=332, osm_object=('W', 4),
category=('highway', 'residential'),
names={'name': 'Street'}, extratags={},
@ -444,12 +438,6 @@ def test_lookup_tiger_with_address_details(apiobj):
result = apiobj.api.details(napi.PlaceID(9000), address_details=True)
assert result.address_rows == [
napi.AddressLine(place_id=None, osm_object=None,
category=('place', 'house_number'),
names={'ref': '2'}, extratags={},
admin_level=None, fromarea=True, isaddress=True,
rank_address=28, distance=0.0,
local_name='2'),
napi.AddressLine(place_id=332, osm_object=('W', 4),
category=('highway', 'residential'),
names={'name': 'Street'}, extratags={},
@ -543,6 +531,12 @@ def test_lookup_postcode_with_address_details(apiobj):
result = apiobj.api.details(napi.PlaceID(9000), address_details=True)
assert result.address_rows == [
napi.AddressLine(place_id=9000, osm_object=None,
category=('place', 'postcode'),
names={'ref': '34 425'}, extratags={},
admin_level=15, fromarea=True, isaddress=True,
rank_address=25, distance=0.0,
local_name='34 425'),
napi.AddressLine(place_id=332, osm_object=('N', 3333),
category=('place', 'suburb'),
names={'name': 'Smallplace'}, extratags={},
@ -555,12 +549,6 @@ def test_lookup_postcode_with_address_details(apiobj):
admin_level=15, fromarea=True, isaddress=True,
rank_address=16, distance=0.0,
local_name='Bigplace'),
napi.AddressLine(place_id=None, osm_object=None,
category=('place', 'postcode'),
names={'ref': '34 425'}, extratags={},
admin_level=None, fromarea=False, isaddress=True,
rank_address=5, distance=0.0,
local_name='34 425'),
napi.AddressLine(place_id=None, osm_object=None,
category=('place', 'country_code'),
names={'ref': 'gb'}, extratags={},

View File

@ -23,6 +23,8 @@ def mkpoint(x, y):
class FakeRow:
def __init__(self, **kwargs):
if 'parent_place_id' not in kwargs:
kwargs['parent_place_id'] = None
for k, v in kwargs.items():
setattr(self, k, v)
self._mapping = kwargs