From 645ea5a057346a698f007c0b9e35a58c9ed11755 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 30 Jun 2023 11:08:25 +0200 Subject: [PATCH 1/3] use information from tokenizer to determine street vs. place address So far the SQL logic used the information from the address field to determine if an address is attached to a street or place. This changes the logic to use the information provided in the token_info. This allows sanitizers to enforce a certain parenting without changing the visible address information. --- docs/develop/Tokenizers.md | 22 ++++++++++++++++++++++ lib-sql/functions/placex_triggers.sql | 4 ++-- lib-sql/tokenizer/icu_tokenizer.sql | 9 ++++++++- lib-sql/tokenizer/legacy_tokenizer.sql | 7 +++++++ nominatim/tokenizer/icu_tokenizer.py | 6 ++++-- test/python/tokenizer/test_icu.py | 4 ++-- 6 files changed, 45 insertions(+), 7 deletions(-) diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md index 273e65e2..eb0d4ea2 100644 --- a/docs/develop/Tokenizers.md +++ b/docs/develop/Tokenizers.md @@ -189,6 +189,28 @@ a house number token text. If a place has multiple house numbers they must be listed with a semicolon as delimiter. Must be NULL when the place has no house numbers. +```sql +FUNCTION token_is_street_address(info JSONB) RETURNS BOOLEAN +``` + +Return true if this is an object that should be parented against a street. +Only relevant for objects with address rank 30. + +```sql +FUNCTION token_has_addr_street(info JSONB) RETURNS BOOLEAN +``` + +Return true if there are street names to match against for finding the +parent of the object. + + +```sql +FUNCTION token_has_addr_place(info JSONB) RETURNS BOOLEAN +``` + +Return true if there are place names to match against for finding the +parent of the object. + ```sql FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN ``` diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 99d2872f..a252e9aa 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -996,7 +996,7 @@ BEGIN {% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %} NEW.parent_place_id := null; - is_place_address := coalesce(not NEW.address ? 'street' and NEW.address ? 'place', FALSE); + is_place_address := not token_is_street_address(NEW.token_info); -- We have to find our parent road. NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id, @@ -1013,7 +1013,7 @@ BEGIN SELECT p.country_code, p.postcode, p.name FROM placex p WHERE p.place_id = NEW.parent_place_id INTO location; - IF is_place_address THEN + IF is_place_address and NEW.address ? 'place' THEN -- Check if the addr:place tag is part of the parent name SELECT count(*) INTO i FROM svals(location.name) AS pname WHERE pname = NEW.address->'place'; diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index 599d0eb0..04fcedcb 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -41,10 +41,17 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; +CREATE OR REPLACE FUNCTION token_is_street_address(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'street' is not null or info->>'place' is null; +$$ LANGUAGE SQL IMMUTABLE; + + CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) RETURNS BOOLEAN AS $$ - SELECT info->>'street' is not null; + SELECT info->>'street' is not null and info->>'street' != '{}'; $$ LANGUAGE SQL IMMUTABLE; diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index 5826f74a..3b82619f 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -41,6 +41,13 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; +CREATE OR REPLACE FUNCTION token_is_street_address(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'street' is not null or info->>'place' is null; +$$ LANGUAGE SQL IMMUTABLE; + + CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) RETURNS BOOLEAN AS $$ diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 79f383f6..b6e64637 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -720,7 +720,7 @@ class _TokenInfo: self.names: Optional[str] = None self.housenumbers: Set[str] = set() self.housenumber_tokens: Set[int] = set() - self.street_tokens: Set[int] = set() + self.street_tokens: Optional[Set[int]] = None self.place_tokens: Set[int] = set() self.address_tokens: Dict[str, str] = {} self.postcode: Optional[str] = None @@ -742,7 +742,7 @@ class _TokenInfo: out['hnr'] = ';'.join(self.housenumbers) out['hnr_tokens'] = self._mk_array(self.housenumber_tokens) - if self.street_tokens: + if self.street_tokens is not None: out['street'] = self._mk_array(self.street_tokens) if self.place_tokens: @@ -776,6 +776,8 @@ class _TokenInfo: def add_street(self, tokens: Iterable[int]) -> None: """ Add addr:street match terms. """ + if self.street_tokens is None: + self.street_tokens = set() self.street_tokens.update(tokens) diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index 7f0ffce1..2d9da69a 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -523,7 +523,7 @@ class TestPlaceAddress: def test_process_place_nonexisting_street(self): info = self.process_address(street='Grand Road') - assert 'street' not in info + assert info['street'] == '{}' def test_process_place_multiple_street_tags(self): @@ -538,7 +538,7 @@ class TestPlaceAddress: def test_process_place_street_empty(self): info = self.process_address(street='🜵') - assert 'street' not in info + assert info['street'] == '{}' def test_process_place_street_from_cache(self): From 6c5589c9d26c2ad21318798f0e46421fbf545a05 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 29 Jun 2023 22:02:55 +0200 Subject: [PATCH 2/3] fix optional string representation or repr(PlaceName) --- nominatim/data/place_name.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nominatim/data/place_name.py b/nominatim/data/place_name.py index 47464e2b..abba3544 100644 --- a/nominatim/data/place_name.py +++ b/nominatim/data/place_name.py @@ -34,7 +34,7 @@ class PlaceName: def __repr__(self) -> str: - return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')" + return f"PlaceName(name={self.name!r},kind={self.kind!r},suffix={self.suffix!r})" def clone(self, name: Optional[str] = None, From d7a3039c2a4bd26d05f08ee3140b8dfaecd68f02 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 30 Jun 2023 15:28:00 +0200 Subject: [PATCH 3/3] also switch legacy tokenizer to new street/place choice behaviour --- lib-sql/tokenizer/legacy_tokenizer.sql | 4 ++-- nominatim/tokenizer/legacy_tokenizer.py | 7 +++---- test/python/tokenizer/test_legacy.py | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index 3b82619f..8c8f56e1 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -44,14 +44,14 @@ $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_is_street_address(info JSONB) RETURNS BOOLEAN AS $$ - SELECT info->>'street' is not null or info->>'place' is null; + SELECT info->>'street' is not null or info->>'place_search' is null; $$ LANGUAGE SQL IMMUTABLE; CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) RETURNS BOOLEAN AS $$ - SELECT info->>'street' is not null; + SELECT info->>'street' is not null and info->>'street' != '{}'; $$ LANGUAGE SQL IMMUTABLE; diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index a50dedb2..e09700d9 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -564,14 +564,13 @@ class _TokenInfo: def add_street(self, conn: Connection, street: str) -> None: """ Add addr:street match terms. """ - def _get_street(name: str) -> List[int]: + def _get_street(name: str) -> Optional[str]: with conn.cursor() as cur: - return cast(List[int], + return cast(Optional[str], cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))) tokens = self.cache.streets.get(street, _get_street) - if tokens: - self.data['street'] = tokens + self.data['street'] = tokens or '{}' def add_place(self, conn: Connection, place: str) -> None: diff --git a/test/python/tokenizer/test_legacy.py b/test/python/tokenizer/test_legacy.py index 57a82b8a..d63ee8e1 100644 --- a/test/python/tokenizer/test_legacy.py +++ b/test/python/tokenizer/test_legacy.py @@ -549,7 +549,7 @@ class TestPlaceAddress: def test_process_place_street_empty(self): info = self.process_address(street='🜵') - assert 'street' not in info + assert info['street'] == '{}' def test_process_place_place(self):