From 44cfce1ca4caf9fb716ff7c37682427d9bd3d31e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 6 Dec 2021 11:38:38 +0100 Subject: [PATCH 1/5] revert to using full names for street name matching Using partial names turned out to not work well because there are often similarly named streets next to each other. It also prevents us from being able to take into account all addr:street:* tags. This change gets all the full term tokens for the addr:street tags from the DB. As they are used for matching only, we can assume that the term must already be there or there will be no match. This avoid creating unused full name tags. --- nominatim/tokenizer/icu_tokenizer.py | 37 +++++++++++++++++++++++++--- test/python/tokenizer/test_icu.py | 28 ++++++++++++++++++++- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index ea6e5d3c..f8f6af2e 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -409,13 +409,16 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): hnrs = [] addr_terms = [] + streets = [] for item in address: if item.kind == 'postcode': self._add_postcode(item.name) elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'): hnrs.append(item.name) elif item.kind == 'street': - token_info.add_street(self._compute_partial_tokens(item.name)) + token = self._retrieve_full_token(item.name) + if token: + streets.append(token) elif item.kind == 'place': token_info.add_place(self._compute_partial_tokens(item.name)) elif not item.kind.startswith('_') and \ @@ -429,6 +432,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): if addr_terms: token_info.add_address_terms(addr_terms) + if streets: + token_info.add_street(streets) + def _compute_partial_tokens(self, name): """ Normalize the given term, split it into partial words and return @@ -458,6 +464,31 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return tokens + def _retrieve_full_token(self, name): + """ Get the full name token for the given name, if it exists. + The name is only retrived for the standard analyser. + """ + norm_name = self._normalized(name) + + # return cached if possible + if norm_name in self._cache.fulls: + return self._cache.fulls[norm_name] + + # otherwise compute + full, _ = self._cache.names.get(norm_name, (None, None)) + + if full is None: + with self.conn.cursor() as cur: + cur.execute("SELECT word_id FROM word WHERE word = %s and type = 'W' LIMIT 1", + (norm_name, )) + if cur.rowcount > 0: + full = cur.fetchone()[0] + + self._cache.fulls[norm_name] = full + + return full + + def _compute_name_tokens(self, names): """ Computes the full name and partial name tokens for the given dictionary of names. @@ -561,8 +592,7 @@ class _TokenInfo: def add_street(self, tokens): """ Add addr:street match terms. """ - if tokens: - self.data['street'] = self._mk_array(tokens) + self.data['street'] = self._mk_array(tokens) def add_place(self, tokens): @@ -591,6 +621,7 @@ class _TokenCache: def __init__(self): self.names = {} self.partials = {} + self.fulls = {} self.postcodes = set() self.housenumbers = {} diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index 642aaceb..22112220 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -471,9 +471,25 @@ class TestPlaceAddress: def test_process_place_street(self): + self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road'}})) info = self.process_address(street='Grand Road') - assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD') + assert eval(info['street']) == self.name_token_set('#Grand Road') + + + def test_process_place_nonexisting_street(self): + info = self.process_address(street='Grand Road') + + assert 'street' not in info + + + def test_process_place_multiple_street_tags(self): + self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road', + 'ref': '05989'}})) + info = self.process_address(**{'street': 'Grand Road', + 'street:sym_ul': '05989'}) + + assert eval(info['street']) == self.name_token_set('#Grand Road', '#05989') def test_process_place_street_empty(self): @@ -482,6 +498,16 @@ class TestPlaceAddress: assert 'street' not in info + def test_process_place_street_from_cache(self): + self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road'}})) + self.process_address(street='Grand Road') + + # request address again + info = self.process_address(street='Grand Road') + + assert eval(info['street']) == self.name_token_set('#Grand Road') + + def test_process_place_place(self): info = self.process_address(place='Honu Lulu') From 5e435b41baea139bf64a385f334f85c721503893 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 6 Dec 2021 14:26:08 +0100 Subject: [PATCH 2/5] ICU: matching any street name will do again --- lib-sql/tokenizer/icu_tokenizer.sql | 2 +- test/bdd/.behaverc | 2 +- test/bdd/db/import/parenting.feature | 24 ++++++++++++++++++++++++ test/bdd/environment.py | 6 ++++++ 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index 6092319a..547facce 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -51,7 +51,7 @@ $$ LANGUAGE SQL IMMUTABLE; CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN AS $$ - SELECT (info->>'street')::INTEGER[] <@ street_tokens + SELECT (info->>'street')::INTEGER[] && street_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; diff --git a/test/bdd/.behaverc b/test/bdd/.behaverc index 32aa6dfa..1b426ec9 100644 --- a/test/bdd/.behaverc +++ b/test/bdd/.behaverc @@ -1,3 +1,3 @@ [behave] show_skipped=False -tags=~@Fail +default_tags=~@Fail diff --git a/test/bdd/db/import/parenting.feature b/test/bdd/db/import/parenting.feature index b5210f94..f6d88ca8 100644 --- a/test/bdd/db/import/parenting.feature +++ b/test/bdd/db/import/parenting.feature @@ -87,6 +87,30 @@ Feature: Parenting of objects | N3 | W2 | | N4 | W1 | + @fail-legacy + Scenario: addr:street tag parents to appropriately named street, locale names + Given the scene roads-with-pois + And the places + | osm | class | type | street| addr+street:de | geometry | + | N1 | place | house | south | Süd | :p-N1 | + | N2 | place | house | north | Nord | :p-N2 | + | N3 | place | house | south | Süd | :p-S1 | + | N4 | place | house | north | Nord | :p-S2 | + And the places + | osm | class | type | name | geometry | + | W1 | highway | residential | Nord | :w-north | + | W2 | highway | residential | Süd | :w-south | + And the places + | osm | class | type | name | name+name:old | + | N5 | place | hamlet | south | north | + When importing + Then placex contains + | object | parent_place_id | + | N1 | W2 | + | N2 | W1 | + | N3 | W2 | + | N4 | W1 | + Scenario: addr:street tag parents to next named street Given the scene roads-with-pois And the places diff --git a/test/bdd/environment.py b/test/bdd/environment.py index f179c8f1..2c277165 100644 --- a/test/bdd/environment.py +++ b/test/bdd/environment.py @@ -49,3 +49,9 @@ def before_scenario(context, scenario): def after_scenario(context, scenario): if 'DB' in context.tags: context.nominatim.teardown_db(context) + + +def before_tag(context, tag): + if tag == 'fail-legacy': + if context.config.userdata['TOKENIZER'] in (None, 'legacy'): + context.scenario.skip("Not implemented in legacy tokenizer") From 7f7d2fd5b35fa31f1d53cd0e319705a4600c3f29 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 6 Dec 2021 14:46:40 +0100 Subject: [PATCH 3/5] skip most addr: tags with suffixes Only one addr: tag can be processed currently, so make sure it is the one without suffixes to not get odd data. addr:street is the exception because it uses a different matching mechanism. --- nominatim/tokenizer/icu_tokenizer.py | 5 +++-- test/python/tokenizer/test_icu.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index f8f6af2e..33f05cc4 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -420,8 +420,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): if token: streets.append(token) elif item.kind == 'place': - token_info.add_place(self._compute_partial_tokens(item.name)) - elif not item.kind.startswith('_') and \ + if not item.suffix: + token_info.add_place(self._compute_partial_tokens(item.name)) + elif not item.kind.startswith('_') and not item.suffix and \ item.kind not in ('country', 'full'): addr_terms.append((item.kind, self._compute_partial_tokens(item.name))) diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index 22112220..83668b39 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -514,6 +514,12 @@ class TestPlaceAddress: assert eval(info['place']) == self.name_token_set('HONU', 'LULU') + def test_process_place_place_extra(self): + info = self.process_address(**{'place:en': 'Honu Lulu'}) + + assert 'place' not in info + + def test_process_place_place_empty(self): info = self.process_address(place='🜵') @@ -533,6 +539,14 @@ class TestPlaceAddress: assert result == {'city': city, 'suburb': city, 'state': state} + def test_process_place_multiple_address_terms(self): + info = self.process_address(**{'city': 'Bruxelles', 'city:de': 'Brüssel'}) + + result = {k: eval(v) for k,v in info['addr'].items()} + + assert result == {'city': self.name_token_set('Bruxelles')} + + def test_process_place_address_terms_empty(self): info = self.process_address(country='de', city=' ', street='Hauptstr', full='right behind the church') From 5e792078b3ed580f723a47182325405d54cae822 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 6 Dec 2021 15:17:00 +0100 Subject: [PATCH 4/5] remove some odd varaints of addr:street from the styles Some import has added names in partial tags which confuse the street name matching. --- settings/import-address.style | 5 +++-- settings/import-admin.style | 5 +++-- settings/import-extratags.style | 5 +++-- settings/import-full.style | 5 +++-- settings/import-street.style | 5 +++-- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/settings/import-address.style b/settings/import-address.style index d872e289..c69f2c76 100644 --- a/settings/import-address.style +++ b/settings/import-address.style @@ -11,8 +11,9 @@ } }, { - "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*", - "name:etymology", "name:signed", "name:botanical", "*wikidata"], + "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*", + "name:etymology", "name:signed", "name:botanical", "*:wikidata", + "addr:street:name", "addr:street:type"], "values" : { "" : "skip" } diff --git a/settings/import-admin.style b/settings/import-admin.style index 4e41763f..92484cb0 100644 --- a/settings/import-admin.style +++ b/settings/import-admin.style @@ -5,8 +5,9 @@ } }, { - "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*", - "name:etymology", "name:signed", "name:botanical", "*wikidata"], + "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*", + "name:etymology", "name:signed", "name:botanical", "*:wikidata", + "addr:street:name", "addr:street:type"], "values" : { "" : "skip" } diff --git a/settings/import-extratags.style b/settings/import-extratags.style index ce32ae1e..52945bba 100644 --- a/settings/import-extratags.style +++ b/settings/import-extratags.style @@ -6,8 +6,9 @@ } }, { - "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*", - "name:etymology", "name:signed", "name:botanical", "wikidata", "*:wikidata"], + "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*", + "name:etymology", "name:signed", "name:botanical", "wikidata", "*:wikidata", + "addr:street:name", "addr:street:type"], "values" : { "" : "extra" } diff --git a/settings/import-full.style b/settings/import-full.style index 2e2f25f0..ce82ff83 100644 --- a/settings/import-full.style +++ b/settings/import-full.style @@ -6,8 +6,9 @@ } }, { - "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*", - "name:etymology", "name:signed", "name:botanical", "wikidata", "*:wikidata"], + "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*", + "name:etymology", "name:signed", "name:botanical", "wikidata", "*:wikidata", + "addr:street:name", "addr:street:type"], "values" : { "" : "extra" } diff --git a/settings/import-street.style b/settings/import-street.style index 77cf4790..f1bc61ee 100644 --- a/settings/import-street.style +++ b/settings/import-street.style @@ -5,8 +5,9 @@ } }, { - "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*", - "name:etymology", "name:signed", "name:botanical", "*wikidata"], + "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*", + "name:etymology", "name:signed", "name:botanical", "*:wikidata", + "addr:street:name", "addr:street:type"], "values" : { "" : "skip" } From f9b56a8581be3663239d23ee1194891dde9a3857 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 8 Dec 2021 21:58:43 +0100 Subject: [PATCH 5/5] correctly match abbreviated addr:street This only works when addr:street is abbreviated and the street name isn't. It does not work the other way around. --- nominatim/tokenizer/icu_tokenizer.py | 21 +++++++-------------- test/bdd/db/import/parenting.feature | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 33f05cc4..90caec1c 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -416,9 +416,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'): hnrs.append(item.name) elif item.kind == 'street': - token = self._retrieve_full_token(item.name) - if token: - streets.append(token) + streets.extend(self._retrieve_full_tokens(item.name)) elif item.kind == 'place': if not item.suffix: token_info.add_place(self._compute_partial_tokens(item.name)) @@ -465,25 +463,20 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return tokens - def _retrieve_full_token(self, name): + def _retrieve_full_tokens(self, name): """ Get the full name token for the given name, if it exists. The name is only retrived for the standard analyser. """ - norm_name = self._normalized(name) + norm_name = self._search_normalized(name) # return cached if possible if norm_name in self._cache.fulls: return self._cache.fulls[norm_name] - # otherwise compute - full, _ = self._cache.names.get(norm_name, (None, None)) - - if full is None: - with self.conn.cursor() as cur: - cur.execute("SELECT word_id FROM word WHERE word = %s and type = 'W' LIMIT 1", - (norm_name, )) - if cur.rowcount > 0: - full = cur.fetchone()[0] + with self.conn.cursor() as cur: + cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'", + (norm_name, )) + full = [row[0] for row in cur] self._cache.fulls[norm_name] = full diff --git a/test/bdd/db/import/parenting.feature b/test/bdd/db/import/parenting.feature index f6d88ca8..ef25b6cc 100644 --- a/test/bdd/db/import/parenting.feature +++ b/test/bdd/db/import/parenting.feature @@ -111,6 +111,28 @@ Feature: Parenting of objects | N3 | W2 | | N4 | W1 | + Scenario: addr:street tag parents to appropriately named street with abbreviation + Given the scene roads-with-pois + And the places + | osm | class | type | street| geometry | + | N1 | place | house | south st | :p-N1 | + | N2 | place | house | north st | :p-N2 | + | N3 | place | house | south st | :p-S1 | + | N4 | place | house | north st | :p-S2 | + And the places + | osm | class | type | name+name:en | geometry | + | W1 | highway | residential | north street | :w-north | + | W2 | highway | residential | south street | :w-south | + When importing + Then placex contains + | object | parent_place_id | + | N1 | W2 | + | N2 | W1 | + | N3 | W2 | + | N4 | W1 | + + + Scenario: addr:street tag parents to next named street Given the scene roads-with-pois And the places