diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index 6092319a..547facce 100644
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -51,7 +51,7 @@ $$ LANGUAGE SQL IMMUTABLE;
 CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
   RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[] <@ street_tokens
+  SELECT (info->>'street')::INTEGER[] && street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index ea6e5d3c..90caec1c 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -409,16 +409,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         hnrs = []
         addr_terms = []
+        streets = []
         for item in address:
             if item.kind == 'postcode':
                 self._add_postcode(item.name)
             elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                 hnrs.append(item.name)
             elif item.kind == 'street':
-                token_info.add_street(self._compute_partial_tokens(item.name))
+                streets.extend(self._retrieve_full_tokens(item.name))
             elif item.kind == 'place':
-                token_info.add_place(self._compute_partial_tokens(item.name))
-            elif not item.kind.startswith('_') and \
+                if not item.suffix:
+                    token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and not item.suffix and \
                  item.kind not in ('country', 'full'):
                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
@@ -429,6 +431,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         if addr_terms:
             token_info.add_address_terms(addr_terms)
 
+        if streets:
+            token_info.add_street(streets)
+
 
     def _compute_partial_tokens(self, name):
         """ Normalize the given term, split it into partial words and return
@@ -458,6 +463,26 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return tokens
 
 
+    def _retrieve_full_tokens(self, name):
+        """ Get the full name token for the given name, if it exists.
+            The name is only retrieved for the standard analyser.
+        """
+        norm_name = self._search_normalized(name)
+
+        # return cached if possible
+        if norm_name in self._cache.fulls:
+            return self._cache.fulls[norm_name]
+
+        with self.conn.cursor() as cur:
+            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
+                        (norm_name, ))
+            full = [row[0] for row in cur]
+
+        self._cache.fulls[norm_name] = full
+
+        return full
+
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
             dictionary of names.
@@ -561,8 +586,7 @@ class _TokenInfo:
     def add_street(self, tokens):
         """ Add addr:street match terms.
         """
-        if tokens:
-            self.data['street'] = self._mk_array(tokens)
+        self.data['street'] = self._mk_array(tokens)
 
 
     def add_place(self, tokens):
@@ -591,6 +615,7 @@ class _TokenCache:
     def __init__(self):
         self.names = {}
         self.partials = {}
+        self.fulls = {}
         self.postcodes = set()
         self.housenumbers = {}
 
diff --git a/settings/import-address.style b/settings/import-address.style
index d872e289..c69f2c76 100644
--- a/settings/import-address.style
+++ b/settings/import-address.style
@@ -11,8 +11,9 @@
     }
 },
 {
-    "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*",
-              "name:etymology", "name:signed", "name:botanical", "*wikidata"],
+    "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*",
+              "name:etymology", "name:signed", "name:botanical", "*:wikidata",
+              "addr:street:name", "addr:street:type"],
     "values" : {
         "" : "skip"
     }
diff --git a/settings/import-admin.style b/settings/import-admin.style
index 4e41763f..92484cb0 100644
--- a/settings/import-admin.style
+++ b/settings/import-admin.style
@@ -5,8 +5,9 @@
     }
 },
 {
-    "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*",
-              "name:etymology", "name:signed", "name:botanical", "*wikidata"],
+    "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*",
+              "name:etymology", "name:signed", "name:botanical", "*:wikidata",
+              "addr:street:name", "addr:street:type"],
     "values" : {
         "" : "skip"
     }
diff --git a/settings/import-extratags.style b/settings/import-extratags.style
index ce32ae1e..52945bba 100644
--- a/settings/import-extratags.style
+++ b/settings/import-extratags.style
@@ -6,8 +6,9 @@
     }
 },
 {
-    "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*",
-              "name:etymology", "name:signed", "name:botanical", "wikidata", "*:wikidata"],
+    "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*",
+              "name:etymology", "name:signed", "name:botanical", "wikidata", "*:wikidata",
+              "addr:street:name", "addr:street:type"],
     "values" : {
         "" : "extra"
     }
diff --git a/settings/import-full.style b/settings/import-full.style
index 2e2f25f0..ce82ff83 100644
--- a/settings/import-full.style
+++ b/settings/import-full.style
@@ -6,8 +6,9 @@
     }
 },
 {
-    "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*",
-              "name:etymology", "name:signed", "name:botanical", "wikidata", "*:wikidata"],
+    "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*",
+              "name:etymology", "name:signed", "name:botanical", "wikidata", "*:wikidata",
+              "addr:street:name", "addr:street:type"],
     "values" : {
         "" : "extra"
     }
diff --git a/settings/import-street.style b/settings/import-street.style
index 77cf4790..f1bc61ee 100644
--- a/settings/import-street.style
+++ b/settings/import-street.style
@@ -5,8 +5,9 @@
     }
 },
 {
-    "keys" : ["name:prefix", "name:suffix", "name:prefix:*", "name:suffix:*",
-              "name:etymology", "name:signed", "name:botanical", "*wikidata"],
+    "keys" : ["*:prefix", "*:suffix", "name:prefix:*", "name:suffix:*",
+              "name:etymology", "name:signed", "name:botanical", "*:wikidata",
+              "addr:street:name", "addr:street:type"],
     "values" : {
         "" : "skip"
     }
diff --git a/test/bdd/.behaverc b/test/bdd/.behaverc
index 32aa6dfa..1b426ec9 100644
--- a/test/bdd/.behaverc
+++ b/test/bdd/.behaverc
@@ -1,3 +1,3 @@
 [behave]
 show_skipped=False
-tags=~@Fail
+default_tags=~@Fail
diff --git a/test/bdd/db/import/parenting.feature b/test/bdd/db/import/parenting.feature
index b5210f94..ef25b6cc 100644
--- a/test/bdd/db/import/parenting.feature
+++ b/test/bdd/db/import/parenting.feature
@@ -87,6 +87,52 @@ Feature: Parenting of objects
          | N3     | W2 |
          | N4     | W1 |
 
+    @fail-legacy
+    Scenario: addr:street tag parents to appropriately named street, locale names
+        Given the scene roads-with-pois
+        And the places
+         | osm | class | type  | street | addr+street:de | geometry |
+         | N1  | place | house | south  | Süd            | :p-N1 |
+         | N2  | place | house | north  | Nord           | :p-N2 |
+         | N3  | place | house | south  | Süd            | :p-S1 |
+         | N4  | place | house | north  | Nord           | :p-S2 |
+        And the places
+         | osm | class   | type        | name | geometry |
+         | W1  | highway | residential | Nord | :w-north |
+         | W2  | highway | residential | Süd  | :w-south |
+        And the places
+         | osm | class | type   | name  | name+name:old |
+         | N5  | place | hamlet | south | north         |
+        When importing
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | W2 |
+         | N2     | W1 |
+         | N3     | W2 |
+         | N4     | W1 |
+
+    Scenario: addr:street tag parents to appropriately named street with abbreviation
+        Given the scene roads-with-pois
+        And the places
+         | osm | class | type  | street   | geometry |
+         | N1  | place | house | south st | :p-N1 |
+         | N2  | place | house | north st | :p-N2 |
+         | N3  | place | house | south st | :p-S1 |
+         | N4  | place | house | north st | :p-S2 |
+        And the places
+         | osm | class   | type        | name+name:en | geometry |
+         | W1  | highway | residential | north street | :w-north |
+         | W2  | highway | residential | south street | :w-south |
+        When importing
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | W2 |
+         | N2     | W1 |
+         | N3     | W2 |
+         | N4     | W1 |
+
+
+
     Scenario: addr:street tag parents to next named street
         Given the scene roads-with-pois
         And the places
diff --git a/test/bdd/environment.py b/test/bdd/environment.py
index f179c8f1..2c277165 100644
--- a/test/bdd/environment.py
+++ b/test/bdd/environment.py
@@ -49,3 +49,9 @@ def before_scenario(context, scenario):
 def after_scenario(context, scenario):
     if 'DB' in context.tags:
         context.nominatim.teardown_db(context)
+
+
+def before_tag(context, tag):
+    if tag == 'fail-legacy':
+        if context.config.userdata['TOKENIZER'] in (None, 'legacy'):
+            context.scenario.skip("Not implemented in legacy tokenizer")
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index 642aaceb..83668b39 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -471,9 +471,25 @@ class TestPlaceAddress:
 
 
     def test_process_place_street(self):
+        self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road'}}))
         info = self.process_address(street='Grand Road')
 
-        assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD')
+        assert eval(info['street']) == self.name_token_set('#Grand Road')
+
+
+    def test_process_place_nonexisting_street(self):
+        info = self.process_address(street='Grand Road')
+
+        assert 'street' not in info
+
+
+    def test_process_place_multiple_street_tags(self):
+        self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road',
+                                                        'ref': '05989'}}))
+        info = self.process_address(**{'street': 'Grand Road',
+                                       'street:sym_ul': '05989'})
+
+        assert eval(info['street']) == self.name_token_set('#Grand Road', '#05989')
 
 
     def test_process_place_street_empty(self):
@@ -482,12 +498,28 @@ class TestPlaceAddress:
         assert 'street' not in info
 
 
+    def test_process_place_street_from_cache(self):
+        self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road'}}))
+        self.process_address(street='Grand Road')
+
+        # request address again
+        info = self.process_address(street='Grand Road')
+
+        assert eval(info['street']) == self.name_token_set('#Grand Road')
+
+
     def test_process_place_place(self):
         info = self.process_address(place='Honu Lulu')
 
         assert eval(info['place']) == self.name_token_set('HONU', 'LULU')
 
 
+    def test_process_place_place_extra(self):
+        info = self.process_address(**{'place:en': 'Honu Lulu'})
+
+        assert 'place' not in info
+
+
     def test_process_place_place_empty(self):
         info = self.process_address(place='🜵')
         assert 'place' not in info
@@ -507,6 +539,14 @@ class TestPlaceAddress:
         assert result == {'city': city, 'suburb': city, 'state': state}
 
 
+    def test_process_place_multiple_address_terms(self):
+        info = self.process_address(**{'city': 'Bruxelles', 'city:de': 'Brüssel'})
+
+        result = {k: eval(v) for k,v in info['addr'].items()}
+
+        assert result == {'city': self.name_token_set('Bruxelles')}
+
+
     def test_process_place_address_terms_empty(self):
         info = self.process_address(country='de', city=' ', street='Hauptstr',
                                     full='right behind the church')
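
Note (illustration, not part of the patch): the behavioural change on the SQL side is that token_matches_street() switches from the containment operator <@ to the overlap operator &&. As I read the Python side of the diff, info->'street' now holds full-name tokens (word type 'W') for each addr:street-style tag, so matching any one of those full names is enough. A minimal sketch of the difference between the two PostgreSQL array operators, using plain array literals and made-up column aliases rather than any Nominatim tables:

    -- illustration only: containment vs. overlap on integer arrays
    SELECT ARRAY[1, 2, 3] <@ ARRAY[2, 3] AS is_contained,  -- false: 1 is not in {2,3}
           ARRAY[1, 2, 3] && ARRAY[2, 3] AS has_overlap;   -- true: 2 and 3 are shared

With <@, every token on the left had to be present in the street's token set; with && a single shared token suffices.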