From 4f4d15c28a8743c2f3dfb6d3e5b787b94ef66fc5 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Sun, 23 May 2021 23:58:58 +0200
Subject: [PATCH] reorganize keyword creation for legacy tokenizer

- only save partial words without internal spaces
- consider comma and semicolon a separator of full words
- consider parts before an opening bracket a full word
  (but not the part after the bracket)

Fixes #244.
---
 lib-sql/tokenizer/legacy_tokenizer.sql      | 42 +++++++++------------
 nominatim/tokenizer/legacy_icu_tokenizer.py | 22 ++++++++++-
 test/bdd/db/import/search_name.feature      | 25 +++++++++++-
 test/python/test_tokenizer_legacy_icu.py    | 25 +++++++++++-
 4 files changed, 85 insertions(+), 29 deletions(-)

diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql
index fe82762e..a2c6b520 100644
--- a/lib-sql/tokenizer/legacy_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer.sql
@@ -287,26 +287,21 @@ DECLARE
   s TEXT;
   w INTEGER;
   words TEXT[];
-  item RECORD;
+  value TEXT;
   j INTEGER;
 BEGIN
   result := '{}'::INTEGER[];
 
-  FOR item IN SELECT (each(src)).* LOOP
-
-    s := make_standard_name(item.value);
-    w := getorcreate_name_id(s, item.value);
+  FOR value IN SELECT unnest(regexp_split_to_array(svals(src), E'[,;]')) LOOP
+    -- full name
+    s := make_standard_name(value);
+    w := getorcreate_name_id(s, value);
 
     IF not(ARRAY[w] <@ result) THEN
       result := result || w;
     END IF;
 
-    w := getorcreate_word_id(s);
-
-    IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-      result := result || w;
-    END IF;
-
+    -- partial single-word terms
     words := string_to_array(s, ' ');
     IF array_upper(words, 1) IS NOT NULL THEN
       FOR j IN 1..array_upper(words, 1) LOOP
@@ -319,24 +314,23 @@ BEGIN
       END LOOP;
     END IF;
 
-    words := regexp_split_to_array(item.value, E'[,;()]');
-    IF array_upper(words, 1) != 1 THEN
-      FOR j IN 1..array_upper(words, 1) LOOP
-        s := make_standard_name(words[j]);
-        IF s != '' THEN
-          w := getorcreate_word_id(s);
-          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-            result := result || w;
-          END IF;
+    -- consider parts before an opening braket a full word as well
+    words := regexp_split_to_array(value, E'[(]');
+    IF array_upper(words, 1) > 1 THEN
+      s := make_standard_name(words[1]);
+      IF s != '' THEN
+        w := getorcreate_name_id(s, words[1]);
+        IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+          result := result || w;
         END IF;
-      END LOOP;
+      END IF;
     END IF;
 
-    s := regexp_replace(item.value, '市$', '');
-    IF s != item.value THEN
+    s := regexp_replace(value, '市$', '');
+    IF s != value THEN
       s := make_standard_name(s);
       IF s != '' THEN
-        w := getorcreate_name_id(s, item.value);
+        w := getorcreate_name_id(s, value);
         IF NOT (ARRAY[w] <@ result) THEN
           result := result || w;
         END IF;
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 156e99ec..b4d85356 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -423,8 +423,7 @@ class LegacyICUNameAnalyzer:
 
         names = place.get('name')
         if names:
-            full_names = set((self.make_standard_word(name) for name in names.values()))
-            full_names.discard('')
+            full_names = self._compute_full_names(names)
 
             token_info.add_names(self.conn, full_names)
 
@@ -461,6 +460,25 @@ class LegacyICUNameAnalyzer:
 
         return token_info.data
 
+    def _compute_full_names(self, names):
+        """ Return the set of all full name word ids to be used with the
+            given dictionary of names.
+        """
+        full_names = set()
+        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
+            word = self.make_standard_word(name)
+            if word:
+                full_names.add(word)
+
+            brace_split = name.split('(', 2)
+            if len(brace_split) > 1:
+                word = self.make_standard_word(brace_split[0])
+                if word:
+                    full_names.add(word)
+
+        return full_names
+
+
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
diff --git a/test/bdd/db/import/search_name.feature b/test/bdd/db/import/search_name.feature
index fd207059..9e899053 100644
--- a/test/bdd/db/import/search_name.feature
+++ b/test/bdd/db/import/search_name.feature
@@ -2,6 +2,29 @@ Feature: Creation of search terms
     Tests that search_name table is filled correctly
 
+    Scenario Outline: Comma- and semicolon separated names appear as full names
+        Given the places
+         | osm | class | type | name+alt_name |
+         | N1  | place | city | New York<sep>Big Apple |
+        When importing
+        Then search_name contains
+         | object | name_vector |
+         | N1     | #New York, #Big Apple |
+
+    Examples:
+        | sep |
+        | ,   |
+        | ;   |
+
+    Scenario Outline: Name parts before brackets appear as full names
+        Given the places
+         | osm | class | type | name+name |
+         | N1  | place | city | Halle (Saale) |
+        When importing
+        Then search_name contains
+         | object | name_vector |
+         | N1     | #Halle Saale, #Halle |
+
     Scenario: Unnamed POIs have no search entry
         Given the scene roads-with-pois
         And the places
@@ -49,7 +72,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | nameaddress_vector |
-         | N1     | Rose Street, Little, Big, Town |
+         | N1     | #Rose Street, rose, Little, Big, Town |
        When searching for "23 Rose Street, Little Big Town"
        Then results contain
         | osm_type | osm_id | name |
diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py
index 798fea37..ebce7218 100644
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -223,11 +223,32 @@ def test_update_special_phrase_modify(analyzer, word_table):
 
 
 def test_process_place_names(analyzer, getorcreate_term_id):
-
     with analyzer() as anl:
         info = anl.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
 
-    assert info['names'] == '{1,2,3,4,5,6}'
+    assert info['names'] == '{1,2,3,4,5}'
+
+
+@pytest.mark.parametrize('sep', [',' , ';'])
+def test_full_names_with_separator(analyzer, getorcreate_term_id, sep):
+    with analyzer() as anl:
+        full_names = \
+            anl._compute_full_names({'name' : sep.join(('New York', 'Big Apple'))})
+
+    expect = set((anl.make_standard_word(w) for w in ('New York', 'Big Apple')))
+
+    assert full_names == expect
+
+
+def test_process_place_names_with_bracket(analyzer, getorcreate_term_id):
+    with analyzer() as anl:
+        info = anl.process_place({'name' :
+                                  {'name' : 'Houseboat (left)'}})
+
+    expect = set((anl.make_standard_word(w) for w in
+                  (' houseboat', ' houseboat left', 'houseboat', 'left')))
+
+    assert eval(info['names']) == expect
 
 
 @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])