mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-13 00:36:42 +03:00
reorganize keyword creation for legacy tokenizer
- only save partial words without internal spaces
- consider comma and semicolon a separator of full words
- consider parts before an opening bracket a full word (but not the part after the bracket)

Fixes #244.
This commit is contained in:
parent
fa3e48c59f
commit
4f4d15c28a
@ -287,26 +287,21 @@ DECLARE
|
|||||||
s TEXT;
|
s TEXT;
|
||||||
w INTEGER;
|
w INTEGER;
|
||||||
words TEXT[];
|
words TEXT[];
|
||||||
item RECORD;
|
value TEXT;
|
||||||
j INTEGER;
|
j INTEGER;
|
||||||
BEGIN
|
BEGIN
|
||||||
result := '{}'::INTEGER[];
|
result := '{}'::INTEGER[];
|
||||||
|
|
||||||
FOR item IN SELECT (each(src)).* LOOP
|
FOR value IN SELECT unnest(regexp_split_to_array(svals(src), E'[,;]')) LOOP
|
||||||
|
-- full name
|
||||||
s := make_standard_name(item.value);
|
s := make_standard_name(value);
|
||||||
w := getorcreate_name_id(s, item.value);
|
w := getorcreate_name_id(s, value);
|
||||||
|
|
||||||
IF not(ARRAY[w] <@ result) THEN
|
IF not(ARRAY[w] <@ result) THEN
|
||||||
result := result || w;
|
result := result || w;
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
w := getorcreate_word_id(s);
|
-- partial single-word terms
|
||||||
|
|
||||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
|
||||||
result := result || w;
|
|
||||||
END IF;
|
|
||||||
|
|
||||||
words := string_to_array(s, ' ');
|
words := string_to_array(s, ' ');
|
||||||
IF array_upper(words, 1) IS NOT NULL THEN
|
IF array_upper(words, 1) IS NOT NULL THEN
|
||||||
FOR j IN 1..array_upper(words, 1) LOOP
|
FOR j IN 1..array_upper(words, 1) LOOP
|
||||||
@ -319,24 +314,23 @@ BEGIN
|
|||||||
END LOOP;
|
END LOOP;
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
words := regexp_split_to_array(item.value, E'[,;()]');
|
-- consider parts before an opening bracket a full word as well
|
||||||
IF array_upper(words, 1) != 1 THEN
|
words := regexp_split_to_array(value, E'[(]');
|
||||||
FOR j IN 1..array_upper(words, 1) LOOP
|
IF array_upper(words, 1) > 1 THEN
|
||||||
s := make_standard_name(words[j]);
|
s := make_standard_name(words[1]);
|
||||||
IF s != '' THEN
|
IF s != '' THEN
|
||||||
w := getorcreate_word_id(s);
|
w := getorcreate_name_id(s, words[1]);
|
||||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||||
result := result || w;
|
result := result || w;
|
||||||
END IF;
|
|
||||||
END IF;
|
END IF;
|
||||||
END LOOP;
|
END IF;
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
s := regexp_replace(item.value, '市$', '');
|
s := regexp_replace(value, '市$', '');
|
||||||
IF s != item.value THEN
|
IF s != value THEN
|
||||||
s := make_standard_name(s);
|
s := make_standard_name(s);
|
||||||
IF s != '' THEN
|
IF s != '' THEN
|
||||||
w := getorcreate_name_id(s, item.value);
|
w := getorcreate_name_id(s, value);
|
||||||
IF NOT (ARRAY[w] <@ result) THEN
|
IF NOT (ARRAY[w] <@ result) THEN
|
||||||
result := result || w;
|
result := result || w;
|
||||||
END IF;
|
END IF;
|
||||||
|
@ -423,8 +423,7 @@ class LegacyICUNameAnalyzer:
|
|||||||
names = place.get('name')
|
names = place.get('name')
|
||||||
|
|
||||||
if names:
|
if names:
|
||||||
full_names = set((self.make_standard_word(name) for name in names.values()))
|
full_names = self._compute_full_names(names)
|
||||||
full_names.discard('')
|
|
||||||
|
|
||||||
token_info.add_names(self.conn, full_names)
|
token_info.add_names(self.conn, full_names)
|
||||||
|
|
||||||
@ -461,6 +460,25 @@ class LegacyICUNameAnalyzer:
|
|||||||
return token_info.data
|
return token_info.data
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_full_names(self, names):
|
||||||
|
""" Return the set of all full name word ids to be used with the
|
||||||
|
given dictionary of names.
|
||||||
|
"""
|
||||||
|
full_names = set()
|
||||||
|
for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
|
||||||
|
word = self.make_standard_word(name)
|
||||||
|
if word:
|
||||||
|
full_names.add(word)
|
||||||
|
|
||||||
|
brace_split = name.split('(', 2)
|
||||||
|
if len(brace_split) > 1:
|
||||||
|
word = self.make_standard_word(brace_split[0])
|
||||||
|
if word:
|
||||||
|
full_names.add(word)
|
||||||
|
|
||||||
|
return full_names
|
||||||
|
|
||||||
|
|
||||||
def _add_postcode(self, postcode):
|
def _add_postcode(self, postcode):
|
||||||
""" Make sure the normalized postcode is present in the word table.
|
""" Make sure the normalized postcode is present in the word table.
|
||||||
"""
|
"""
|
||||||
|
@ -2,6 +2,29 @@
|
|||||||
Feature: Creation of search terms
|
Feature: Creation of search terms
|
||||||
Tests that search_name table is filled correctly
|
Tests that search_name table is filled correctly
|
||||||
|
|
||||||
|
Scenario Outline: Comma- and semicolon separated names appear as full names
|
||||||
|
Given the places
|
||||||
|
| osm | class | type | name+alt_name |
|
||||||
|
| N1 | place | city | New York<sep>Big Apple |
|
||||||
|
When importing
|
||||||
|
Then search_name contains
|
||||||
|
| object | name_vector |
|
||||||
|
| N1 | #New York, #Big Apple |
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
| sep |
|
||||||
|
| , |
|
||||||
|
| ; |
|
||||||
|
|
||||||
|
Scenario Outline: Name parts before brackets appear as full names
|
||||||
|
Given the places
|
||||||
|
| osm | class | type | name+name |
|
||||||
|
| N1 | place | city | Halle (Saale) |
|
||||||
|
When importing
|
||||||
|
Then search_name contains
|
||||||
|
| object | name_vector |
|
||||||
|
| N1 | #Halle Saale, #Halle |
|
||||||
|
|
||||||
Scenario: Unnamed POIs have no search entry
|
Scenario: Unnamed POIs have no search entry
|
||||||
Given the scene roads-with-pois
|
Given the scene roads-with-pois
|
||||||
And the places
|
And the places
|
||||||
@ -49,7 +72,7 @@ Feature: Creation of search terms
|
|||||||
When importing
|
When importing
|
||||||
Then search_name contains
|
Then search_name contains
|
||||||
| object | nameaddress_vector |
|
| object | nameaddress_vector |
|
||||||
| N1 | Rose Street, Little, Big, Town |
|
| N1 | #Rose Street, rose, Little, Big, Town |
|
||||||
When searching for "23 Rose Street, Little Big Town"
|
When searching for "23 Rose Street, Little Big Town"
|
||||||
Then results contain
|
Then results contain
|
||||||
| osm_type | osm_id | name |
|
| osm_type | osm_id | name |
|
||||||
|
@ -223,11 +223,32 @@ def test_update_special_phrase_modify(analyzer, word_table):
|
|||||||
|
|
||||||
|
|
||||||
def test_process_place_names(analyzer, getorcreate_term_id):
|
def test_process_place_names(analyzer, getorcreate_term_id):
|
||||||
|
|
||||||
with analyzer() as anl:
|
with analyzer() as anl:
|
||||||
info = anl.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
|
info = anl.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
|
||||||
|
|
||||||
assert info['names'] == '{1,2,3,4,5,6}'
|
assert info['names'] == '{1,2,3,4,5}'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('sep', [',' , ';'])
|
||||||
|
def test_full_names_with_separator(analyzer, getorcreate_term_id, sep):
|
||||||
|
with analyzer() as anl:
|
||||||
|
full_names =
|
||||||
|
anl._compute_full_names({'name' : sep.join(('New York', 'Big Apple'))})
|
||||||
|
|
||||||
|
expect = set((anl.make_standard_word(w) for w in ('New York', 'Big Apple')))
|
||||||
|
|
||||||
|
assert full_names == expect
|
||||||
|
|
||||||
|
|
||||||
|
def test_process_place_names_with_bracket(analyzer, getorcreate_term_id):
|
||||||
|
with analyzer() as anl:
|
||||||
|
info = anl.process_place({'name' :
|
||||||
|
{'name' : 'Houseboat (left)'}})
|
||||||
|
|
||||||
|
expect = set((anl.make_standard_word(w) for w in
|
||||||
|
(' houseboat', ' houseboat left', 'houseboat', 'left')))
|
||||||
|
|
||||||
|
assert eval(info['names']) == expect
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
|
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
|
||||||
|
Loading…
Reference in New Issue
Block a user