Nominatim/sql/functions/normalization.sql
Sarah Hoffmann 7324431b12 get additional addresses for rank 30 objects
get_addressdata() now also checks if the place itself has entries
in the place_addressline table and merges them into the results.

Also restrict checking for address tag places to cases where the
name cannot be found in the parent's address search terms. Looking
up all address tags is just too slow.
2020-11-16 15:28:01 +01:00

536 lines
15 KiB
PL/PgSQL

-- Functions for term normalisation and access to the 'word' table.
CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
AS '{modulepath}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
AS '{modulepath}/nominatim.so', 'gettokenstring'
LANGUAGE c IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
AS $$
DECLARE
o TEXT;
BEGIN
o := public.gettokenstring(public.transliteration(name));
RETURN trim(substr(o,1,length(o)));
END;
$$
LANGUAGE plpgsql IMMUTABLE;
-- returns NULL if the word is too common
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
count INTEGER;
BEGIN
lookup_token := trim(lookup_word);
SELECT min(word_id), max(search_name_count) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id, count;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
ELSE
IF count > get_maxwordfreq() THEN
return_word_id := NULL;
END IF;
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' ' || trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class='place' and type='house'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null,
'place', 'house', null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
lookup_word TEXT;
return_word_id INTEGER;
BEGIN
lookup_word := upper(trim(postcode));
lookup_token := ' ' || make_standard_name(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = lookup_word
and class='place' and type='postcode'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
'place', 'postcode', null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
lookup_country_code varchar(2))
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and country_code=lookup_country_code
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null,
null, null, lookup_country_code, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
lookup_class text, lookup_type text)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = normalized_word
and class = lookup_class and type = lookup_type
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
lookup_class, lookup_type, null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT,
normalized_word TEXT,
lookup_class text,
lookup_type text,
op text)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = normalized_word
and class = lookup_class and type = lookup_type and operator = op
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
lookup_class, lookup_type, null, 0, op);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
lookup_token TEXT;
nospace_lookup_token TEXT;
return_word_id INTEGER;
BEGIN
lookup_token := ' '||trim(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
null, null, null, 0);
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
DECLARE
BEGIN
RETURN getorcreate_name_id(lookup_word, '');
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and lookup its word ids (partial words).
CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
lookup_token TEXT;
id INTEGER;
return_word_id INTEGER[];
BEGIN
lookup_token := make_standard_name(lookup_word);
SELECT array_agg(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_id;
IF return_word_id IS NULL THEN
id := nextval('seq_word');
INSERT INTO word VALUES (id, lookup_token, null, null, null, null, 0);
return_word_id = ARRAY[id];
END IF;
RETURN return_word_id;
END;
$$
LANGUAGE plpgsql;
-- Normalize a string and look up its name ids (full words).
CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
lookup_token TEXT;
return_word_ids INTEGER[];
BEGIN
lookup_token := ' '|| make_standard_name(lookup_word);
SELECT array_agg(word_id) FROM word
WHERE word_token = lookup_token and class is null and type is null
INTO return_word_ids;
RETURN return_word_ids;
END;
$$
LANGUAGE plpgsql STABLE STRICT;
CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
RETURNS VOID
AS $$
DECLARE
s TEXT;
w INTEGER;
words TEXT[];
item RECORD;
j INTEGER;
BEGIN
FOR item IN SELECT (each(src)).* LOOP
s := make_standard_name(item.value);
w := getorcreate_country(s, country_code);
words := regexp_split_to_array(item.value, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_country(s, country_code);
END IF;
END LOOP;
END IF;
END LOOP;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
RETURNS INTEGER[]
AS $$
DECLARE
result INTEGER[];
s TEXT;
w INTEGER;
words TEXT[];
item RECORD;
j INTEGER;
BEGIN
result := '{}'::INTEGER[];
FOR item IN SELECT (each(src)).* LOOP
s := make_standard_name(item.value);
w := getorcreate_name_id(s, item.value);
IF not(ARRAY[w] <@ result) THEN
result := result || w;
END IF;
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w = getorcreate_word_id(words[j]);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(item.value, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
s := regexp_replace(item.value, '市$', '');
IF s != item.value THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, item.value);
IF NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END IF;
END LOOP;
RETURN result;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION make_keywords(src TEXT)
RETURNS INTEGER[]
AS $$
DECLARE
result INTEGER[];
s TEXT;
w INTEGER;
words TEXT[];
i INTEGER;
j INTEGER;
BEGIN
result := '{}'::INTEGER[];
s := make_standard_name(src);
w := getorcreate_name_id(s, src);
IF NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
words := string_to_array(s, ' ');
IF array_upper(words, 1) IS NOT NULL THEN
FOR j IN 1..array_upper(words, 1) LOOP
IF (words[j] != '') THEN
w = getorcreate_word_id(words[j]);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
words := regexp_split_to_array(src, E'[,;()]');
IF array_upper(words, 1) != 1 THEN
FOR j IN 1..array_upper(words, 1) LOOP
s := make_standard_name(words[j]);
IF s != '' THEN
w := getorcreate_word_id(s);
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END LOOP;
END IF;
s := regexp_replace(src, '市$', '');
IF s != src THEN
s := make_standard_name(s);
IF s != '' THEN
w := getorcreate_name_id(s, src);
IF NOT (ARRAY[w] <@ result) THEN
result := result || w;
END IF;
END IF;
END IF;
RETURN result;
END;
$$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
in_partition SMALLINT,
parent_place_id BIGINT,
address HSTORE,
country TEXT,
housenumber TEXT,
initial_name_vector INTEGER[],
geometry GEOMETRY,
OUT name_vector INTEGER[],
OUT nameaddress_vector INTEGER[])
AS $$
DECLARE
parent_name_vector INTEGER[];
parent_address_vector INTEGER[];
addr_place_ids INTEGER[];
addr_item RECORD;
parent_address_place_ids BIGINT[];
filtered_address HSTORE;
BEGIN
nameaddress_vector := '{}'::INTEGER[];
SELECT s.name_vector, s.nameaddress_vector
INTO parent_name_vector, parent_address_vector
FROM search_name s
WHERE s.place_id = parent_place_id;
-- Find all address tags that don't appear in the parent search names.
SELECT hstore(array_agg(ARRAY[k, v])) INTO filtered_address
FROM (SELECT skeys(address) as k, svals(address) as v) a
WHERE not addr_ids_from_name(v) && parent_address_vector
AND k not in ('country', 'street', 'place', 'postcode',
'housenumber', 'streetnumber', 'consriptionnumber');
-- Compute all search terms from the addr: tags.
IF filtered_address IS NOT NULL THEN
FOR addr_item IN
SELECT * FROM
get_places_for_addr_tags(in_partition, geometry, filtered_address, country)
LOOP
IF addr_item.place_id is null THEN
nameaddress_vector := array_merge(nameaddress_vector,
addr_item.keywords);
CONTINUE;
END IF;
IF parent_address_place_ids is null THEN
SELECT array_agg(parent_place_id) INTO parent_address_place_ids
FROM place_addressline
WHERE place_id = parent_place_id;
END IF;
IF not parent_address_place_ids @> ARRAY[addr_item.place_id] THEN
nameaddress_vector := array_merge(nameaddress_vector,
addr_item.keywords);
INSERT INTO place_addressline (place_id, address_place_id, fromarea,
isaddress, distance, cached_rank_address)
VALUES (obj_place_id, addr_item.place_id, not addr_item.isguess,
true, addr_item.distance, addr_item.rank_address);
END IF;
END LOOP;
END IF;
-- If the POI is named, simply mix in all address terms and be done.
IF array_length(initial_name_vector, 1) is not NULL THEN
-- Cheating here by not recomputing all terms but simply using the ones
-- from the parent object.
name_vector := initial_name_vector;
nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
IF not address ? 'street' and address ? 'place' THEN
-- make sure addr:place terms are always searchable
nameaddress_vector := array_merge(nameaddress_vector,
addr_ids_from_name(address->'place'));
END IF;
RETURN;
END IF;
----- unnamed POIS
IF (array_length(nameaddress_vector, 1) is null
and (address ? 'street'or not address ? 'place'))
or housenumber is null
THEN
RETURN;
END IF;
-- Check if the parent covers all address terms.
-- If not, create a search name entry with the house number as the name.
-- This is unusual for the search_name table but prevents that the place
-- is returned when we only search for the street/place.
IF not nameaddress_vector <@ parent_address_vector THEN
name_vector := ARRAY[getorcreate_name_id(housenumber)];
END IF;
IF not address ? 'street' and address ? 'place' THEN
addr_place_ids := addr_ids_from_name(address->'place');
IF not addr_place_ids <@ parent_name_vector THEN
-- addr:place tag exists without a corresponding place. Mix in addr:place
-- in the address and drop the name from the parent. This would only be
-- the street name of the nearest street.
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
name_vector := ARRAY[getorcreate_name_id(housenumber)];
END IF;
ELSE
nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
END IF;
-- The address vector always gets merged in.
nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
END;
$$
LANGUAGE plpgsql;