mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-29 08:36:24 +03:00
b88b952f56
Rename function to reflect that it is only used for precomputation. The token IDs are not really needed, so don't bother to compute the array of tokens.
526 lines
15 KiB
PL/PgSQL
526 lines
15 KiB
PL/PgSQL
-- Functions for term normalisation and access to the 'word' table.
|
|
|
|
CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
|
|
AS '{{ modulepath }}/nominatim.so', 'transliteration'
|
|
LANGUAGE c IMMUTABLE STRICT;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
|
|
AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
|
|
LANGUAGE c IMMUTABLE STRICT;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
|
|
AS $$
|
|
DECLARE
|
|
o TEXT;
|
|
BEGIN
|
|
o := public.gettokenstring(public.transliteration(name));
|
|
RETURN trim(substr(o,1,length(o)));
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql IMMUTABLE;
|
|
|
|
-- returns NULL if the word is too common
|
|
CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
|
|
RETURNS INTEGER
|
|
AS $$
|
|
DECLARE
|
|
lookup_token TEXT;
|
|
return_word_id INTEGER;
|
|
count INTEGER;
|
|
BEGIN
|
|
lookup_token := trim(lookup_word);
|
|
SELECT min(word_id), max(search_name_count) FROM word
|
|
WHERE word_token = lookup_token and class is null and type is null
|
|
INTO return_word_id, count;
|
|
IF return_word_id IS NULL THEN
|
|
return_word_id := nextval('seq_word');
|
|
INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
|
|
ELSE
|
|
IF count > get_maxwordfreq() THEN
|
|
return_word_id := NULL;
|
|
END IF;
|
|
END IF;
|
|
RETURN return_word_id;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
-- Create housenumber tokens from an OSM addr:housenumber.
|
|
-- The housnumber is split at comma and semicolon as necessary.
|
|
-- The function returns the normalized form of the housenumber suitable
|
|
-- for comparison.
|
|
CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
|
|
RETURNS TEXT
|
|
AS $$
|
|
DECLARE
|
|
normtext TEXT;
|
|
BEGIN
|
|
SELECT array_to_string(array_agg(trans), ';')
|
|
INTO normtext
|
|
FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
|
|
FROM (SELECT make_standard_name(h) as lookup_word
|
|
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
|
|
|
|
return normtext;
|
|
END;
|
|
$$ LANGUAGE plpgsql STABLE STRICT;
|
|
|
|
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
|
|
RETURNS INTEGER
|
|
AS $$
|
|
DECLARE
|
|
lookup_token TEXT;
|
|
return_word_id INTEGER;
|
|
BEGIN
|
|
lookup_token := ' ' || trim(lookup_word);
|
|
SELECT min(word_id) FROM word
|
|
WHERE word_token = lookup_token and class='place' and type='house'
|
|
INTO return_word_id;
|
|
IF return_word_id IS NULL THEN
|
|
return_word_id := nextval('seq_word');
|
|
INSERT INTO word VALUES (return_word_id, lookup_token, null,
|
|
'place', 'house', null, 0);
|
|
END IF;
|
|
RETURN return_word_id;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
|
|
RETURNS INTEGER
|
|
AS $$
|
|
DECLARE
|
|
lookup_token TEXT;
|
|
lookup_word TEXT;
|
|
return_word_id INTEGER;
|
|
BEGIN
|
|
lookup_word := upper(trim(postcode));
|
|
lookup_token := ' ' || make_standard_name(lookup_word);
|
|
SELECT min(word_id) FROM word
|
|
WHERE word_token = lookup_token and word = lookup_word
|
|
and class='place' and type='postcode'
|
|
INTO return_word_id;
|
|
IF return_word_id IS NULL THEN
|
|
return_word_id := nextval('seq_word');
|
|
INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
|
|
'place', 'postcode', null, 0);
|
|
END IF;
|
|
RETURN return_word_id;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
|
|
lookup_country_code varchar(2))
|
|
RETURNS INTEGER
|
|
AS $$
|
|
DECLARE
|
|
lookup_token TEXT;
|
|
return_word_id INTEGER;
|
|
BEGIN
|
|
lookup_token := ' '||trim(lookup_word);
|
|
SELECT min(word_id) FROM word
|
|
WHERE word_token = lookup_token and country_code=lookup_country_code
|
|
INTO return_word_id;
|
|
IF return_word_id IS NULL THEN
|
|
return_word_id := nextval('seq_word');
|
|
INSERT INTO word VALUES (return_word_id, lookup_token, null,
|
|
null, null, lookup_country_code, 0);
|
|
END IF;
|
|
RETURN return_word_id;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
|
|
lookup_class text, lookup_type text)
|
|
RETURNS INTEGER
|
|
AS $$
|
|
DECLARE
|
|
lookup_token TEXT;
|
|
return_word_id INTEGER;
|
|
BEGIN
|
|
lookup_token := ' '||trim(lookup_word);
|
|
SELECT min(word_id) FROM word
|
|
WHERE word_token = lookup_token and word = normalized_word
|
|
and class = lookup_class and type = lookup_type
|
|
INTO return_word_id;
|
|
IF return_word_id IS NULL THEN
|
|
return_word_id := nextval('seq_word');
|
|
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
|
|
lookup_class, lookup_type, null, 0);
|
|
END IF;
|
|
RETURN return_word_id;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT,
|
|
normalized_word TEXT,
|
|
lookup_class text,
|
|
lookup_type text,
|
|
op text)
|
|
RETURNS INTEGER
|
|
AS $$
|
|
DECLARE
|
|
lookup_token TEXT;
|
|
return_word_id INTEGER;
|
|
BEGIN
|
|
lookup_token := ' '||trim(lookup_word);
|
|
SELECT min(word_id) FROM word
|
|
WHERE word_token = lookup_token and word = normalized_word
|
|
and class = lookup_class and type = lookup_type and operator = op
|
|
INTO return_word_id;
|
|
IF return_word_id IS NULL THEN
|
|
return_word_id := nextval('seq_word');
|
|
INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
|
|
lookup_class, lookup_type, null, 0, op);
|
|
END IF;
|
|
RETURN return_word_id;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
|
|
RETURNS INTEGER
|
|
AS $$
|
|
DECLARE
|
|
lookup_token TEXT;
|
|
nospace_lookup_token TEXT;
|
|
return_word_id INTEGER;
|
|
BEGIN
|
|
lookup_token := ' '||trim(lookup_word);
|
|
SELECT min(word_id) FROM word
|
|
WHERE word_token = lookup_token and class is null and type is null
|
|
INTO return_word_id;
|
|
IF return_word_id IS NULL THEN
|
|
return_word_id := nextval('seq_word');
|
|
INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
|
|
null, null, null, 0);
|
|
END IF;
|
|
RETURN return_word_id;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT)
|
|
RETURNS INTEGER
|
|
AS $$
|
|
DECLARE
|
|
BEGIN
|
|
RETURN getorcreate_name_id(lookup_word, '');
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
-- Normalize a string and lookup its word ids (partial words).
|
|
CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
|
|
RETURNS INTEGER[]
|
|
AS $$
|
|
DECLARE
|
|
words TEXT[];
|
|
id INTEGER;
|
|
return_word_id INTEGER[];
|
|
word_ids INTEGER[];
|
|
j INTEGER;
|
|
BEGIN
|
|
words := string_to_array(make_standard_name(lookup_word), ' ');
|
|
IF array_upper(words, 1) IS NOT NULL THEN
|
|
FOR j IN 1..array_upper(words, 1) LOOP
|
|
IF (words[j] != '') THEN
|
|
SELECT array_agg(word_id) INTO word_ids
|
|
FROM word
|
|
WHERE word_token = words[j] and class is null and type is null;
|
|
|
|
IF word_ids IS NULL THEN
|
|
id := nextval('seq_word');
|
|
INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
|
|
return_word_id := return_word_id || id;
|
|
ELSE
|
|
return_word_id := array_merge(return_word_id, word_ids);
|
|
END IF;
|
|
END IF;
|
|
END LOOP;
|
|
END IF;
|
|
|
|
RETURN return_word_id;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
-- Normalize a string and look up its name ids (full words).
|
|
CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
|
|
RETURNS INTEGER[]
|
|
AS $$
|
|
DECLARE
|
|
lookup_token TEXT;
|
|
return_word_ids INTEGER[];
|
|
BEGIN
|
|
lookup_token := ' '|| make_standard_name(lookup_word);
|
|
SELECT array_agg(word_id) FROM word
|
|
WHERE word_token = lookup_token and class is null and type is null
|
|
INTO return_word_ids;
|
|
RETURN return_word_ids;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql STABLE STRICT;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
|
|
RETURNS VOID
|
|
AS $$
|
|
DECLARE
|
|
s TEXT;
|
|
w INTEGER;
|
|
words TEXT[];
|
|
item RECORD;
|
|
j INTEGER;
|
|
BEGIN
|
|
FOR item IN SELECT (each(src)).* LOOP
|
|
|
|
s := make_standard_name(item.value);
|
|
w := getorcreate_country(s, country_code);
|
|
|
|
words := regexp_split_to_array(item.value, E'[,;()]');
|
|
IF array_upper(words, 1) != 1 THEN
|
|
FOR j IN 1..array_upper(words, 1) LOOP
|
|
s := make_standard_name(words[j]);
|
|
IF s != '' THEN
|
|
w := getorcreate_country(s, country_code);
|
|
END IF;
|
|
END LOOP;
|
|
END IF;
|
|
END LOOP;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
|
|
RETURNS INTEGER[]
|
|
AS $$
|
|
DECLARE
|
|
result INTEGER[];
|
|
s TEXT;
|
|
w INTEGER;
|
|
words TEXT[];
|
|
item RECORD;
|
|
j INTEGER;
|
|
BEGIN
|
|
result := '{}'::INTEGER[];
|
|
|
|
FOR item IN SELECT (each(src)).* LOOP
|
|
|
|
s := make_standard_name(item.value);
|
|
w := getorcreate_name_id(s, item.value);
|
|
|
|
IF not(ARRAY[w] <@ result) THEN
|
|
result := result || w;
|
|
END IF;
|
|
|
|
w := getorcreate_word_id(s);
|
|
|
|
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
|
result := result || w;
|
|
END IF;
|
|
|
|
words := string_to_array(s, ' ');
|
|
IF array_upper(words, 1) IS NOT NULL THEN
|
|
FOR j IN 1..array_upper(words, 1) LOOP
|
|
IF (words[j] != '') THEN
|
|
w = getorcreate_word_id(words[j]);
|
|
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
|
result := result || w;
|
|
END IF;
|
|
END IF;
|
|
END LOOP;
|
|
END IF;
|
|
|
|
words := regexp_split_to_array(item.value, E'[,;()]');
|
|
IF array_upper(words, 1) != 1 THEN
|
|
FOR j IN 1..array_upper(words, 1) LOOP
|
|
s := make_standard_name(words[j]);
|
|
IF s != '' THEN
|
|
w := getorcreate_word_id(s);
|
|
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
|
result := result || w;
|
|
END IF;
|
|
END IF;
|
|
END LOOP;
|
|
END IF;
|
|
|
|
s := regexp_replace(item.value, '市$', '');
|
|
IF s != item.value THEN
|
|
s := make_standard_name(s);
|
|
IF s != '' THEN
|
|
w := getorcreate_name_id(s, item.value);
|
|
IF NOT (ARRAY[w] <@ result) THEN
|
|
result := result || w;
|
|
END IF;
|
|
END IF;
|
|
END IF;
|
|
|
|
END LOOP;
|
|
|
|
RETURN result;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
|
|
RETURNS INTEGER
|
|
AS $$
|
|
DECLARE
|
|
s TEXT;
|
|
w INTEGER;
|
|
words TEXT[];
|
|
i INTEGER;
|
|
j INTEGER;
|
|
BEGIN
|
|
s := make_standard_name(src);
|
|
w := getorcreate_name_id(s, src);
|
|
|
|
w := getorcreate_word_id(s);
|
|
|
|
words := string_to_array(s, ' ');
|
|
IF array_upper(words, 1) IS NOT NULL THEN
|
|
FOR j IN 1..array_upper(words, 1) LOOP
|
|
IF (words[j] != '') THEN
|
|
w := getorcreate_word_id(words[j]);
|
|
END IF;
|
|
END LOOP;
|
|
END IF;
|
|
|
|
words := regexp_split_to_array(src, E'[,;()]');
|
|
IF array_upper(words, 1) != 1 THEN
|
|
FOR j IN 1..array_upper(words, 1) LOOP
|
|
s := make_standard_name(words[j]);
|
|
IF s != '' THEN
|
|
w := getorcreate_word_id(s);
|
|
END IF;
|
|
END LOOP;
|
|
END IF;
|
|
|
|
s := regexp_replace(src, '市$', '');
|
|
IF s != src THEN
|
|
s := make_standard_name(s);
|
|
IF s != '' THEN
|
|
w := getorcreate_name_id(s, src);
|
|
END IF;
|
|
END IF;
|
|
|
|
RETURN 1;
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
|
|
in_partition SMALLINT,
|
|
parent_place_id BIGINT,
|
|
address HSTORE,
|
|
country TEXT,
|
|
housenumber TEXT,
|
|
initial_name_vector INTEGER[],
|
|
geometry GEOMETRY,
|
|
OUT name_vector INTEGER[],
|
|
OUT nameaddress_vector INTEGER[])
|
|
AS $$
|
|
DECLARE
|
|
parent_name_vector INTEGER[];
|
|
parent_address_vector INTEGER[];
|
|
addr_place_ids INTEGER[];
|
|
|
|
addr_item RECORD;
|
|
parent_address_place_ids BIGINT[];
|
|
filtered_address HSTORE;
|
|
BEGIN
|
|
nameaddress_vector := '{}'::INTEGER[];
|
|
|
|
SELECT s.name_vector, s.nameaddress_vector
|
|
INTO parent_name_vector, parent_address_vector
|
|
FROM search_name s
|
|
WHERE s.place_id = parent_place_id;
|
|
|
|
-- Find all address tags that don't appear in the parent search names.
|
|
SELECT hstore(array_agg(ARRAY[k, v])) INTO filtered_address
|
|
FROM (SELECT skeys(address) as k, svals(address) as v) a
|
|
WHERE not addr_ids_from_name(v) && parent_address_vector
|
|
AND k not in ('country', 'street', 'place', 'postcode',
|
|
'housenumber', 'streetnumber', 'conscriptionnumber');
|
|
|
|
-- Compute all search terms from the addr: tags.
|
|
IF filtered_address IS NOT NULL THEN
|
|
FOR addr_item IN
|
|
SELECT * FROM
|
|
get_places_for_addr_tags(in_partition, geometry, filtered_address, country)
|
|
LOOP
|
|
IF addr_item.place_id is null THEN
|
|
nameaddress_vector := array_merge(nameaddress_vector,
|
|
addr_item.keywords);
|
|
CONTINUE;
|
|
END IF;
|
|
|
|
IF parent_address_place_ids is null THEN
|
|
SELECT array_agg(parent_place_id) INTO parent_address_place_ids
|
|
FROM place_addressline
|
|
WHERE place_id = parent_place_id;
|
|
END IF;
|
|
|
|
IF not parent_address_place_ids @> ARRAY[addr_item.place_id] THEN
|
|
nameaddress_vector := array_merge(nameaddress_vector,
|
|
addr_item.keywords);
|
|
|
|
INSERT INTO place_addressline (place_id, address_place_id, fromarea,
|
|
isaddress, distance, cached_rank_address)
|
|
VALUES (obj_place_id, addr_item.place_id, not addr_item.isguess,
|
|
true, addr_item.distance, addr_item.rank_address);
|
|
END IF;
|
|
END LOOP;
|
|
END IF;
|
|
|
|
name_vector := initial_name_vector;
|
|
|
|
-- Check if the parent covers all address terms.
|
|
-- If not, create a search name entry with the house number as the name.
|
|
-- This is unusual for the search_name table but prevents that the place
|
|
-- is returned when we only search for the street/place.
|
|
|
|
IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
|
|
name_vector := array_merge(name_vector,
|
|
ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
|
|
END IF;
|
|
|
|
IF not address ? 'street' and address ? 'place' THEN
|
|
addr_place_ids := addr_ids_from_name(address->'place');
|
|
IF not addr_place_ids <@ parent_name_vector THEN
|
|
-- make sure addr:place terms are always searchable
|
|
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
|
|
-- If there is a housenumber, also add the place name as a name,
|
|
-- so we can search it by the usual housenumber+place algorithms.
|
|
IF housenumber is not null THEN
|
|
name_vector := array_merge(name_vector,
|
|
ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
|
|
END IF;
|
|
END IF;
|
|
END IF;
|
|
|
|
-- Cheating here by not recomputing all terms but simply using the ones
|
|
-- from the parent object.
|
|
nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
|
|
nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
|
|
|
|
END;
|
|
$$
|
|
LANGUAGE plpgsql;
|