From d643ca8dee1d1371b3128129f416fc2e99912b7b Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 5 Feb 2020 10:12:43 +0100 Subject: [PATCH 1/2] move address line computation in its own function --- sql/functions/placex_triggers.sql | 327 +++++++++++++++--------------- 1 file changed, 168 insertions(+), 159 deletions(-) diff --git a/sql/functions/placex_triggers.sql b/sql/functions/placex_triggers.sql index 3450c1f1..fb3f2dc5 100644 --- a/sql/functions/placex_triggers.sql +++ b/sql/functions/placex_triggers.sql @@ -252,11 +252,172 @@ END; $$ LANGUAGE plpgsql STABLE; + +-- Insert address of a place into the place_addressline table. +-- +-- \param obj_place_id Place_id of the place to compute the address for. +-- \param partition Partition number where the place is in. +-- \param maxrank Rank of the place. All address features must have +-- a search rank lower than the given rank. +-- \param address Address terms for the place. +-- \param geoemtry Geometry to which the address objects should be close. +-- +-- \retval parent_place_id Place_id of the address object that is the direct +-- ancestor. +-- \retval postcode Postcode computed from the address. This is the +-- addr:postcode of one of the address objects. If +-- more than one of has a postcode, the highest ranking +-- one is used. May be NULL. +-- \retval nameaddress_vector Search terms for the address. This is the sum +-- of name terms of all address objects. +CREATE OR REPLACE FUNCTION insert_addresslines(obj_place_id BIGINT, + partition SMALLINT, + maxrank SMALLINT, + address HSTORE, + geometry GEOMETRY, + OUT parent_place_id BIGINT, + OUT postcode TEXT, + OUT nameaddress_vector INT[]) + AS $$ +DECLARE + current_rank_address INTEGER := 0; + location_distance FLOAT := 0; + location_parent GEOMETRY := NULL; + parent_place_id_rank SMALLINT := 0; + + location_isaddress BOOLEAN; + + address_havelevel BOOLEAN[]; + location_keywords INT[]; + + location RECORD; + addr_item RECORD; + + isin_tokens INT[]; + isin TEXT[]; +BEGIN + parent_place_id := 0; + nameaddress_vector := '{}'::int[]; + isin_tokens := '{}'::int[]; + + ---- convert address store to array of tokenids + IF address IS NOT NULL THEN + FOR addr_item IN SELECT * FROM each(address) + LOOP + IF addr_item.key IN ('city', 'tiger:county', 'state', 'suburb', 'province', + 'district', 'region', 'county', 'municipality', + 'hamlet', 'village', 'subdistrict', 'town', + 'neighbourhood', 'quarter', 'parish') + THEN + isin_tokens := array_merge(isin_tokens, + word_ids_from_name(addr_item.value)); + IF NOT %REVERSE-ONLY% THEN + nameaddress_vector := array_merge(nameaddress_vector, + addr_ids_from_name(addr_item.value)); + END IF; + END IF; + END LOOP; + + IF address ? 'is_in' THEN + -- is_in items need splitting + isin := regexp_split_to_array(address->'is_in', E'[;,]'); + IF array_upper(isin, 1) IS NOT NULL THEN + FOR i IN 1..array_upper(isin, 1) LOOP + isin_tokens := array_merge(isin_tokens, + word_ids_from_name(isin[i])); + + -- merge word into address vector + IF NOT %REVERSE-ONLY% THEN + nameaddress_vector := array_merge(nameaddress_vector, + addr_ids_from_name(isin[i])); + END IF; + END LOOP; + END IF; + END IF; + END IF; + IF NOT %REVERSE-ONLY% THEN + nameaddress_vector := array_merge(nameaddress_vector, isin_tokens); + END IF; + + ---- now compute the address terms + FOR i IN 1..28 LOOP + address_havelevel[i] := false; + END LOOP; + + FOR location IN + SELECT * FROM getNearFeatures(partition, geometry, maxrank, isin_tokens) + LOOP + IF location.rank_address != current_rank_address THEN + current_rank_address := location.rank_address; + IF location.isguess THEN + location_distance := location.distance * 1.5; + ELSE + IF location.rank_address <= 12 THEN + -- for county and above, if we have an area consider that exact + -- (It would be nice to relax the constraint for places close to + -- the boundary but we'd need the exact geometry for that. Too + -- expensive.) + location_distance = 0; + ELSE + -- Below county level remain slightly fuzzy. + location_distance := location.distance * 0.5; + END IF; + END IF; + ELSE + CONTINUE WHEN location.keywords <@ location_keywords; + END IF; + + IF location.distance < location_distance OR NOT location.isguess THEN + location_keywords := location.keywords; + + location_isaddress := NOT address_havelevel[location.rank_address]; + IF location_isaddress AND location.isguess AND location_parent IS NOT NULL THEN + location_isaddress := ST_Contains(location_parent, location.centroid); + END IF; + + -- RAISE WARNING '% isaddress: %', location.place_id, location_isaddress; + -- Add it to the list of search terms + IF NOT %REVERSE-ONLY% THEN + nameaddress_vector := array_merge(nameaddress_vector, + location.keywords::integer[]); + END IF; + + INSERT INTO place_addressline (place_id, address_place_id, fromarea, + isaddress, distance, cached_rank_address) + VALUES (obj_place_id, location.place_id, true, + location_isaddress, location.distance, location.rank_address); + + IF location_isaddress THEN + -- add postcode if we have one + -- (If multiple postcodes are available, we end up with the highest ranking one.) + IF location.postcode is not null THEN + postcode = location.postcode; + END IF; + + address_havelevel[location.rank_address] := true; + IF NOT location.isguess THEN + SELECT placex.geometry FROM placex + WHERE obj_place_id = location.place_id INTO location_parent; + END IF; + + IF location.rank_address > parent_place_id_rank THEN + parent_place_id = location.place_id; + parent_place_id_rank = location.rank_address; + END IF; + END IF; + --DEBUG: RAISE WARNING ' Terms: (%) %',location, nameaddress_vector; + END IF; + + END LOOP; +END; +$$ +LANGUAGE plpgsql; + + CREATE OR REPLACE FUNCTION placex_insert() RETURNS TRIGGER AS $$ DECLARE - i INTEGER; postcode TEXT; result BOOLEAN; is_area BOOLEAN; @@ -428,33 +589,13 @@ CREATE OR REPLACE FUNCTION placex_update() RETURNS TRIGGER AS $$ DECLARE - search_maxdistance FLOAT[]; - search_mindistance FLOAT[]; - address_havelevel BOOLEAN[]; - i INTEGER; location RECORD; relation_members TEXT[]; - addr_item RECORD; - search_diameter FLOAT; - search_prevdiameter FLOAT; - search_maxrank INTEGER; - address_maxrank INTEGER; - address_street_word_ids INTEGER[]; - parent_place_id_rank BIGINT; addr_street TEXT; addr_place TEXT; - isin TEXT[]; - isin_tokens INT[]; - - location_rank_search INTEGER; - location_distance FLOAT; - location_parent GEOMETRY; - location_isaddress BOOLEAN; - location_keywords INTEGER[]; - name_vector INTEGER[]; nameaddress_vector INTEGER[]; @@ -711,13 +852,9 @@ BEGIN END IF; END IF; - -- What level are we searching from - search_maxrank := NEW.rank_search; - -- Initialise the name vector using our name NEW.name := add_default_place_name(NEW.country_code, NEW.name); name_vector := make_keywords(NEW.name); - nameaddress_vector := '{}'::int[]; -- make sure all names are in the word table IF NEW.admin_level = 2 @@ -728,142 +865,14 @@ BEGIN --DEBUG: RAISE WARNING 'Country names updated'; END IF; - FOR i IN 1..28 LOOP - address_havelevel[i] := false; - END LOOP; - - NEW.parent_place_id = 0; - parent_place_id_rank = 0; - - - -- convert address store to array of tokenids - --DEBUG: RAISE WARNING 'Starting address search'; - isin_tokens := '{}'::int[]; - IF NEW.address IS NOT NULL THEN - FOR addr_item IN SELECT * FROM each(NEW.address) - LOOP - IF addr_item.key IN ('city', 'tiger:county', 'state', 'suburb', 'province', - 'district', 'region', 'county', 'municipality', - 'hamlet', 'village', 'subdistrict', 'town', - 'neighbourhood', 'quarter', 'parish') - THEN - address_street_word_ids := word_ids_from_name(addr_item.value); - IF address_street_word_ids is not null THEN - isin_tokens := array_merge(isin_tokens, address_street_word_ids); - END IF; - IF NOT %REVERSE-ONLY% THEN - address_street_word_ids := addr_ids_from_name(addr_item.value); - IF address_street_word_ids is not null THEN - nameaddress_vector := array_merge(nameaddress_vector, - address_street_word_ids); - END IF; - END IF; - END IF; - IF addr_item.key = 'is_in' THEN - -- is_in items need splitting - isin := regexp_split_to_array(addr_item.value, E'[;,]'); - IF array_upper(isin, 1) IS NOT NULL THEN - FOR i IN 1..array_upper(isin, 1) LOOP - address_street_word_ids := word_ids_from_name(isin[i]); - IF address_street_word_ids is not null THEN - isin_tokens := array_merge(isin_tokens, address_street_word_ids); - END IF; - - -- merge word into address vector - IF NOT %REVERSE-ONLY% THEN - address_street_word_ids := addr_ids_from_name(isin[i]); - IF address_street_word_ids is not null THEN - nameaddress_vector := array_merge(nameaddress_vector, - address_street_word_ids); - END IF; - END IF; - END LOOP; - END IF; - END IF; - END LOOP; - END IF; - IF NOT %REVERSE-ONLY% THEN - nameaddress_vector := array_merge(nameaddress_vector, isin_tokens); - END IF; - --- RAISE WARNING 'ISIN: %', isin_tokens; - - -- Process area matches - location_rank_search := 0; - location_distance := 0; - location_parent := NULL; - -- added ourself as address already - address_havelevel[NEW.rank_address] := true; - --DEBUG: RAISE WARNING ' getNearFeatures(%,''%'',%,''%'')',NEW.partition, NEW.centroid, search_maxrank, isin_tokens; - FOR location IN - SELECT * from getNearFeatures(NEW.partition, - CASE WHEN NEW.rank_search >= 26 + SELECT * FROM insert_addresslines(NEW.place_id, NEW.partition, + NEW.rank_search, NEW.address, + CASE WHEN NEW.rank_search >= 26 AND NEW.rank_search < 30 - THEN NEW.geometry - ELSE NEW.centroid END, - search_maxrank, isin_tokens) - LOOP - IF location.rank_address != location_rank_search THEN - location_rank_search := location.rank_address; - IF location.isguess THEN - location_distance := location.distance * 1.5; - ELSE - IF location.rank_address <= 12 THEN - -- for county and above, if we have an area consider that exact - -- (It would be nice to relax the constraint for places close to - -- the boundary but we'd need the exact geometry for that. Too - -- expensive.) - location_distance = 0; - ELSE - -- Below county level remain slightly fuzzy. - location_distance := location.distance * 0.5; - END IF; - END IF; - ELSE - CONTINUE WHEN location.keywords <@ location_keywords; - END IF; + THEN NEW.geometry ELSE NEW.centroid END) + INTO NEW.parent_place_id, NEW.postcode, nameaddress_vector; - IF location.distance < location_distance OR NOT location.isguess THEN - location_keywords := location.keywords; - - location_isaddress := NOT address_havelevel[location.rank_address]; - IF location_isaddress AND location.isguess AND location_parent IS NOT NULL THEN - location_isaddress := ST_Contains(location_parent,location.centroid); - END IF; - - -- RAISE WARNING '% isaddress: %', location.place_id, location_isaddress; - -- Add it to the list of search terms - IF NOT %REVERSE-ONLY% THEN - nameaddress_vector := array_merge(nameaddress_vector, location.keywords::integer[]); - END IF; - INSERT INTO place_addressline (place_id, address_place_id, fromarea, isaddress, distance, cached_rank_address) - VALUES (NEW.place_id, location.place_id, true, location_isaddress, location.distance, location.rank_address); - - IF location_isaddress THEN - -- add postcode if we have one - -- (If multiple postcodes are available, we end up with the highest ranking one.) - IF location.postcode is not null THEN - NEW.postcode = location.postcode; - END IF; - - address_havelevel[location.rank_address] := true; - IF NOT location.isguess THEN - SELECT geometry FROM placex WHERE place_id = location.place_id INTO location_parent; - END IF; - - IF location.rank_address > parent_place_id_rank THEN - NEW.parent_place_id = location.place_id; - parent_place_id_rank = location.rank_address; - END IF; - - END IF; - - --DEBUG: RAISE WARNING ' Terms: (%) %',location, nameaddress_vector; - - END IF; - - END LOOP; - --DEBUG: RAISE WARNING 'address computed'; + --DEBUG: RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector; IF NEW.address is not null AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN From 5220a92be4c7691fb8c1dfc11ac9cb03ce68933e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 22 Feb 2020 16:46:03 +0100 Subject: [PATCH 2/2] adapt API tests --- test/bdd/api/search/queries.feature | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/bdd/api/search/queries.feature b/test/bdd/api/search/queries.feature index 6650e969..2969844c 100644 --- a/test/bdd/api/search/queries.feature +++ b/test/bdd/api/search/queries.feature @@ -27,6 +27,7 @@ Feature: Search queries | suburb | Eilbek | | postcode | 22089 | | city_district | Wandsbek | + | city | Hamburg | | country | Deutschland | | country_code | de | @@ -42,6 +43,7 @@ Feature: Search queries | suburb | Eilbek | | postcode | 22089 | | city_district | Wandsbek | + | city | Hamburg | | country | Deutschland | | country_code | de |