Merge pull request #1024 from lonvia/reduce-address-search-terms

Reduce address search terms
This commit is contained in:
Sarah Hoffmann 2018-04-26 22:28:53 +02:00 committed by GitHub
commit 080ba00956
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 91 additions and 114 deletions

View File

@ -1234,6 +1234,7 @@ DECLARE
relation_members TEXT[];
relMember RECORD;
linkedplacex RECORD;
addr_item RECORD;
search_diameter FLOAT;
search_prevdiameter FLOAT;
search_maxrank INTEGER;
@ -1766,44 +1767,43 @@ BEGIN
parent_place_id_rank = 0;
-- convert isin to array of tokenids
-- convert address store to array of tokenids
--DEBUG: RAISE WARNING 'Starting address search';
isin_tokens := '{}'::int[];
IF NEW.address IS NOT NULL THEN
isin := avals(NEW.address);
IF array_upper(isin, 1) IS NOT NULL THEN
FOR i IN 1..array_upper(isin, 1) LOOP
-- TODO further split terms with comma and semicolon
address_street_word_id := get_name_id(make_standard_name(isin[i]));
FOR addr_item IN SELECT * FROM each(NEW.address)
LOOP
IF addr_item.key IN ('city', 'tiger:county', 'state', 'suburb', 'province', 'district', 'region', 'county', 'municipality', 'hamlet', 'village', 'subdistrict', 'town', 'neighbourhood', 'quarter', 'parish') THEN
address_street_word_id := get_name_id(make_standard_name(addr_item.value));
IF address_street_word_id IS NOT NULL AND NOT(ARRAY[address_street_word_id] <@ isin_tokens) THEN
nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
isin_tokens := isin_tokens || address_street_word_id;
END IF;
-- merge word into address vector
address_street_word_id := get_word_id(make_standard_name(isin[i]));
address_street_word_id := get_word_id(make_standard_name(addr_item.value));
IF address_street_word_id IS NOT NULL THEN
nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
END IF;
END LOOP;
END IF;
END IF;
END IF;
IF addr_item.key = 'is_in' THEN
-- is_in items need splitting
isin := regexp_split_to_array(addr_item.value, E'[;,]');
IF array_upper(isin, 1) IS NOT NULL THEN
FOR i IN 1..array_upper(isin, 1) LOOP
address_street_word_id := get_name_id(make_standard_name(isin[i]));
IF address_street_word_id IS NOT NULL AND NOT(ARRAY[address_street_word_id] <@ isin_tokens) THEN
isin_tokens := isin_tokens || address_street_word_id;
END IF;
-- %NOTIGERDATA% IF 0 THEN
-- for the USA we have an additional address table. Merge in zip codes from there too
IF NEW.rank_search = 26 AND NEW.country_code = 'us' THEN
FOR location IN SELECT distinct postcode from location_property_tiger where parent_place_id = NEW.place_id LOOP
address_street_word_id := get_name_id(make_standard_name(location.postcode));
nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
isin_tokens := isin_tokens || address_street_word_id;
-- also merge in the single word version
address_street_word_id := get_word_id(make_standard_name(location.postcode));
nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
-- merge word into address vector
address_street_word_id := get_word_id(make_standard_name(isin[i]));
IF address_street_word_id IS NOT NULL THEN
nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
END IF;
END LOOP;
END IF;
END IF;
END LOOP;
END IF;
--DEBUG: RAISE WARNING 'Tiger postcodes collected';
-- %NOTIGERDATA% END IF;
nameaddress_vector := array_merge(nameaddress_vector, isin_tokens);
-- RAISE WARNING 'ISIN: %', isin_tokens;
@ -1884,40 +1884,6 @@ BEGIN
END LOOP;
--DEBUG: RAISE WARNING 'address computed';
-- try using the isin value to find parent places
IF array_upper(isin_tokens, 1) IS NOT NULL THEN
FOR i IN 1..array_upper(isin_tokens, 1) LOOP
--RAISE WARNING ' getNearestNamedFeature: % % % %',NEW.partition, place_centroid, search_maxrank, isin_tokens[i];
IF NOT ARRAY[isin_tokens[i]] <@ nameaddress_vector THEN
FOR location IN SELECT * from getNearestNamedFeature(NEW.partition, place_centroid, search_maxrank, isin_tokens[i]) LOOP
--RAISE WARNING ' ISIN: %',location;
IF location.rank_search > 4 THEN
nameaddress_vector := array_merge(nameaddress_vector, location.keywords::integer[]);
INSERT INTO place_addressline (place_id, address_place_id, fromarea, isaddress, distance, cached_rank_address)
VALUES (NEW.place_id, location.place_id, false, NOT address_havelevel[location.rank_address], location.distance, location.rank_address);
IF NEW.postcode is null AND location.postcode is not null
AND NOT address_havelevel[location.rank_address] THEN
NEW.postcode := location.postcode;
END IF;
address_havelevel[location.rank_address] := true;
IF location.rank_address > parent_place_id_rank THEN
NEW.parent_place_id = location.place_id;
parent_place_id_rank = location.rank_address;
END IF;
END IF;
END LOOP;
END IF;
END LOOP;
END IF;
--DEBUG: RAISE WARNING 'isin tokens processed';
-- for long ways we should add search terms for the entire length
IF st_length(NEW.geometry) > 0.05 THEN

View File

@ -19,8 +19,6 @@ CREATE INDEX idx_location_area_country_place_id ON location_area_country USING B
CREATE INDEX idx_osmline_parent_place_id ON location_property_osmline USING BTREE (parent_place_id) {ts:search-index};
CREATE INDEX idx_search_name_country_centroid ON search_name_country USING GIST (centroid) {ts:address-index};
DROP INDEX IF EXISTS place_id_idx;
CREATE UNIQUE INDEX idx_place_osm_unique on place using btree(osm_id,osm_type,class,type) {ts:address-index};

View File

@ -84,38 +84,6 @@ END
$$
LANGUAGE plpgsql;
create or replace function getNearestNamedFeature(in_partition INTEGER, point GEOMETRY, maxrank INTEGER, isin_token INTEGER) RETURNS setof nearfeature AS $$
DECLARE
r nearfeature%rowtype;
BEGIN
-- start
IF in_partition = -partition- THEN
FOR r IN
SELECT place_id, name_vector, address_rank, search_rank,
ST_Distance(centroid, point) as distance, null as isguess
FROM search_name_-partition-
WHERE name_vector @> ARRAY[isin_token]
AND search_rank < maxrank
UNION ALL
SELECT place_id, name_vector, address_rank, search_rank,
ST_Distance(centroid, point) as distance, null as isguess
FROM search_name_country
WHERE name_vector @> ARRAY[isin_token]
AND search_rank < maxrank
ORDER BY distance ASC limit 1
LOOP
RETURN NEXT r;
END LOOP;
RETURN;
END IF;
-- end
RAISE EXCEPTION 'Unknown partition %', in_partition;
END
$$
LANGUAGE plpgsql;
create or replace function getNearestNamedRoadFeature(in_partition INTEGER, point GEOMETRY, isin_token INTEGER[])
RETURNS setof nearfeature AS $$
DECLARE
@ -185,15 +153,6 @@ BEGIN
INSERT INTO search_name (place_id, search_rank, address_rank, importance, country_code, name_vector, nameaddress_vector, centroid)
values (in_place_id, in_rank_search, in_rank_address, in_importance, in_country_code, in_name_vector, in_nameaddress_vector, in_centroid);
IF in_rank_search <= 4 THEN
DELETE FROM search_name_country WHERE place_id = in_place_id;
IF in_rank_address > 0 THEN
INSERT INTO search_name_country (place_id, search_rank, address_rank, name_vector, centroid)
values (in_place_id, in_rank_search, in_rank_address, in_name_vector, in_geometry);
END IF;
RETURN TRUE;
END IF;
-- start
IF in_partition = -partition- THEN
DELETE FROM search_name_-partition- values WHERE place_id = in_place_id;
@ -216,7 +175,6 @@ DECLARE
BEGIN
DELETE from search_name WHERE place_id = in_place_id;
DELETE from search_name_country WHERE place_id = in_place_id;
-- start
IF in_partition = -partition- THEN

View File

@ -38,10 +38,6 @@ CREATE TABLE search_name_blank (
CREATE TABLE location_area_country () INHERITS (location_area_large) {ts:address-data};
CREATE INDEX idx_location_area_country_geometry ON location_area_country USING GIST (geometry) {ts:address-index};
CREATE TABLE search_name_country () INHERITS (search_name_blank) {ts:address-data};
CREATE INDEX idx_search_name_country_place_id ON search_name_country USING BTREE (place_id) {ts:address-index};
CREATE INDEX idx_search_name_country_name_vector ON search_name_country USING GIN (name_vector) WITH (fastupdate = off) {ts:address-index};
-- start
CREATE TABLE location_area_large_-partition- () INHERITS (location_area_large) {ts:address-data};
CREATE INDEX idx_location_area_large_-partition-_place_id ON location_area_large_-partition- USING BTREE (place_id) {ts:address-index};

View File

@ -23,3 +23,57 @@ Feature: Creation of search terms
Then search_name contains
| object | name_vector | nameaddress_vector |
| N1 | foo | the road |
Scenario: Some addr: tags are added to address when the name exists
Given the scene roads-with-pois
And the places
| osm | class | type | name | geometry |
| N1 | place | state | new york | 80 80 |
| N1 | place | city | bonn | 81 81 |
| N1 | place | suburb | smalltown| 80 81 |
And the named places
| osm | class | type | addr+city | addr+state | addr+suburb | geometry |
| W1 | highway | service | bonn | New York | Smalltown | :w-north |
When importing
Then search_name contains
| object | nameaddress_vector |
| W1 | bonn, new york, smalltown |
Scenario: A known addr:* tag is not added if the name is unknown
Given the scene roads-with-pois
And the places
| osm | class | type | name | addr+city | geometry |
| W1 | highway | residential | Road | Nandu | :w-north |
When importing
Then search_name contains not
| object | nameaddress_vector |
| W1 | nandu |
Scenario: addr:postcode is not added to the address terms
Given the scene roads-with-pois
And the places
| osm | class | type | name+ref | geometry |
| N1 | place | state | 12345 | 80 80 |
And the named places
| osm | class | type | addr+postcode | geometry |
| W1 | highway | residential | 12345 | :w-north |
When importing
Then search_name contains not
| object | nameaddress_vector |
| W1 | 12345 |
Scenario: is_in is split and added to the address search terms
Given the scene roads-with-pois
And the places
| osm | class | type | name | geometry |
| N1 | place | state | new york | 80 80 |
| N1 | place | city | bonn | 81 81 |
| N1 | place | suburb | smalltown| 80 81 |
And the named places
| osm | class | type | addr+is_in | geometry |
| W1 | highway | service | bonn, New York, Smalltown | :w-north |
When importing
Then search_name contains
| object | nameaddress_vector |
| W1 | bonn, new york, smalltown |

View File

@ -427,8 +427,8 @@ def check_placex_contents(context, exact):
context.db.commit()
@then("search_name contains")
def check_search_name_contents(context):
@then("search_name contains(?P<exclude> not)?")
def check_search_name_contents(context, exclude):
cur = context.db.cursor(cursor_factory=psycopg2.extras.DictCursor)
for row in context.table:
@ -446,11 +446,16 @@ def check_search_name_contents(context):
FROM word, (SELECT unnest(%s) as term) t
WHERE word_token = make_standard_name(t.term)""",
(terms,))
ok_(subcur.rowcount >= len(terms),
"No word entry found for " + row[h])
if not exclude:
ok_(subcur.rowcount >= len(terms),
"No word entry found for " + row[h])
for wid in subcur:
assert_in(wid[0], res[h],
"Missing term for %s/%s: %s" % (pid, h, wid[1]))
if exclude:
assert_not_in(wid[0], res[h],
"Found term for %s/%s: %s" % (pid, h, wid[1]))
else:
assert_in(wid[0], res[h],
"Missing term for %s/%s: %s" % (pid, h, wid[1]))
else:
assert_db_column(res, h, row[h], context)