move postcode normalization into tokenizer

This commit is contained in:
Sarah Hoffmann 2021-04-25 18:26:36 +02:00
parent d8ed1bfc60
commit ffc2d82b0e
12 changed files with 181 additions and 74 deletions

View File

@ -12,39 +12,47 @@ $$
LANGUAGE plpgsql IMMUTABLE;
-- Derive an address for an interpolation way that has no address tags of
-- its own by inheriting the address of one of its member nodes.
-- Returns in_address unchanged unless the way carries ONLY an
-- 'interpolation' key; otherwise the way's own address wins.
CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
RETURNS HSTORE
AS $$
DECLARE
location RECORD;
waynodes BIGINT[];
BEGIN
-- Only inherit when the way has nothing but the interpolation key.
IF akeys(in_address) != ARRAY['interpolation'] THEN
RETURN in_address;
END IF;
SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
-- Scan member nodes for one that has an addressable 'street'/'place'
-- entry; the first match wins (RETURN exits on the first iteration).
FOR location IN
SELECT placex.address, placex.osm_id FROM placex
WHERE osm_type = 'N' and osm_id = ANY(waynodes)
and placex.address is not null
and (placex.address ? 'street' or placex.address ? 'place')
and indexed_status < 100
LOOP
-- mark it as a derived address
-- hstore || keeps the right operand's values on conflict, so the
-- way's own keys override the node's; '_inherited' flags the
-- result so it can be stripped again on re-index.
RETURN location.address || in_address || hstore('_inherited', '');
END LOOP;
-- No suitable node found: keep the way's own (interpolation-only) address.
RETURN in_address;
END;
$$
LANGUAGE plpgsql STABLE;
-- find the parent road of the cut road parts
CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
CREATE OR REPLACE FUNCTION get_interpolation_parent(street TEXT,
place TEXT, partition SMALLINT,
centroid GEOMETRY, geom GEOMETRY)
RETURNS BIGINT
AS $$
DECLARE
addr_street TEXT;
addr_place TEXT;
parent_place_id BIGINT;
waynodes BIGINT[];
location RECORD;
BEGIN
addr_street = street;
addr_place = place;
IF addr_street is null and addr_place is null THEN
select nodes from planet_osm_ways where id = wayid INTO waynodes;
FOR location IN SELECT placex.address from placex
where osm_type = 'N' and osm_id = ANY(waynodes)
and placex.address is not null
and (placex.address ? 'street' or placex.address ? 'place')
and indexed_status < 100
limit 1 LOOP
addr_street = location.address->'street';
addr_place = location.address->'place';
END LOOP;
END IF;
parent_place_id := find_parent_for_address(addr_street, addr_place,
partition, centroid);
parent_place_id := find_parent_for_address(street, place, partition, centroid);
IF parent_place_id is null THEN
FOR location IN SELECT place_id FROM placex
@ -147,17 +155,20 @@ BEGIN
NEW.interpolationtype = NEW.address->'interpolation';
place_centroid := ST_PointOnSurface(NEW.linegeo);
NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
NEW.parent_place_id = get_interpolation_parent(NEW.address->'street',
NEW.address->'place',
NEW.partition, place_centroid, NEW.linegeo);
IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
interpol_postcode := NEW.address->'postcode';
housenum := getorcreate_postcode_id(NEW.address->'postcode');
ELSE
interpol_postcode := NULL;
END IF;
IF NEW.address ? '_inherited' THEN
NEW.address := hstore('interpolation', NEW.interpolationtype);
END IF;
-- if the line was newly inserted, split the line as necessary
IF OLD.indexed_status = 1 THEN
select nodes from planet_osm_ways where id = NEW.osm_id INTO waynodes;

View File

@ -817,10 +817,6 @@ BEGIN
IF NEW.address is not NULL THEN
addr_street := NEW.address->'street';
addr_place := NEW.address->'place';
IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
i := getorcreate_postcode_id(NEW.address->'postcode');
END IF;
END IF;
NEW.postcode := null;

View File

@ -34,6 +34,13 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Normalize a postcode for storage and comparison: trim surrounding
-- whitespace and upper-case the remainder. Returns NULL for list-valued
-- postcodes (containing ',' or ';'), which cannot be treated as a single
-- postcode. STRICT makes a NULL input yield NULL without evaluating the body.
-- NOTE(review): the Python-side _add_postcode() also rejects ':' while this
-- function does not -- confirm whether ':' should be filtered here as well.
CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
RETURNS TEXT
AS $$
SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL
ELSE upper(trim(postcode))
END;
$$ LANGUAGE SQL IMMUTABLE STRICT;
-- Return token info that should be saved permanently in the database.
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
RETURNS JSONB
@ -133,26 +140,26 @@ $$
LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
RETURNS INTEGER
CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
RETURNS BOOLEAN
AS $$
DECLARE
r RECORD;
lookup_token TEXT;
lookup_word TEXT;
return_word_id INTEGER;
BEGIN
lookup_word := upper(trim(postcode));
lookup_token := ' ' || make_standard_name(lookup_word);
SELECT min(word_id) FROM word
WHERE word_token = lookup_token and word = lookup_word
lookup_token := ' ' || make_standard_name(postcode);
FOR r IN
SELECT word_id FROM word
WHERE word_token = lookup_token and word = postcode
and class='place' and type='postcode'
INTO return_word_id;
IF return_word_id IS NULL THEN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
'place', 'postcode', null, 0);
END IF;
RETURN return_word_id;
LOOP
RETURN false;
END LOOP;
INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
'place', 'postcode', null, 0);
RETURN true;
END;
$$
LANGUAGE plpgsql;

View File

@ -116,7 +116,8 @@ class SetupAll:
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
tokenizer)
if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
if args.continue_at is not None and args.continue_at != 'load-data':

View File

@ -147,7 +147,7 @@ class Indexer:
if maxrank == 30:
self._index(runners.RankRunner(0, analyzer))
self._index(runners.InterpolationRunner(), 20)
self._index(runners.InterpolationRunner(analyzer), 20)
self._index(runners.RankRunner(30, analyzer), 20)
else:
self._index(runners.RankRunner(maxrank, analyzer))

View File

@ -25,7 +25,7 @@ class AbstractPlacexRunner:
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
""".format(','.join(["(%s, %s::hstore, %s::json)"] * num_places))
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
def index_places(self, worker, places):
@ -82,6 +82,10 @@ class InterpolationRunner:
location_property_osmline.
"""
def __init__(self, analyzer):
self.analyzer = analyzer
@staticmethod
def name():
return "interpolation lines (location_property_osmline)"
@ -93,15 +97,30 @@ class InterpolationRunner:
@staticmethod
def sql_get_objects():
return """SELECT place_id FROM location_property_osmline
return """SELECT place_id, get_interpolation_address(address, osm_id) as address
FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
@staticmethod
def index_places(worker, ids):
worker.perform(""" UPDATE location_property_osmline
SET indexed_status = 0 WHERE place_id IN ({})
""".format(','.join((str(i[0]) for i in ids))))
@functools.lru_cache(maxsize=1)
def _index_sql(num_places):
return """ UPDATE location_property_osmline
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
def index_places(self, worker, places):
values = []
for place in places:
values.extend((place[x] for x in ('place_id', 'address')))
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
worker.perform(self._index_sql(len(places)), values)
class PostcodeRunner:

View File

@ -1,6 +1,7 @@
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
@ -213,6 +214,15 @@ class LegacyNameAnalyzer:
self.conn.close()
self.conn = None
def add_postcodes_from_db(self):
""" Add postcodes from the location_postcode table to the word table.

    Called for its side effect only: create_postcode_id() inserts any
    missing postcode token into the word table. The surrounding
    count(...) merely forces evaluation for every distinct postcode;
    the result is discarded.
"""
with self.conn.cursor() as cur:
# DISTINCT keeps the number of create_postcode_id() calls down to
# one per unique postcode value.
cur.execute("""SELECT count(create_postcode_id(pc))
FROM (SELECT distinct(postcode) as pc
FROM location_postcode) x""")
def process_place(self, place):
""" Determine tokenizer information about the given place.
@ -226,11 +236,25 @@ class LegacyNameAnalyzer:
address = place.get('address')
if address:
self._add_postcode(address.get('postcode'))
token_info.add_housenumbers(self.conn, address)
return token_info.data
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.

    Does nothing for empty postcodes or list-valued ones (containing
    ':', ',' or ';').
"""
# NOTE(review): this guard also rejects ':' whereas the SQL function
# token_normalized_postcode() only rejects ',' and ';' -- confirm the
# two filters are meant to differ.
if not postcode or re.search(r'[:,;]', postcode) is not None:
return
# Generator for the LRU cache below: issue the INSERT only on a cache
# miss so repeated postcodes cost a single database round trip.
def _create_postcode_from_db(pcode):
with self.conn.cursor() as cur:
cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
# Cache key uses the same normalization (strip + upper) as the SQL side.
self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
@ -285,6 +309,32 @@ class _TokenInfo:
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
class _LRU:
""" Least recently used cache that accepts a generator function to
produce the item when there is a cache miss.
"""
def __init__(self, maxsize=128):
self.data = OrderedDict()
self.maxsize = maxsize
def get(self, key, generator):
""" Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the
generator function and store it in the cache.
"""
value = self.data.get(key)
if value is not None:
self.data.move_to_end(key)
else:
value = generator(key)
if len(self.data) >= self.maxsize:
self.data.popitem(last=False)
self.data[key] = value
return value
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
@ -292,6 +342,9 @@ class _TokenCache:
analyzer.
"""
def __init__(self, conn):
# various LRU caches
self.postcodes = _LRU(maxsize=32)
# Look up housenumbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text

View File

@ -6,7 +6,7 @@ of artificial postcode centroids.
from nominatim.db.utils import execute_file
from nominatim.db.connection import connect
def import_postcodes(dsn, project_dir):
def import_postcodes(dsn, project_dir, tokenizer):
""" Set up the initial list of postcodes.
"""
@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, country_code,
upper(trim (both ' ' from address->'postcode')) as pc,
token_normalized_postcode(address->'postcode') as pc,
ST_Centroid(ST_Collect(ST_Centroid(geometry)))
FROM placex
WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
WHERE address ? 'postcode'
and token_normalized_postcode(address->'postcode') is not null
AND geometry IS NOT null
GROUP BY country_code, pc
""")
@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'us', postcode,
SELECT nextval('seq_place'), 1, 'us',
token_normalized_postcode(postcode),
ST_SetSRID(ST_Point(x,y),4326)
FROM us_postcode WHERE postcode NOT IN
FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'us')
""")
@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
FROM gb_postcode WHERE postcode NOT IN
SELECT nextval('seq_place'), 1, 'gb',
token_normalized_postcode(postcode), geometry
FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'gb')
""")
@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
DELETE FROM word WHERE class='place' and type='postcode'
and word NOT IN (SELECT postcode FROM location_postcode)
""")
cur.execute("""
SELECT count(getorcreate_postcode_id(v)) FROM
(SELECT distinct(postcode) as v FROM location_postcode) p
""")
conn.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.add_postcodes_from_db()

View File

@ -299,7 +299,7 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
@pytest.fixture
def tokenizer_mock(monkeypatch, property_table, temp_db_conn):
def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
""" Sets up the configuration so that the test dummy tokenizer will be
loaded.
"""

View File

@ -43,6 +43,9 @@ class DummyNameAnalyzer:
"""
pass
def add_postcodes_from_db(self):
""" Test stub: accept the call made by import_postcodes() but do
    nothing, since the dummy tokenizer has no word table to fill.
"""
pass
def process_place(self, place):
""" Determine tokenizer information about the given place.

View File

@ -33,6 +33,9 @@ class IndexerTestDB:
geometry_sector INTEGER)""")
cur.execute("""CREATE TABLE location_property_osmline (
place_id BIGINT,
osm_id BIGINT,
address HSTORE,
token_info JSONB,
indexed_status SMALLINT,
indexed_date TIMESTAMP,
geometry_sector INTEGER)""")
@ -61,6 +64,14 @@ class IndexerTestDB:
END;
$$ LANGUAGE plpgsql STABLE;
""")
cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
RETURNS HSTORE AS $$
BEGIN
RETURN in_address;
END;
$$ LANGUAGE plpgsql STABLE;
""")
for table in ('placex', 'location_property_osmline', 'location_postcode'):
cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
FOR EACH ROW EXECUTE PROCEDURE date_update()
@ -91,9 +102,9 @@ class IndexerTestDB:
next_id = next(self.osmline_id)
with self.conn.cursor() as cur:
cur.execute("""INSERT INTO location_property_osmline
(place_id, indexed_status, geometry_sector)
VALUES (%s, 1, %s)""",
(next_id, sector))
(place_id, osm_id, indexed_status, geometry_sector)
VALUES (%s, %s, 1, %s)""",
(next_id, next_id, sector))
return next_id
def add_postcode(self, country, postcode):

View File

@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
import pytest
from nominatim.tools import postcodes
import dummy_tokenizer
@pytest.fixture
def tokenizer():
# Stand-in tokenizer so import_postcodes() can obtain a name analyzer
# without a real database connection; both constructor arguments
# (presumably dsn and data directory) are unused by the dummy.
return dummy_tokenizer.DummyTokenizer(None, None)
@pytest.fixture
def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
postcode TEXT,
geometry GEOMETRY(Geometry, 4326)""")
temp_db_cursor.execute('CREATE SEQUENCE seq_place')
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
""")
def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
postcodes.import_postcodes(dsn, tmp_path)
def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
postcodes.import_postcodes(dsn, tmp_path, tokenizer)
assert temp_db_cursor.table_exists('gb_postcode')
assert temp_db_cursor.table_exists('us_postcode')
assert temp_db_cursor.table_rows('location_postcode') == 0
def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
temp_db_cursor.execute("""
INSERT INTO placex (place_id, country_code, address, geometry)
VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
""")
postcodes.import_postcodes(dsn, tmp_path)
postcodes.import_postcodes(dsn, tmp_path, tokenizer)
rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
ST_X(geometry), ST_Y(geometry)