mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-12-23 21:14:11 +03:00
move postcode normalization into tokenizer
This commit is contained in:
parent
d8ed1bfc60
commit
ffc2d82b0e
@ -12,39 +12,47 @@ $$
|
||||
LANGUAGE plpgsql IMMUTABLE;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
|
||||
RETURNS HSTORE
|
||||
AS $$
|
||||
DECLARE
|
||||
location RECORD;
|
||||
waynodes BIGINT[];
|
||||
BEGIN
|
||||
IF akeys(in_address) != ARRAY['interpolation'] THEN
|
||||
RETURN in_address;
|
||||
END IF;
|
||||
|
||||
SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
|
||||
FOR location IN
|
||||
SELECT placex.address, placex.osm_id FROM placex
|
||||
WHERE osm_type = 'N' and osm_id = ANY(waynodes)
|
||||
and placex.address is not null
|
||||
and (placex.address ? 'street' or placex.address ? 'place')
|
||||
and indexed_status < 100
|
||||
LOOP
|
||||
-- mark it as a derived address
|
||||
RETURN location.address || in_address || hstore('_inherited', '');
|
||||
END LOOP;
|
||||
|
||||
RETURN in_address;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql STABLE;
|
||||
|
||||
|
||||
|
||||
-- find the parent road of the cut road parts
|
||||
CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
|
||||
CREATE OR REPLACE FUNCTION get_interpolation_parent(street TEXT,
|
||||
place TEXT, partition SMALLINT,
|
||||
centroid GEOMETRY, geom GEOMETRY)
|
||||
RETURNS BIGINT
|
||||
AS $$
|
||||
DECLARE
|
||||
addr_street TEXT;
|
||||
addr_place TEXT;
|
||||
parent_place_id BIGINT;
|
||||
|
||||
waynodes BIGINT[];
|
||||
|
||||
location RECORD;
|
||||
BEGIN
|
||||
addr_street = street;
|
||||
addr_place = place;
|
||||
|
||||
IF addr_street is null and addr_place is null THEN
|
||||
select nodes from planet_osm_ways where id = wayid INTO waynodes;
|
||||
FOR location IN SELECT placex.address from placex
|
||||
where osm_type = 'N' and osm_id = ANY(waynodes)
|
||||
and placex.address is not null
|
||||
and (placex.address ? 'street' or placex.address ? 'place')
|
||||
and indexed_status < 100
|
||||
limit 1 LOOP
|
||||
addr_street = location.address->'street';
|
||||
addr_place = location.address->'place';
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
parent_place_id := find_parent_for_address(addr_street, addr_place,
|
||||
partition, centroid);
|
||||
parent_place_id := find_parent_for_address(street, place, partition, centroid);
|
||||
|
||||
IF parent_place_id is null THEN
|
||||
FOR location IN SELECT place_id FROM placex
|
||||
@ -147,17 +155,20 @@ BEGIN
|
||||
NEW.interpolationtype = NEW.address->'interpolation';
|
||||
|
||||
place_centroid := ST_PointOnSurface(NEW.linegeo);
|
||||
NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
|
||||
NEW.parent_place_id = get_interpolation_parent(NEW.address->'street',
|
||||
NEW.address->'place',
|
||||
NEW.partition, place_centroid, NEW.linegeo);
|
||||
|
||||
IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
|
||||
interpol_postcode := NEW.address->'postcode';
|
||||
housenum := getorcreate_postcode_id(NEW.address->'postcode');
|
||||
ELSE
|
||||
interpol_postcode := NULL;
|
||||
END IF;
|
||||
|
||||
IF NEW.address ? '_inherited' THEN
|
||||
NEW.address := hstore('interpolation', NEW.interpolationtype);
|
||||
END IF;
|
||||
|
||||
-- if the line was newly inserted, split the line as necessary
|
||||
IF OLD.indexed_status = 1 THEN
|
||||
select nodes from planet_osm_ways where id = NEW.osm_id INTO waynodes;
|
||||
|
@ -817,10 +817,6 @@ BEGIN
|
||||
IF NEW.address is not NULL THEN
|
||||
addr_street := NEW.address->'street';
|
||||
addr_place := NEW.address->'place';
|
||||
|
||||
IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
|
||||
i := getorcreate_postcode_id(NEW.address->'postcode');
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
NEW.postcode := null;
|
||||
|
@ -34,6 +34,13 @@ AS $$
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
|
||||
RETURNS TEXT
|
||||
AS $$
|
||||
SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
|
||||
$$ LANGUAGE SQL IMMUTABLE STRICT;
|
||||
|
||||
|
||||
-- Return token info that should be saved permanently in the database.
|
||||
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
|
||||
RETURNS JSONB
|
||||
@ -133,26 +140,26 @@ $$
|
||||
LANGUAGE plpgsql;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
|
||||
RETURNS INTEGER
|
||||
CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
|
||||
RETURNS BOOLEAN
|
||||
AS $$
|
||||
DECLARE
|
||||
r RECORD;
|
||||
lookup_token TEXT;
|
||||
lookup_word TEXT;
|
||||
return_word_id INTEGER;
|
||||
BEGIN
|
||||
lookup_word := upper(trim(postcode));
|
||||
lookup_token := ' ' || make_standard_name(lookup_word);
|
||||
SELECT min(word_id) FROM word
|
||||
WHERE word_token = lookup_token and word = lookup_word
|
||||
lookup_token := ' ' || make_standard_name(postcode);
|
||||
FOR r IN
|
||||
SELECT word_id FROM word
|
||||
WHERE word_token = lookup_token and word = postcode
|
||||
and class='place' and type='postcode'
|
||||
INTO return_word_id;
|
||||
IF return_word_id IS NULL THEN
|
||||
return_word_id := nextval('seq_word');
|
||||
INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
|
||||
'place', 'postcode', null, 0);
|
||||
END IF;
|
||||
RETURN return_word_id;
|
||||
LOOP
|
||||
RETURN false;
|
||||
END LOOP;
|
||||
|
||||
INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
|
||||
'place', 'postcode', null, 0);
|
||||
RETURN true;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql;
|
||||
|
@ -116,7 +116,8 @@ class SetupAll:
|
||||
|
||||
if args.continue_at is None or args.continue_at == 'load-data':
|
||||
LOG.warning('Calculate postcodes')
|
||||
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
|
||||
postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
|
||||
tokenizer)
|
||||
|
||||
if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
|
||||
if args.continue_at is not None and args.continue_at != 'load-data':
|
||||
|
@ -147,7 +147,7 @@ class Indexer:
|
||||
|
||||
if maxrank == 30:
|
||||
self._index(runners.RankRunner(0, analyzer))
|
||||
self._index(runners.InterpolationRunner(), 20)
|
||||
self._index(runners.InterpolationRunner(analyzer), 20)
|
||||
self._index(runners.RankRunner(30, analyzer), 20)
|
||||
else:
|
||||
self._index(runners.RankRunner(maxrank, analyzer))
|
||||
|
@ -25,7 +25,7 @@ class AbstractPlacexRunner:
|
||||
SET indexed_status = 0, address = v.addr, token_info = v.ti
|
||||
FROM (VALUES {}) as v(id, addr, ti)
|
||||
WHERE place_id = v.id
|
||||
""".format(','.join(["(%s, %s::hstore, %s::json)"] * num_places))
|
||||
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
|
||||
|
||||
|
||||
def index_places(self, worker, places):
|
||||
@ -82,6 +82,10 @@ class InterpolationRunner:
|
||||
location_property_osmline.
|
||||
"""
|
||||
|
||||
def __init__(self, analyzer):
|
||||
self.analyzer = analyzer
|
||||
|
||||
|
||||
@staticmethod
|
||||
def name():
|
||||
return "interpolation lines (location_property_osmline)"
|
||||
@ -93,15 +97,30 @@ class InterpolationRunner:
|
||||
|
||||
@staticmethod
|
||||
def sql_get_objects():
|
||||
return """SELECT place_id FROM location_property_osmline
|
||||
return """SELECT place_id, get_interpolation_address(address, osm_id) as address
|
||||
FROM location_property_osmline
|
||||
WHERE indexed_status > 0
|
||||
ORDER BY geometry_sector"""
|
||||
|
||||
|
||||
@staticmethod
|
||||
def index_places(worker, ids):
|
||||
worker.perform(""" UPDATE location_property_osmline
|
||||
SET indexed_status = 0 WHERE place_id IN ({})
|
||||
""".format(','.join((str(i[0]) for i in ids))))
|
||||
@functools.lru_cache(maxsize=1)
|
||||
def _index_sql(num_places):
|
||||
return """ UPDATE location_property_osmline
|
||||
SET indexed_status = 0, address = v.addr, token_info = v.ti
|
||||
FROM (VALUES {}) as v(id, addr, ti)
|
||||
WHERE place_id = v.id
|
||||
""".format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
|
||||
|
||||
|
||||
def index_places(self, worker, places):
|
||||
values = []
|
||||
for place in places:
|
||||
values.extend((place[x] for x in ('place_id', 'address')))
|
||||
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
|
||||
|
||||
worker.perform(self._index_sql(len(places)), values)
|
||||
|
||||
|
||||
|
||||
class PostcodeRunner:
|
||||
|
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Tokenizer implementing normalisation as used before Nominatim 4.
|
||||
"""
|
||||
from collections import OrderedDict
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
@ -213,6 +214,15 @@ class LegacyNameAnalyzer:
|
||||
self.conn.close()
|
||||
self.conn = None
|
||||
|
||||
|
||||
def add_postcodes_from_db(self):
|
||||
""" Add postcodes from the location_postcode table to the word table.
|
||||
"""
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""SELECT count(create_postcode_id(pc))
|
||||
FROM (SELECT distinct(postcode) as pc
|
||||
FROM location_postcode) x""")
|
||||
|
||||
def process_place(self, place):
|
||||
""" Determine tokenizer information about the given place.
|
||||
|
||||
@ -226,11 +236,25 @@ class LegacyNameAnalyzer:
|
||||
address = place.get('address')
|
||||
|
||||
if address:
|
||||
self._add_postcode(address.get('postcode'))
|
||||
token_info.add_housenumbers(self.conn, address)
|
||||
|
||||
return token_info.data
|
||||
|
||||
|
||||
def _add_postcode(self, postcode):
|
||||
""" Make sure the normalized postcode is present in the word table.
|
||||
"""
|
||||
if not postcode or re.search(r'[:,;]', postcode) is not None:
|
||||
return
|
||||
|
||||
def _create_postcode_from_db(pcode):
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
|
||||
|
||||
self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
|
||||
|
||||
|
||||
class _TokenInfo:
|
||||
""" Collect token information to be sent back to the database.
|
||||
"""
|
||||
@ -285,6 +309,32 @@ class _TokenInfo:
|
||||
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
|
||||
|
||||
|
||||
class _LRU:
|
||||
""" Least recently used cache that accepts a generator function to
|
||||
produce the item when there is a cache miss.
|
||||
"""
|
||||
|
||||
def __init__(self, maxsize=128):
|
||||
self.data = OrderedDict()
|
||||
self.maxsize = maxsize
|
||||
|
||||
def get(self, key, generator):
|
||||
""" Get the item with the given key from the cache. If nothing
|
||||
is found in the cache, generate the value through the
|
||||
generator function and store it in the cache.
|
||||
"""
|
||||
value = self.data.get(key)
|
||||
if value is not None:
|
||||
self.data.move_to_end(key)
|
||||
else:
|
||||
value = generator(key)
|
||||
if len(self.data) >= self.maxsize:
|
||||
self.data.popitem(last=False)
|
||||
self.data[key] = value
|
||||
|
||||
return value
|
||||
|
||||
|
||||
class _TokenCache:
|
||||
""" Cache for token information to avoid repeated database queries.
|
||||
|
||||
@ -292,6 +342,9 @@ class _TokenCache:
|
||||
analyzer.
|
||||
"""
|
||||
def __init__(self, conn):
|
||||
# various LRU caches
|
||||
self.postcodes = _LRU(maxsize=32)
|
||||
|
||||
# Lookup houseunumbers up to 100 and cache them
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
|
||||
|
@ -6,7 +6,7 @@ of artificial postcode centroids.
|
||||
from nominatim.db.utils import execute_file
|
||||
from nominatim.db.connection import connect
|
||||
|
||||
def import_postcodes(dsn, project_dir):
|
||||
def import_postcodes(dsn, project_dir, tokenizer):
|
||||
""" Set up the initial list of postcodes.
|
||||
"""
|
||||
|
||||
@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
|
||||
INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code, postcode, geometry)
|
||||
SELECT nextval('seq_place'), 1, country_code,
|
||||
upper(trim (both ' ' from address->'postcode')) as pc,
|
||||
token_normalized_postcode(address->'postcode') as pc,
|
||||
ST_Centroid(ST_Collect(ST_Centroid(geometry)))
|
||||
FROM placex
|
||||
WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
|
||||
WHERE address ? 'postcode'
|
||||
and token_normalized_postcode(address->'postcode') is not null
|
||||
AND geometry IS NOT null
|
||||
GROUP BY country_code, pc
|
||||
""")
|
||||
@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
|
||||
cur.execute("""
|
||||
INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code, postcode, geometry)
|
||||
SELECT nextval('seq_place'), 1, 'us', postcode,
|
||||
SELECT nextval('seq_place'), 1, 'us',
|
||||
token_normalized_postcode(postcode),
|
||||
ST_SetSRID(ST_Point(x,y),4326)
|
||||
FROM us_postcode WHERE postcode NOT IN
|
||||
FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
|
||||
(SELECT postcode FROM location_postcode
|
||||
WHERE country_code = 'us')
|
||||
""")
|
||||
@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
|
||||
cur.execute("""
|
||||
INSERT INTO location_postcode
|
||||
(place_id, indexed_status, country_code, postcode, geometry)
|
||||
SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
|
||||
FROM gb_postcode WHERE postcode NOT IN
|
||||
SELECT nextval('seq_place'), 1, 'gb',
|
||||
token_normalized_postcode(postcode), geometry
|
||||
FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
|
||||
(SELECT postcode FROM location_postcode
|
||||
WHERE country_code = 'gb')
|
||||
""")
|
||||
@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
|
||||
DELETE FROM word WHERE class='place' and type='postcode'
|
||||
and word NOT IN (SELECT postcode FROM location_postcode)
|
||||
""")
|
||||
|
||||
cur.execute("""
|
||||
SELECT count(getorcreate_postcode_id(v)) FROM
|
||||
(SELECT distinct(postcode) as v FROM location_postcode) p
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
analyzer.add_postcodes_from_db()
|
||||
|
@ -299,7 +299,7 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tokenizer_mock(monkeypatch, property_table, temp_db_conn):
|
||||
def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
|
||||
""" Sets up the configuration so that the test dummy tokenizer will be
|
||||
loaded.
|
||||
"""
|
||||
|
@ -43,6 +43,9 @@ class DummyNameAnalyzer:
|
||||
"""
|
||||
pass
|
||||
|
||||
def add_postcodes_from_db(self):
|
||||
pass
|
||||
|
||||
def process_place(self, place):
|
||||
""" Determine tokenizer information about the given place.
|
||||
|
||||
|
@ -33,6 +33,9 @@ class IndexerTestDB:
|
||||
geometry_sector INTEGER)""")
|
||||
cur.execute("""CREATE TABLE location_property_osmline (
|
||||
place_id BIGINT,
|
||||
osm_id BIGINT,
|
||||
address HSTORE,
|
||||
token_info JSONB,
|
||||
indexed_status SMALLINT,
|
||||
indexed_date TIMESTAMP,
|
||||
geometry_sector INTEGER)""")
|
||||
@ -61,6 +64,14 @@ class IndexerTestDB:
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
""")
|
||||
cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
|
||||
RETURNS HSTORE AS $$
|
||||
BEGIN
|
||||
RETURN in_address;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql STABLE;
|
||||
""")
|
||||
|
||||
for table in ('placex', 'location_property_osmline', 'location_postcode'):
|
||||
cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
|
||||
FOR EACH ROW EXECUTE PROCEDURE date_update()
|
||||
@ -91,9 +102,9 @@ class IndexerTestDB:
|
||||
next_id = next(self.osmline_id)
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""INSERT INTO location_property_osmline
|
||||
(place_id, indexed_status, geometry_sector)
|
||||
VALUES (%s, 1, %s)""",
|
||||
(next_id, sector))
|
||||
(place_id, osm_id, indexed_status, geometry_sector)
|
||||
VALUES (%s, %s, 1, %s)""",
|
||||
(next_id, next_id, sector))
|
||||
return next_id
|
||||
|
||||
def add_postcode(self, country, postcode):
|
||||
|
@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
|
||||
import pytest
|
||||
|
||||
from nominatim.tools import postcodes
|
||||
import dummy_tokenizer
|
||||
|
||||
@pytest.fixture
|
||||
def tokenizer():
|
||||
return dummy_tokenizer.DummyTokenizer(None, None)
|
||||
|
||||
@pytest.fixture
|
||||
def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
|
||||
@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
|
||||
postcode TEXT,
|
||||
geometry GEOMETRY(Geometry, 4326)""")
|
||||
temp_db_cursor.execute('CREATE SEQUENCE seq_place')
|
||||
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
|
||||
RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
|
||||
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
|
||||
RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
|
||||
""")
|
||||
|
||||
|
||||
def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
|
||||
postcodes.import_postcodes(dsn, tmp_path)
|
||||
def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
|
||||
postcodes.import_postcodes(dsn, tmp_path, tokenizer)
|
||||
|
||||
assert temp_db_cursor.table_exists('gb_postcode')
|
||||
assert temp_db_cursor.table_exists('us_postcode')
|
||||
assert temp_db_cursor.table_rows('location_postcode') == 0
|
||||
|
||||
|
||||
def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
|
||||
def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
|
||||
temp_db_cursor.execute("""
|
||||
INSERT INTO placex (place_id, country_code, address, geometry)
|
||||
VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
|
||||
""")
|
||||
|
||||
postcodes.import_postcodes(dsn, tmp_path)
|
||||
postcodes.import_postcodes(dsn, tmp_path, tokenizer)
|
||||
|
||||
rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
|
||||
ST_X(geometry), ST_Y(geometry)
|
||||
|
Loading…
Reference in New Issue
Block a user