move postcode normalization into tokenizer

2024-12-24 13:31:37 +03:00 · 2021-04-25 18:26:36 +02:00 · 2021-04-25 18:26:36 +02:00 · ffc2d82b0e
commit ffc2d82b0e
parent d8ed1bfc60
12 changed files with 181 additions and 74 deletions
--- a/lib-sql/functions/interpolation.sql
+++ b/lib-sql/functions/interpolation.sql
@ -12,39 +12,47 @@ $$
 LANGUAGE plpgsql IMMUTABLE;


+CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+RETURNS HSTORE
+  AS $$
+DECLARE
+  location RECORD;
+  waynodes BIGINT[];
+BEGIN
+  IF akeys(in_address) != ARRAY['interpolation'] THEN
+    RETURN in_address;
+  END IF;
+
+  SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
+  FOR location IN
+    SELECT placex.address, placex.osm_id FROM placex
+     WHERE osm_type = 'N' and osm_id = ANY(waynodes)
+           and placex.address is not null
+           and (placex.address ? 'street' or placex.address ? 'place')
+           and indexed_status < 100
+  LOOP
+    -- mark it as a derived address
+    RETURN location.address || in_address || hstore('_inherited', '');
+  END LOOP;
+
+  RETURN in_address;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
+
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
+CREATE OR REPLACE FUNCTION get_interpolation_parent(street TEXT,
                                                    place TEXT, partition SMALLINT,
                                                    centroid GEOMETRY, geom GEOMETRY)
  RETURNS BIGINT
  AS $$
 DECLARE
-  addr_street TEXT;
-  addr_place TEXT;
  parent_place_id BIGINT;
-
-  waynodes BIGINT[];
-
  location RECORD;
 BEGIN
-  addr_street = street;
-  addr_place = place;
-
-  IF addr_street is null and addr_place is null THEN
-    select nodes from planet_osm_ways where id = wayid INTO waynodes;
-    FOR location IN SELECT placex.address from placex
-                    where osm_type = 'N' and osm_id = ANY(waynodes)
-                          and placex.address is not null
-                          and (placex.address ? 'street' or placex.address ? 'place')
-                          and indexed_status < 100
-                    limit 1 LOOP
-      addr_street = location.address->'street';
-      addr_place = location.address->'place';
-    END LOOP;
-  END IF;
-
-  parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                             partition, centroid);
+  parent_place_id := find_parent_for_address(street, place, partition, centroid);

  IF parent_place_id is null THEN
    FOR location IN SELECT place_id FROM placex
@ -147,17 +155,20 @@ BEGIN
  NEW.interpolationtype = NEW.address->'interpolation';

  place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
+  NEW.parent_place_id = get_interpolation_parent(NEW.address->'street',
                                                 NEW.address->'place',
                                                 NEW.partition, place_centroid, NEW.linegeo);

  IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
    interpol_postcode := NEW.address->'postcode';
-    housenum := getorcreate_postcode_id(NEW.address->'postcode');
  ELSE
    interpol_postcode := NULL;
  END IF;

+  IF NEW.address ? '_inherited' THEN
+    NEW.address := hstore('interpolation', NEW.interpolationtype);
+  END IF;
+
  -- if the line was newly inserted, split the line as necessary
  IF OLD.indexed_status = 1 THEN
      select nodes from planet_osm_ways where id = NEW.osm_id INTO waynodes;
--- a/lib-sql/functions/placex_triggers.sql
+++ b/lib-sql/functions/placex_triggers.sql
@ -817,10 +817,6 @@ BEGIN
  IF NEW.address is not NULL THEN
      addr_street := NEW.address->'street';
      addr_place := NEW.address->'place';
-
-      IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
-        i := getorcreate_postcode_id(NEW.address->'postcode');
-      END IF;
  END IF;

  NEW.postcode := null;
--- a/lib-sql/tokenizer/legacy_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer.sql
@ -34,6 +34,13 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;


+CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+  RETURNS TEXT
+AS $$
+  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
 -- Return token info that should be saved permanently in the database.
 CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
  RETURNS JSONB
@ -133,26 +140,26 @@ $$
 LANGUAGE plpgsql;


-CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-  RETURNS INTEGER
+CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+  RETURNS BOOLEAN
  AS $$
 DECLARE
+  r RECORD;
  lookup_token TEXT;
-  lookup_word TEXT;
  return_word_id INTEGER;
 BEGIN
-  lookup_word := upper(trim(postcode));
-  lookup_token := ' ' || make_standard_name(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and word = lookup_word
+  lookup_token := ' ' || make_standard_name(postcode);
+  FOR r IN
+    SELECT word_id FROM word
+    WHERE word_token = lookup_token and word = postcode
          and class='place' and type='postcode'
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
+  LOOP
+    RETURN false;
+  END LOOP;
+
+  INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
                           'place', 'postcode', null, 0);
-  END IF;
-  RETURN return_word_id;
+  RETURN true;
 END;
 $$
 LANGUAGE plpgsql;
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@ -116,7 +116,8 @@ class SetupAll:

        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Calculate postcodes')
-            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
+            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
+                                       tokenizer)

        if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
            if args.continue_at is not None and args.continue_at != 'load-data':
--- a/nominatim/indexer/indexer.py
+++ b/nominatim/indexer/indexer.py
@ -147,7 +147,7 @@ class Indexer:

            if maxrank == 30:
                self._index(runners.RankRunner(0, analyzer))
-                self._index(runners.InterpolationRunner(), 20)
+                self._index(runners.InterpolationRunner(analyzer), 20)
                self._index(runners.RankRunner(30, analyzer), 20)
            else:
                self._index(runners.RankRunner(maxrank, analyzer))
--- a/nominatim/indexer/runners.py
+++ b/nominatim/indexer/runners.py
@ -25,7 +25,7 @@ class AbstractPlacexRunner:
                   SET indexed_status = 0, address = v.addr, token_info = v.ti
                   FROM (VALUES {}) as v(id, addr, ti)
                   WHERE place_id = v.id
-               """.format(','.join(["(%s, %s::hstore, %s::json)"]  * num_places))
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"]  * num_places))


    def index_places(self, worker, places):
@ -82,6 +82,10 @@ class InterpolationRunner:
        location_property_osmline.
    """

+    def __init__(self, analyzer):
+        self.analyzer = analyzer
+
+
    @staticmethod
    def name():
        return "interpolation lines (location_property_osmline)"
@ -93,15 +97,30 @@ class InterpolationRunner:

    @staticmethod
    def sql_get_objects():
-        return """SELECT place_id FROM location_property_osmline
+        return """SELECT place_id, get_interpolation_address(address, osm_id) as address
+                  FROM location_property_osmline
                  WHERE indexed_status > 0
                  ORDER BY geometry_sector"""

+
    @staticmethod
-    def index_places(worker, ids):
-        worker.perform(""" UPDATE location_property_osmline
-                           SET indexed_status = 0 WHERE place_id IN ({})
-                       """.format(','.join((str(i[0]) for i in ids))))
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE location_property_osmline
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"]  * num_places))
+
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)
+


 class PostcodeRunner:
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@ -1,6 +1,7 @@
 """
 Tokenizer implementing normalisation as used before Nominatim 4.
 """
+from collections import OrderedDict
 import logging
 import re
 import shutil
@ -213,6 +214,15 @@ class LegacyNameAnalyzer:
            self.conn.close()
            self.conn = None

+
+    def add_postcodes_from_db(self):
+        """ Add postcodes from the location_postcode table to the word table.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute("""SELECT count(create_postcode_id(pc))
+                           FROM (SELECT distinct(postcode) as pc
+                                 FROM location_postcode) x""")
+
    def process_place(self, place):
        """ Determine tokenizer information about the given place.

@ -226,11 +236,25 @@ class LegacyNameAnalyzer:
        address = place.get('address')

        if address:
+            self._add_postcode(address.get('postcode'))
            token_info.add_housenumbers(self.conn, address)

        return token_info.data


+    def _add_postcode(self, postcode):
+        """ Make sure the normalized postcode is present in the word table.
+        """
+        if not postcode or re.search(r'[:,;]', postcode) is not None:
+            return
+
+        def _create_postcode_from_db(pcode):
+            with self.conn.cursor() as cur:
+                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
+
+        self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+
+
 class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
@ -285,6 +309,32 @@ class _TokenInfo:
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()


+class _LRU:
+    """ Least recently used cache that accepts a generator function to
+        produce the item when there is a cache miss.
+    """
+
+    def __init__(self, maxsize=128):
+        self.data = OrderedDict()
+        self.maxsize = maxsize
+
+    def get(self, key, generator):
+        """ Get the item with the given key from the cache. If nothing
+            is found in the cache, generate the value through the
+            generator function and store it in the cache.
+        """
+        value = self.data.get(key)
+        if value is not None:
+            self.data.move_to_end(key)
+        else:
+            value = generator(key)
+            if len(self.data) >= self.maxsize:
+                self.data.popitem(last=False)
+            self.data[key] = value
+
+        return value
+
+
 class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

@ -292,6 +342,9 @@ class _TokenCache:
        analyzer.
    """
    def __init__(self, conn):
+        # various LRU caches
+        self.postcodes = _LRU(maxsize=32)
+
        # Lookup houseunumbers up to 100 and cache them
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
--- a/nominatim/tools/postcodes.py
+++ b/nominatim/tools/postcodes.py
@ -6,7 +6,7 @@ of artificial postcode centroids.
 from nominatim.db.utils import execute_file
 from nominatim.db.connection import connect

-def import_postcodes(dsn, project_dir):
+def import_postcodes(dsn, project_dir, tokenizer):
    """ Set up the initial list of postcodes.
    """

@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, country_code,
-                       upper(trim (both ' ' from address->'postcode')) as pc,
+                       token_normalized_postcode(address->'postcode') as pc,
                       ST_Centroid(ST_Collect(ST_Centroid(geometry)))
                  FROM placex
-                 WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
+                 WHERE address ? 'postcode'
+                       and token_normalized_postcode(address->'postcode') is not null
                       AND geometry IS NOT null
                 GROUP BY country_code, pc
            """)
@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'us', postcode,
+                SELECT nextval('seq_place'), 1, 'us',
+                       token_normalized_postcode(postcode),
                       ST_SetSRID(ST_Point(x,y),4326)
-                  FROM us_postcode WHERE postcode NOT IN
+                  FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
                        (SELECT postcode FROM location_postcode
                          WHERE country_code = 'us')
            """)
@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
-                  FROM gb_postcode WHERE postcode NOT IN
+                SELECT nextval('seq_place'), 1, 'gb',
+                       token_normalized_postcode(postcode), geometry
+                  FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
                           (SELECT postcode FROM location_postcode
                             WHERE country_code = 'gb')
            """)
@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
                    DELETE FROM word WHERE class='place' and type='postcode'
                    and word NOT IN (SELECT postcode FROM location_postcode)
            """)
-
-            cur.execute("""
-                SELECT count(getorcreate_postcode_id(v)) FROM
-                (SELECT distinct(postcode) as v FROM location_postcode) p
-            """)
        conn.commit()
+
+        with tokenizer.name_analyzer() as analyzer:
+            analyzer.add_postcodes_from_db()
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@ -299,7 +299,7 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):


@pytest.fixture
-def tokenizer_mock(monkeypatch, property_table, temp_db_conn):
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
    """ Sets up the configuration so that the test dummy tokenizer will be
        loaded.
    """
--- a/test/python/dummy_tokenizer.py
+++ b/test/python/dummy_tokenizer.py
@ -43,6 +43,9 @@ class DummyNameAnalyzer:
        """
        pass

+    def add_postcodes_from_db(self):
+        pass
+
    def process_place(self, place):
        """ Determine tokenizer information about the given place.

--- a/test/python/test_indexing.py
+++ b/test/python/test_indexing.py
@ -33,6 +33,9 @@ class IndexerTestDB:
                                                geometry_sector INTEGER)""")
            cur.execute("""CREATE TABLE location_property_osmline (
                               place_id BIGINT,
+                               osm_id BIGINT,
+                               address HSTORE,
+                               token_info JSONB,
                               indexed_status SMALLINT,
                               indexed_date TIMESTAMP,
                               geometry_sector INTEGER)""")
@ -61,6 +64,14 @@ class IndexerTestDB:
                           END;
                           $$ LANGUAGE plpgsql STABLE;
                        """)
+            cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+                           RETURNS HSTORE AS $$
+                           BEGIN
+                             RETURN in_address;
+                           END;
+                           $$ LANGUAGE plpgsql STABLE;
+                        """)
+
            for table in ('placex', 'location_property_osmline', 'location_postcode'):
                cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
                               FOR EACH ROW EXECUTE PROCEDURE date_update()
@ -91,9 +102,9 @@ class IndexerTestDB:
        next_id = next(self.osmline_id)
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO location_property_osmline
-                              (place_id, indexed_status, geometry_sector)
-                              VALUES (%s, 1, %s)""",
-                        (next_id, sector))
+                              (place_id, osm_id, indexed_status, geometry_sector)
+                              VALUES (%s, %s, 1, %s)""",
+                        (next_id, next_id, sector))
        return next_id

    def add_postcode(self, country, postcode):
--- a/test/python/test_tools_postcodes.py
+++ b/test/python/test_tools_postcodes.py
@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
 import pytest

 from nominatim.tools import postcodes
+import dummy_tokenizer
+
+@pytest.fixture
+def tokenizer():
+    return dummy_tokenizer.DummyTokenizer(None, None)

@pytest.fixture
 def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
                      postcode TEXT,
                      geometry GEOMETRY(Geometry, 4326)""")
    temp_db_cursor.execute('CREATE SEQUENCE seq_place')
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-                              RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+                              RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
                           """)


-def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
-    postcodes.import_postcodes(dsn, tmp_path)
+def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)

    assert temp_db_cursor.table_exists('gb_postcode')
    assert temp_db_cursor.table_exists('us_postcode')
    assert temp_db_cursor.table_rows('location_postcode') == 0


-def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
+def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
    temp_db_cursor.execute("""
        INSERT INTO placex (place_id, country_code, address, geometry)
          VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
    """)

-    postcodes.import_postcodes(dsn, tmp_path)
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)

    rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
                                      ST_X(geometry), ST_Y(geometry)