move word table and normalisation SQL into tokenizer

Creating and populating the word table is now the responsibility
of the tokenizer.

The get_maxwordfreq() function has been replaced with a simple
template parameter that is substituted into the SQL when the
functions are installed. The value is read from the property
table in the database to ensure that it cannot change after
installation (a condensed sketch of the new flow follows the
change summary below).
Sarah Hoffmann 2021-04-22 22:47:34 +02:00
parent b5540dc35c
commit fbbdd31399
15 changed files with 117 additions and 53 deletions
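For orientation, here is a condensed sketch of the new split of responsibilities, assembled from the legacy_tokenizer.py hunks below. Names match the diff; connection handling, the modulepath parameter and error paths are simplified, so treat it as an outline rather than the actual implementation.

# Condensed sketch assembled from the legacy_tokenizer.py hunks below;
# connection handling and error paths are simplified.
from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

def init_db_tables(dsn, config):
    # The tokenizer now owns the word table: create it and fill it
    # with the pre-computed word frequencies from words.sql.
    with connect(dsn) as conn:
        SQLPreprocessor(conn, config).run_sql_file(
            conn, 'tokenizer/legacy_tokenizer_tables.sql')
        conn.commit()
    db_utils.execute_file(dsn, config.lib_dir.data / 'words.sql')

def update_sql_functions(dsn, config):
    # The frequency limit is read back from the database properties,
    # not from the current configuration, so it stays fixed after import.
    with connect(dsn) as conn:
        max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                          max_word_freq=max_word_freq)

Because the limit is frozen into the properties at import time, a later "nominatim refresh --functions" run re-renders the SQL with the stored value rather than with whatever config.MAX_WORD_FREQUENCY is currently set to.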

View File

@@ -1,5 +1,4 @@
{% include('functions/utils.sql') %}
{% include('functions/normalization.sql') %}
{% include('functions/ranking.sql') %}
{% include('functions/importance.sql') %}
{% include('functions/address_lookup.sql') %}

View File

@@ -43,22 +43,6 @@ CREATE TABLE nominatim_properties (
);
GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";
drop table IF EXISTS word;
CREATE TABLE word (
word_id INTEGER,
word_token text,
word text,
class text,
type text,
country_code varchar(2),
search_name_count INTEGER,
operator TEXT
) {{db.tablespace.search_data}};
CREATE INDEX idx_word_word_token on word USING BTREE (word_token) {{db.tablespace.search_index}};
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}" ;
DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
drop table IF EXISTS location_area CASCADE;
CREATE TABLE location_area (
place_id BIGINT,
@@ -178,7 +162,6 @@ DROP SEQUENCE IF EXISTS seq_place;
CREATE SEQUENCE seq_place start 1;
GRANT SELECT on placex to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT on place_addressline to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT ON planet_osm_ways to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT ON planet_osm_rels to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT on location_area to "{{config.DATABASE_WEBUSER}}" ;

View File

@@ -38,7 +38,7 @@ BEGIN
return_word_id := nextval('seq_word');
INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
ELSE
IF count > get_maxwordfreq() THEN
IF count > {{ max_word_freq }} THEN
return_word_id := NULL;
END IF;
END IF;

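Illustration only (not part of the commit): SQLPreprocessor renders the SQL files as Jinja2 templates (note the {{ }} / {% %} syntax throughout), so the templated check above becomes a plain constant at installation time. The limit of 50000 below is an assumed value, and jinja2 is used directly only for this sketch.

# Illustration only: the real code goes through SQLPreprocessor; the
# limit value of 50000 is just an example.
import jinja2

line = "IF count > {{ max_word_freq }} THEN"
print(jinja2.Template(line).render(max_word_freq=50000))
# prints: IF count > 50000 THEN
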
View File

@@ -0,0 +1,19 @@
DROP TABLE IF EXISTS word;
CREATE TABLE word (
word_id INTEGER,
word_token text NOT NULL,
word text,
class text,
type text,
country_code varchar(2),
search_name_count INTEGER,
operator TEXT
) {{db.tablespace.search_data}};
CREATE INDEX idx_word_word_token ON word
USING BTREE (word_token) {{db.tablespace.search_index}};
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";

View File

@@ -46,6 +46,7 @@ class UpdateRefresh:
@staticmethod
def run(args):
from ..tools import refresh
from ..tokenizer import factory as tokenizer_factory
if args.postcodes:
LOG.warning("Update postcodes centroid")
@@ -66,6 +67,8 @@ class UpdateRefresh:
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
tokenizer.update_sql_functions(args.config)
if args.wiki_data:
data_path = Path(args.config.WIKIPEDIA_DATA_PATH

View File

@@ -100,15 +100,19 @@ class SetupAll:
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Initialise tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
database_import.truncate_data_tables(conn)
LOG.warning('Load data into placex table')
database_import.load_data(args.config.get_libpq_dsn(),
args.data_dir,
args.threads or psutil.cpu_count() or 1)
LOG.warning("Setting up tokenizer")
tokenizer = tokenizer_factory.create_tokenizer(args.config)
if args.continue_at is None or args.continue_at == 'load-data':
# (re)initialise the tokenizer data
tokenizer = tokenizer_factory.create_tokenizer(args.config)
else:
# just load the tokenizer
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')

View File

@@ -89,8 +89,6 @@ class SQLPreprocessor:
self.env.globals['db'] = db_info
self.env.globals['sql'] = _setup_postgres_sql(conn)
self.env.globals['postgres'] = _setup_postgresql_features(conn)
self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
def run_sql_file(self, conn, name, **kwargs):

View File

@@ -8,9 +8,12 @@ import psycopg2
from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger()
@@ -53,6 +56,9 @@ def _install_module(config_module_path, src_dir, module_dir):
def _check_module(module_dir, conn):
""" Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL.
"""
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
@@ -91,7 +97,11 @@ class LegacyTokenizer:
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn)
self._save_config(conn, config)
conn.commit()
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self):
@@ -101,6 +111,19 @@ class LegacyTokenizer:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
def update_sql_functions(self, config):
""" Reimport the SQL functions for this tokenizer.
"""
with connect(self.dsn) as conn:
max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
modulepath = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
max_word_freq=max_word_freq,
modulepath=modulepath)
def migrate_database(self, config):
""" Initialise the project directory of an existing database for
use with this tokenizer.
@@ -114,11 +137,25 @@ class LegacyTokenizer:
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn)
self._save_config(conn, config)
def _save_config(self, conn):
def _init_db_tables(self, config):
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")
db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn, config):
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)

View File

@@ -160,11 +160,10 @@ def create_partition_tables(conn, config):
sql.run_sql_file(conn, 'partition-tables.src.sql')
def truncate_data_tables(conn, max_word_frequency=None):
def truncate_data_tables(conn):
""" Truncate all data tables to prepare for a fresh load.
"""
with conn.cursor() as cur:
cur.execute('TRUNCATE word')
cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area')
@@ -183,23 +182,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table)
if max_word_frequency is not None:
# Used by getorcreate_word_id to ignore frequent partial words.
cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
RETURNS integer AS $$
SELECT {} as maxwordfreq;
$$ LANGUAGE SQL IMMUTABLE
""".format(max_word_frequency))
conn.commit()
conn.commit()
_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
def load_data(dsn, data_dir, threads):
def load_data(dsn, threads):
""" Copy data into the word and placex table.
"""
# Pre-calculate the most important terms in the word list.
db_utils.execute_file(dsn, data_dir / 'words.sql')
sel = selectors.DefaultSelector()
# Then copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1)

View File

@@ -49,6 +49,8 @@ def migrate(config, paths):
if has_run_migration:
LOG.warning('Updating SQL functions.')
refresh.create_functions(conn, config)
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
tokenizer.update_sql_functions(config)
properties.set_property(conn, 'database_version',
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))

View File

@@ -286,7 +286,6 @@ def osm2pgsql_options(temp_db):
@pytest.fixture
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
table_factory('country_name', 'partition INT', (0, 1, 2))
cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',

View File

@@ -139,7 +139,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
mock_func_factory(nominatim.tools.refresh, 'setup_website'),
mock_func_factory(nominatim.db.properties, 'set_property')
]
@@ -161,7 +161,7 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
mock_func_factory(nominatim.tools.refresh, 'setup_website'),
mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
mock_func_factory(nominatim.db.properties, 'set_property')
]
@@ -242,7 +242,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
('postcodes', 'update_postcodes'),
('word-counts', 'recompute_word_counts'),
('address-levels', 'load_address_levels_from_file'),
('functions', 'create_functions'),
('wiki-data', 'import_wikipedia_articles'),
('importance', 'recompute_importance'),
('website', 'setup_website'),
@@ -254,6 +253,22 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
assert func_mock.called == 1
def test_refresh_create_functions(mock_func_factory, monkeypatch, temp_db):
class DummyTokenizer:
def update_sql_functions(self, *args):
self.called = True
func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
tok = DummyTokenizer()
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' ,
lambda *args: tok)
assert 0 == call_nominatim('refresh', '--functions')
assert func_mock.called == 1
assert hasattr(tok, 'called')
def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
calls = []
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',

View File

@@ -24,7 +24,6 @@ def sql_factory(tmp_path):
("'{{db.partitions|join}}'", '012'),
("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
("'{{config.DATABASE_MODULE_PATH}}'", '.')
])
def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
sqlfile = sql_factory("RETURN {};".format(expr))

View File

@@ -1,6 +1,8 @@
"""
Test for legacy tokenizer.
"""
import shutil
import pytest
from nominatim.tokenizer import legacy_tokenizer
@@ -18,6 +20,18 @@ def test_config(def_config, tmp_path):
def_config.lib_dir.module = module_dir
sqldir = tmp_path / 'sql'
sqldir.mkdir()
(sqldir / 'tokenizer').mkdir()
(sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
(sqldir / 'words.sql').write_text("SELECT 'a'")
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
def_config.lib_dir.sql = sqldir
def_config.lib_dir.data = sqldir
return def_config
@@ -30,13 +44,15 @@ def tokenizer_factory(dsn, tmp_path, monkeypatch):
return _maker
@pytest.fixture
def tokenizer_setup(tokenizer_factory, test_config, property_table, monkeypatch):
def tokenizer_setup(tokenizer_factory, test_config, property_table,
monkeypatch, sql_preprocessor):
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
tok = tokenizer_factory()
tok.init_new_db(test_config)
def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch,
temp_db_conn, sql_preprocessor):
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
@@ -52,7 +68,8 @@ def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, t
assert outfile.stat().st_mode == 33261
def test_init_module_load_failed(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
monkeypatch, temp_db_conn):
tok = tokenizer_factory()
with pytest.raises(UsageError):
@@ -60,7 +77,7 @@ def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
def test_init_module_custom(tokenizer_factory, test_config, property_table,
monkeypatch, tmp_path):
monkeypatch, tmp_path, sql_preprocessor):
module_dir = (tmp_path / 'custom').resolve()
module_dir.mkdir()
(module_dir / 'nominatim.so').write_text('CUSTOM nominatim.so')

View File

@@ -138,14 +138,14 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
tables = ('word', 'placex', 'place_addressline', 'location_area',
tables = ('placex', 'place_addressline', 'location_area',
'location_area_country',
'location_property_tiger', 'location_property_osmline',
'location_postcode', 'search_name', 'location_road_23')
for table in tables:
table_factory(table, content=(1, 2, 3))
database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)
database_import.truncate_data_tables(temp_db_conn)
for table in tables:
assert temp_db_cursor.table_rows(table) == 0
@@ -163,7 +163,7 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
geom='SRID=4326;LINESTRING(0 0, 10 10)')
database_import.load_data(dsn, src_dir / 'data', threads)
database_import.load_data(dsn, threads)
assert temp_db_cursor.table_rows('placex') == 30
assert temp_db_cursor.table_rows('location_property_osmline') == 1