From fbbdd31399da42b94188d9d4aa4f084efd4876a4 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 22 Apr 2021 22:47:34 +0200
Subject: [PATCH] move word table and normalisation SQL into tokenizer

Creating and populating the word table is now the responsibility of the
tokenizer.

The get_maxwordfreq() function has been replaced by a simple template
parameter that is substituted into the SQL when the functions are
installed. The value is read from the property table in the database to
ensure that it does not change after installation.
---
 lib-sql/functions.sql                         |  1 -
 lib-sql/tables.sql                            | 17 --------
 .../legacy_tokenizer.sql}                     |  2 +-
 lib-sql/tokenizer/legacy_tokenizer_tables.sql | 19 ++++++++
 nominatim/clicmd/refresh.py                   |  3 ++
 nominatim/clicmd/setup.py                     | 10 +++--
 nominatim/db/sql_preprocessor.py              |  2 -
 nominatim/tokenizer/legacy_tokenizer.py       | 43 +++++++++++++++++--
 nominatim/tools/database_import.py            | 17 ++------
 nominatim/tools/migration.py                  |  2 +
 test/python/conftest.py                       |  1 -
 test/python/test_cli.py                       | 21 +++++++--
 test/python/test_db_sql_preprocessor.py       |  1 -
 test/python/test_tokenizer_legacy.py          | 25 +++++++++--
 test/python/test_tools_database_import.py     |  6 +--
 15 files changed, 117 insertions(+), 53 deletions(-)
 rename lib-sql/{functions/normalization.sql => tokenizer/legacy_tokenizer.sql} (99%)
 create mode 100644 lib-sql/tokenizer/legacy_tokenizer_tables.sql

diff --git a/lib-sql/functions.sql b/lib-sql/functions.sql
index 750af9f0..e9419ca2 100644
--- a/lib-sql/functions.sql
+++ b/lib-sql/functions.sql
@@ -1,5 +1,4 @@
 {% include('functions/utils.sql') %}
-{% include('functions/normalization.sql') %}
 {% include('functions/ranking.sql') %}
 {% include('functions/importance.sql') %}
 {% include('functions/address_lookup.sql') %}
diff --git a/lib-sql/tables.sql b/lib-sql/tables.sql
index 609472ec..2072accc 100644
--- a/lib-sql/tables.sql
+++ b/lib-sql/tables.sql
@@ -43,22 +43,6 @@ CREATE TABLE nominatim_properties (
 );
 GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";
 
-drop table IF EXISTS word;
-CREATE TABLE word (
-  word_id INTEGER,
-  word_token text,
-  word text,
-  class text,
-  type text,
-  country_code varchar(2),
-  search_name_count INTEGER,
-  operator TEXT
-  ) {{db.tablespace.search_data}};
-CREATE INDEX idx_word_word_token on word USING BTREE (word_token) {{db.tablespace.search_index}};
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}" ;
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;
-
 drop table IF EXISTS location_area CASCADE;
 CREATE TABLE location_area (
   place_id BIGINT,
@@ -178,7 +162,6 @@ DROP SEQUENCE IF EXISTS seq_place;
 CREATE SEQUENCE seq_place start 1;
 GRANT SELECT on placex to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT on place_addressline to "{{config.DATABASE_WEBUSER}}" ;
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT ON planet_osm_ways to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT ON planet_osm_rels to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT on location_area to "{{config.DATABASE_WEBUSER}}" ;
diff --git a/lib-sql/functions/normalization.sql b/lib-sql/tokenizer/legacy_tokenizer.sql
similarity index 99%
rename from lib-sql/functions/normalization.sql
rename to lib-sql/tokenizer/legacy_tokenizer.sql
index c7bd2fc5..d77b519e 100644
--- a/lib-sql/functions/normalization.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer.sql
@@ -38,7 +38,7 @@ BEGIN
     return_word_id := nextval('seq_word');
     INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
   ELSE
-    IF count > get_maxwordfreq() THEN
+    IF count > {{ max_word_freq }} THEN
       return_word_id := NULL;
     END IF;
   END IF;
diff --git a/lib-sql/tokenizer/legacy_tokenizer_tables.sql b/lib-sql/tokenizer/legacy_tokenizer_tables.sql
new file mode 100644
index 00000000..3410b763
--- /dev/null
+++ b/lib-sql/tokenizer/legacy_tokenizer_tables.sql
@@ -0,0 +1,19 @@
+DROP TABLE IF EXISTS word;
+CREATE TABLE word (
+  word_id INTEGER,
+  word_token text NOT NULL,
+  word text,
+  class text,
+  type text,
+  country_code varchar(2),
+  search_name_count INTEGER,
+  operator TEXT
+) {{db.tablespace.search_data}};
+
+CREATE INDEX idx_word_word_token ON word
+  USING BTREE (word_token) {{db.tablespace.search_index}};
+GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+DROP SEQUENCE IF EXISTS seq_word;
+CREATE SEQUENCE seq_word start 1;
+GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index ddc00d49..e6e74912 100644
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -46,6 +46,7 @@ class UpdateRefresh:
     @staticmethod
     def run(args):
         from ..tools import refresh
+        from ..tokenizer import factory as tokenizer_factory
 
         if args.postcodes:
             LOG.warning("Update postcodes centroid")
@@ -66,6 +67,8 @@ class UpdateRefresh:
             with connect(args.config.get_libpq_dsn()) as conn:
                 refresh.create_functions(conn, args.config,
                                          args.diffs, args.enable_debug_statements)
+                tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+                tokenizer.update_sql_functions(args.config)
 
         if args.wiki_data:
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH
diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py
index 066c2960..d8d953e3 100644
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -100,15 +100,19 @@ class SetupAll:
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Initialise tables')
             with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
+                database_import.truncate_data_tables(conn)
 
             LOG.warning('Load data into placex table')
             database_import.load_data(args.config.get_libpq_dsn(),
-                                      args.data_dir,
                                       args.threads or psutil.cpu_count() or 1)
 
         LOG.warning("Setting up tokenizer")
-        tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        if args.continue_at is None or args.continue_at == 'load-data':
+            # (re)initialise the tokenizer data
+            tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        else:
+            # just load the tokenizer
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
 
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Calculate postcodes')
diff --git a/nominatim/db/sql_preprocessor.py b/nominatim/db/sql_preprocessor.py
index 9e0b2912..dafc5de4 100644
--- a/nominatim/db/sql_preprocessor.py
+++ b/nominatim/db/sql_preprocessor.py
@@ -89,8 +89,6 @@ class SQLPreprocessor:
         self.env.globals['db'] = db_info
         self.env.globals['sql'] = _setup_postgres_sql(conn)
         self.env.globals['postgres'] = _setup_postgresql_features(conn)
-        self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
-                                         str((config.project_dir / 'module').resolve())
 
 
     def run_sql_file(self, conn, name, **kwargs):
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index 2e05ce54..d0a404b9 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -8,9 +8,12 @@ import psycopg2
 
 from nominatim.db.connection import connect
 from nominatim.db import properties
+from nominatim.db import utils as db_utils
+from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
 
 DBCFG_NORMALIZATION = "tokenizer_normalization"
+DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 
 
 LOG = logging.getLogger()
@@ -53,6 +56,9 @@ def _install_module(config_module_path, src_dir, module_dir):
 
 
 def _check_module(module_dir, conn):
+    """ Try to use the PostgreSQL module to confirm that it is correctly
+        installed and accessible from PostgreSQL.
+    """
     with conn.cursor() as cur:
         try:
             cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
@@ -91,7 +97,11 @@ class LegacyTokenizer:
 
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
-            self._save_config(conn)
+            self._save_config(conn, config)
+            conn.commit()
+
+        self.update_sql_functions(config)
+        self._init_db_tables(config)
 
 
     def init_from_project(self):
@@ -101,6 +111,19 @@ class LegacyTokenizer:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
 
+    def update_sql_functions(self, config):
+        """ Reimport the SQL functions for this tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
+            modulepath = config.DATABASE_MODULE_PATH or \
+                         str((config.project_dir / 'module').resolve())
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
+                              max_word_freq=max_word_freq,
+                              modulepath=modulepath)
+
+
     def migrate_database(self, config):
         """ Initialise the project directory of an existing database for
             use with this tokenizer.
@@ -114,11 +137,25 @@ class LegacyTokenizer:
 
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
-            self._save_config(conn)
+            self._save_config(conn, config)
 
 
-    def _save_config(self, conn):
+    def _init_db_tables(self, config):
+        """ Set up the word table and fill it with pre-computed word
+            frequencies.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            conn.commit()
+
+        LOG.warning("Precomputing word tokens")
+        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
+
+
+    def _save_config(self, conn, config):
         """ Save the configuration that needs to remain stable for the given
             database as database properties.
         """
         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py
index 324a82cf..400ce7a5 100644
--- a/nominatim/tools/database_import.py
+++ b/nominatim/tools/database_import.py
@@ -160,11 +160,10 @@ def create_partition_tables(conn, config):
     sql.run_sql_file(conn, 'partition-tables.src.sql')
 
 
-def truncate_data_tables(conn, max_word_frequency=None):
+def truncate_data_tables(conn):
    """ Truncate all data tables to prepare for a fresh load.
    """
    with conn.cursor() as cur:
-        cur.execute('TRUNCATE word')
         cur.execute('TRUNCATE placex')
         cur.execute('TRUNCATE place_addressline')
         cur.execute('TRUNCATE location_area')
@@ -183,23 +182,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
     for table in [r[0] for r in list(cur)]:
         cur.execute('TRUNCATE ' + table)
 
-    if max_word_frequency is not None:
-        # Used by getorcreate_word_id to ignore frequent partial words.
-        cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
-                       RETURNS integer AS $$
-                         SELECT {} as maxwordfreq;
-                       $$ LANGUAGE SQL IMMUTABLE
-                    """.format(max_word_frequency))
-        conn.commit()
+    conn.commit()
 
 _COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
 
-def load_data(dsn, data_dir, threads):
+def load_data(dsn, threads):
     """ Copy data into the word and placex table.
     """
-    # Pre-calculate the most important terms in the word list.
-    db_utils.execute_file(dsn, data_dir / 'words.sql')
-
     sel = selectors.DefaultSelector()
     # Then copy data from place to placex in chunks.
     place_threads = max(1, threads - 1)
diff --git a/nominatim/tools/migration.py b/nominatim/tools/migration.py
index 2ca8324b..80539702 100644
--- a/nominatim/tools/migration.py
+++ b/nominatim/tools/migration.py
@@ -49,6 +49,8 @@ def migrate(config, paths):
         if has_run_migration:
             LOG.warning('Updating SQL functions.')
             refresh.create_functions(conn, config)
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+            tokenizer.update_sql_functions(config)
 
         properties.set_property(conn, 'database_version',
                                 '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
diff --git a/test/python/conftest.py b/test/python/conftest.py
index d0fdc569..b55c206d 100644
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -286,7 +286,6 @@ def osm2pgsql_options(temp_db):
 
 @pytest.fixture
 def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
     table_factory('country_name', 'partition INT', (0, 1, 2))
     cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
     cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
diff --git a/test/python/test_cli.py b/test/python/test_cli.py
index 10a31542..1d4aefc7 100644
--- a/test/python/test_cli.py
+++ b/test/python/test_cli.py
@@ -139,7 +139,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
-        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
+        mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
@@ -161,7 +161,7 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
-        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
+        mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
 
@@ -242,7 +242,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
                           ('postcodes', 'update_postcodes'),
                           ('word-counts', 'recompute_word_counts'),
                           ('address-levels', 'load_address_levels_from_file'),
-                          ('functions', 'create_functions'),
                           ('wiki-data', 'import_wikipedia_articles'),
                           ('importance', 'recompute_importance'),
                           ('website', 'setup_website'),
@@ -254,6 +253,22 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
     assert func_mock.called == 1
 
 
+def test_refresh_create_functions(mock_func_factory, monkeypatch, temp_db):
+    class DummyTokenizer:
+        def update_sql_functions(self, *args):
+            self.called = True
+
+    func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
+    tok = DummyTokenizer()
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
+                        lambda *args: tok)
+
+
+    assert 0 == call_nominatim('refresh', '--functions')
+    assert func_mock.called == 1
+    assert hasattr(tok, 'called')
+
+
 def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
     calls = []
     monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
diff --git a/test/python/test_db_sql_preprocessor.py b/test/python/test_db_sql_preprocessor.py
index 08a195bd..6a254ef3 100644
--- a/test/python/test_db_sql_preprocessor.py
+++ b/test/python/test_db_sql_preprocessor.py
@@ -24,7 +24,6 @@ def sql_factory(tmp_path):
                              ("'{{db.partitions|join}}'", '012'),
                              ("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
                              ("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
-                             ("'{{config.DATABASE_MODULE_PATH}}'", '.')
                          ])
 def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
     sqlfile = sql_factory("RETURN {};".format(expr))
diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py
index 44937904..95b49be3 100644
--- a/test/python/test_tokenizer_legacy.py
+++ b/test/python/test_tokenizer_legacy.py
@@ -1,6 +1,8 @@
 """
 Test for legacy tokenizer.
 """
+import shutil
+
 import pytest
 
 from nominatim.tokenizer import legacy_tokenizer
@@ -18,6 +20,18 @@ def test_config(def_config, tmp_path):
 
     def_config.lib_dir.module = module_dir
 
+    sqldir = tmp_path / 'sql'
+    sqldir.mkdir()
+    (sqldir / 'tokenizer').mkdir()
+    (sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'words.sql').write_text("SELECT 'a'")
+    shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
+                str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
+
+    def_config.lib_dir.sql = sqldir
+    def_config.lib_dir.data = sqldir
+
+
     return def_config
 
 
@@ -30,13 +44,15 @@ def tokenizer_factory(dsn, tmp_path, monkeypatch):
 
     return _maker
 
 
 @pytest.fixture
-def tokenizer_setup(tokenizer_factory, test_config, property_table, monkeypatch):
+def tokenizer_setup(tokenizer_factory, test_config, property_table,
+                    monkeypatch, sql_preprocessor):
     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
 
-def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
+def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch,
+                  temp_db_conn, sql_preprocessor):
     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
@@ -52,7 +68,8 @@ def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, t
     assert outfile.stat().st_mode == 33261
 
 
-def test_init_module_load_failed(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
+def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
+                                 monkeypatch, temp_db_conn):
     tok = tokenizer_factory()
 
     with pytest.raises(UsageError):
@@ -60,7 +77,7 @@ def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
 
 
 def test_init_module_custom(tokenizer_factory, test_config, property_table,
-                            monkeypatch, tmp_path):
+                            monkeypatch, tmp_path, sql_preprocessor):
     module_dir = (tmp_path / 'custom').resolve()
     module_dir.mkdir()
     (module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')
diff --git a/test/python/test_tools_database_import.py b/test/python/test_tools_database_import.py
index 7ae19e2a..e31017e8 100644
--- a/test/python/test_tools_database_import.py
+++ b/test/python/test_tools_database_import.py
@@ -138,14 +138,14 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
 
 
 def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
-    tables = ('word', 'placex', 'place_addressline', 'location_area',
+    tables = ('placex', 'place_addressline', 'location_area',
               'location_area_country', 'location_property_tiger',
               'location_property_osmline', 'location_postcode', 'search_name',
              'location_road_23')
     for table in tables:
         table_factory(table, content=(1, 2, 3))
 
-    database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)
+    database_import.truncate_data_tables(temp_db_conn)
 
     for table in tables:
         assert temp_db_cursor.table_rows(table) == 0
@@ -163,7 +163,7 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
     place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
               geom='SRID=4326;LINESTRING(0 0, 10 10)')
 
-    database_import.load_data(dsn, src_dir / 'data', threads)
+    database_import.load_data(dsn, threads)
 
     assert temp_db_cursor.table_rows('placex') == 30
     assert temp_db_cursor.table_rows('location_property_osmline') == 1
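
Editor's note on the template mechanism (not part of the patch): SQLPreprocessor wraps a Jinja2 environment over the lib-sql tree, so the max_word_freq keyword passed to run_sql_file() in update_sql_functions() becomes an ordinary template variable when tokenizer/legacy_tokenizer.sql is rendered. A minimal, self-contained sketch of that substitution using plain jinja2 — the property name mirrors DBCFG_MAXWORDFREQ from the patch, while the value 50000 is a made-up example:

    import jinja2

    # Stand-in for the database lookup; the real code calls
    # properties.get_property(conn, DBCFG_MAXWORDFREQ) against nominatim_properties.
    stored_properties = {'tokenizer_maxwordfreq': '50000'}

    sql_template = """
        IF count > {{ max_word_freq }} THEN
          return_word_id := NULL;
        END IF;
    """

    sql = jinja2.Template(sql_template).render(
        max_word_freq=stored_properties['tokenizer_maxwordfreq'])
    print(sql)  # the cutoff is now a literal in the SQL installed into PostgreSQL

Because the cutoff is baked into the installed function body as a literal instead of being fetched from get_maxwordfreq() at run time, changing NOMINATIM_MAX_WORD_FREQUENCY in the configuration has no effect on an existing database until the stored property itself is changed.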
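
That stability rests on the property round-trip: init_new_db() writes MAX_WORD_FREQUENCY once via _save_config(), and every later update_sql_functions() reads it back. A hypothetical sketch of such helpers — the real implementations live in nominatim/db/properties.py (not shown in this patch), and the (property, value) column layout of nominatim_properties is assumed here:

    import psycopg2

    def set_property(conn, name, value):
        # Update first; insert only if the property did not exist yet.
        with conn.cursor() as cur:
            cur.execute('UPDATE nominatim_properties SET value = %s WHERE property = %s',
                        (value, name))
            if cur.rowcount == 0:
                cur.execute('INSERT INTO nominatim_properties (property, value) VALUES (%s, %s)',
                            (name, value))

    def get_property(conn, name):
        # Return the stored value, or None if the property was never set.
        with conn.cursor() as cur:
            cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                        (name,))
            row = cur.fetchone()
        return row[0] if row else None

    conn = psycopg2.connect('dbname=nominatim')             # illustrative DSN
    set_property(conn, 'tokenizer_maxwordfreq', '50000')    # done once at import
    conn.commit()
    print(get_property(conn, 'tokenizer_maxwordfreq'))      # read back on every refresh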