From d74ae669e3cfd9d91b7ec6ef373bd5e9643ee148 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 17 Apr 2021 11:07:04 +0200 Subject: [PATCH 1/2] add support index when continuing import at index phase Indexing scans the placex table sequentially during indexing on the initial import. That is okay because we know that all rows need to be processed anyway. When continuing the import, however, a large part might already be indexed, so that the process spends a lot of time going through rows that are no longer of interest. Create a supporting index for all unindexed rows to speed up the scan. This is the same index as used later for updates. --- nominatim/clicmd/setup.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index fe7c8dc1..fb7abdec 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -105,11 +105,11 @@ class SetupAll: LOG.error('Wikipedia importance dump file not found. 
' 'Will be using default importances.') + if args.continue_at is None or args.continue_at == 'load-data': LOG.warning('Initialise tables') with connect(args.config.get_libpq_dsn()) as conn: database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY) - if args.continue_at is None or args.continue_at == 'load-data': LOG.warning('Load data into placex table') database_import.load_data(args.config.get_libpq_dsn(), args.data_dir, @@ -119,6 +119,9 @@ class SetupAll: postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir) if args.continue_at is None or args.continue_at in ('load-data', 'indexing'): + if args.continue_at is not None and args.continue_at != 'load-data': + with connect(args.config.get_libpq_dsn()) as conn: + SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX) LOG.warning('Indexing places') indexer = Indexer(args.config.get_libpq_dsn(), args.threads or psutil.cpu_count() or 1) @@ -148,3 +151,25 @@ class SetupAll: '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION)) return 0 + + + @staticmethod + def _create_pending_index(conn, tablespace): + """ Add a supporting index for finding places still to be indexed. + + This index is normally created at the end of the import process + for later updates. When indexing was partially done, then this + index can greatly improve speed going through already indexed data. 
+ """ + if conn.index_exists('idx_placex_pendingsector'): + return + + with conn.cursor() as cur: + LOG.warning('Creating support index') + if tablespace: + tablespace = 'TABLESPACE ' + tablespace + cur.execute("""CREATE INDEX idx_placex_pendingsector + ON placex USING BTREE (rank_address,geometry_sector) + {} WHERE indexed_status > 0 + """.format(tablespace)) + conn.commit() From 2ca11ccc6baa9ae49145b00392f42ff0a8f19017 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 17 Apr 2021 11:10:36 +0200 Subject: [PATCH 2/2] add tests for continuing import --- test/python/test_cli.py | 57 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 38bbaefe..afa01e57 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -49,6 +49,7 @@ def mock_run_legacy(monkeypatch): def mock_func_factory(monkeypatch): def get_mock(module, func): mock = MockParamCapture() + mock.func_name = func monkeypatch.setattr(module, func, mock) return mock @@ -110,7 +111,61 @@ def test_import_full(temp_db, mock_func_factory): assert cf_mock.called > 1 for mock in mocks: - assert mock.called == 1 + assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) + + +def test_import_continue_load_data(temp_db, mock_func_factory): + mocks = [ + mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'), + mock_func_factory(nominatim.tools.database_import, 'load_data'), + mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), + mock_func_factory(nominatim.tools.database_import, 'create_country_names'), + mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'), + mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'), + mock_func_factory(nominatim.tools.refresh, 'setup_website'), + mock_func_factory(nominatim.db.properties, 'set_property') + ] + + assert 0 == call_nominatim('import', '--continue', 'load-data') + + for 
mock in mocks: + assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) + + +def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn): + mocks = [ + mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), + mock_func_factory(nominatim.tools.database_import, 'create_country_names'), + mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'), + mock_func_factory(nominatim.tools.refresh, 'setup_website'), + mock_func_factory(nominatim.db.properties, 'set_property') + ] + + assert 0 == call_nominatim('import', '--continue', 'indexing') + + for mock in mocks: + assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) + + assert temp_db_conn.index_exists('idx_placex_pendingsector') + + # Calling it again still works for the index + assert 0 == call_nominatim('import', '--continue', 'indexing') + assert temp_db_conn.index_exists('idx_placex_pendingsector') + + +def test_import_continue_postprocess(temp_db, mock_func_factory): + mocks = [ + mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), + mock_func_factory(nominatim.tools.database_import, 'create_country_names'), + mock_func_factory(nominatim.tools.refresh, 'setup_website'), + mock_func_factory(nominatim.db.properties, 'set_property') + ] + + assert 0 == call_nominatim('import', '--continue', 'db-postprocess') + + for mock in mocks: + assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) + def test_freeze_command(mock_func_factory, temp_db): mock_drop = mock_func_factory(nominatim.tools.freeze, 'drop_update_tables')