Mirror of https://github.com/osm-search/Nominatim.git
recreate word table when refreshing counts
Recomputing the counts updates a large part of the word table, which leaves the table and its indexes bloated. Recreate the table with the new counts instead and swap it in.
parent 33c0f249b1
commit 81eed0680c
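For context: PostgreSQL keeps the old version of every row an `UPDATE` touches (MVCC), so recomputing counts for most of `word` leaves the table and every one of its indexes bloated until a costly `VACUUM FULL`. The commit therefore drops the static table definition in `tokenizer/icu_tokenizer_tables.sql` (the deleted file in the first hunk) in favour of creating the table from the tokenizer itself, so that the refresh path guarded by `args.word_counts` can rebuild the table and swap it in. A condensed sketch of the two strategies (simplified from the diff below; `cur` stands for an open database cursor, and the real code uses a LEFT JOIN so that words without a new count keep their old `info`):

```python
# Old approach: in-place UPDATE. Every matched row is rewritten, leaving
# a dead tuple in the table and a stale entry in each of its indexes.
cur.execute("""UPDATE word
                  SET info = info || jsonb_build_object('count', wf.count)
                 FROM word_frequencies wf WHERE word_id = wf.id""")

# New approach: write a compact copy with the new counts, index it,
# then swap it in. DDL is transactional in PostgreSQL, so readers
# never observe a missing word table.
cur.execute("""CREATE TABLE tmp_word AS
                 SELECT word_id, word_token, type, word,
                        info || jsonb_build_object('count', wf.count) as info
                   FROM word JOIN word_frequencies wf ON word.word_id = wf.id""")
# ... recreate the word table's indexes on tmp_word here ...
cur.execute("DROP TABLE word")
cur.execute("ALTER TABLE tmp_word RENAME TO word")
```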
@@ -1,40 +0,0 @@
--- SPDX-License-Identifier: GPL-2.0-only
---
--- This file is part of Nominatim. (https://nominatim.org)
---
--- Copyright (C) 2022 by the Nominatim developer community.
--- For a full list of authors see the git log.
-
-DROP TABLE IF EXISTS word;
-CREATE TABLE word (
-      word_id INTEGER,
-      word_token text NOT NULL,
-      type text NOT NULL,
-      word text,
-      info jsonb
-    ) {{db.tablespace.search_data}};
-
-CREATE INDEX idx_word_word_token ON word
-    USING BTREE (word_token) {{db.tablespace.search_index}};
--- Used when updating country names from the boundary relation.
-CREATE INDEX idx_word_country_names ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'C';
--- Used when inserting new postcodes on updates.
-CREATE INDEX idx_word_postcodes ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'P';
--- Used when inserting full words.
-CREATE INDEX idx_word_full_word ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'W';
--- Used when inserting analyzed housenumbers (exclude old-style entries).
-CREATE INDEX idx_word_housenumbers ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'H' and word is not null;
-
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
-
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
@@ -110,7 +110,7 @@ class UpdateRefresh:
 
         if args.word_counts:
             LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics()
+            self._get_tokenizer(args.config).update_statistics(args.config)
 
         if args.address_levels:
             LOG.warning('Updating address levels')
@@ -169,7 +169,7 @@ class SetupAll:
             tokenizer.finalize_import(args.config)
 
         LOG.warning('Recompute word counts')
-        tokenizer.update_statistics()
+        tokenizer.update_statistics(args.config)
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
@@ -90,6 +90,18 @@ class SQLPreprocessor:
         self.env.globals['postgres'] = _setup_postgresql_features(conn)
 
 
+    def run_string(self, conn: Connection, template: str, **kwargs: Any) -> None:
+        """ Execute the given SQL template string on the connection.
+            The keyword arguments may supply additional parameters
+            for preprocessing.
+        """
+        sql = self.env.from_string(template).render(**kwargs)
+
+        with conn.cursor() as cur:
+            cur.execute(sql)
+        conn.commit()
+
+
     def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
         """ Execute the given SQL file on the connection. The keyword arguments
             may supply additional parameters for preprocessing.
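The new `run_string()` helper mirrors the existing `run_sql_file()` but takes the Jinja2 template inline, which is what later allows table and index names to be passed in as parameters. A hypothetical call following the same pattern the tokenizer uses below (`conn` is an open connection, `config` a loaded `Configuration`):

```python
# Render an inline Jinja2 template with a custom parameter and execute
# the result; run_string() commits the transaction when done.
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
                """CREATE INDEX idx_{{table_name}}_word_token
                     ON {{table_name}} USING BTREE (word_token)
                     {{db.tablespace.search_index}}""",
                table_name='tmp_word')
```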
@@ -201,7 +201,7 @@ class AbstractTokenizer(ABC):
 
 
     @abstractmethod
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
         """ Recompute any tokenizer statistics necessary for efficient lookup.
            This function is meant to be called from time to time by the user
            to improve performance. However, the tokenizer must not depend on
@@ -31,6 +31,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
 
+WORD_TYPES = (('country_names', 'C'),
+              ('postcodes', 'P'),
+              ('full_word', 'W'),
+              ('housenumbers', 'H'))
+
 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
     """
@@ -62,7 +67,8 @@ class ICUTokenizer(AbstractTokenizer):
 
         if init_db:
             self.update_sql_functions(config)
-            self._init_db_tables(config)
+            self._setup_db_tables(config, 'word')
+            self._create_base_indices(config, 'word')
 
 
     def init_from_project(self, config: Configuration) -> None:
@@ -80,9 +86,7 @@ class ICUTokenizer(AbstractTokenizer):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        self._create_lookup_indices(config, 'word')
 
 
     def update_sql_functions(self, config: Configuration) -> None:
@@ -100,24 +104,35 @@ class ICUTokenizer(AbstractTokenizer):
         self.init_from_project(config)
 
 
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
-            if conn.table_exists('search_name'):
-                with conn.cursor() as cur:
-                    cur.drop_table("word_frequencies")
-                    LOG.info("Computing word frequencies")
-                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                     SELECT unnest(name_vector) as id, count(*)
-                                     FROM search_name GROUP BY id""")
-                    cur.execute("CREATE INDEX ON word_frequencies(id)")
-                    LOG.info("Update word table with recomputed frequencies")
-                    cur.execute("""UPDATE word
-                                   SET info = info || jsonb_build_object('count', count)
-                                   FROM word_frequencies WHERE word_id = id""")
-                    cur.drop_table("word_frequencies")
+            if not conn.table_exists('search_name'):
+                return
+
+            with conn.cursor() as cur:
+                LOG.info('Computing word frequencies')
+                cur.drop_table('word_frequencies')
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute('CREATE INDEX ON word_frequencies(id)')
+                LOG.info('Update word table with recomputed frequencies')
+                cur.drop_table('tmp_word')
+                cur.execute("""CREATE TABLE tmp_word AS
+                                SELECT word_id, word_token, type, word,
+                                       (CASE WHEN wf.count is null THEN info
+                                          ELSE info || jsonb_build_object('count', wf.count)
+                                          END) as info
+                                FROM word LEFT JOIN word_frequencies wf
+                                     ON word.word_id = wf.id""")
+                cur.drop_table('word_frequencies')
             conn.commit()
+        self._create_base_indices(config, 'tmp_word')
+        self._create_lookup_indices(config, 'tmp_word')
+        self._move_temporary_word_table('tmp_word')
 
 
     def _cleanup_housenumbers(self) -> None:
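A note on the `info || jsonb_build_object('count', wf.count)` expression above: the jsonb `||` operator merges two objects and, on duplicate keys, the right-hand side wins, so only `count` is replaced while every other key in `info` survives. A standalone illustration of this PostgreSQL behaviour (hypothetical cursor `cur`; psycopg2 returns jsonb values as Python dicts):

```python
# jsonb '||' merges two objects; the right side wins on duplicate keys.
cur.execute("""SELECT '{"word": "hello", "count": 1}'::jsonb
                      || jsonb_build_object('count', 42)""")
print(cur.fetchone()[0])  # -> {'word': 'hello', 'count': 42}
```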
@@ -219,16 +234,81 @@ class ICUTokenizer(AbstractTokenizer):
             self.loader.save_config_to_db(conn)
 
 
-    def _init_db_tables(self, config: Configuration) -> None:
+    def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
         """ Set up the word table and fill it with pre-computed word
             frequencies.
         """
         with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table(table_name)
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn, """
+                CREATE TABLE {{table_name}} (
+                      word_id INTEGER,
+                      word_token text NOT NULL,
+                      type text NOT NULL,
+                      word text,
+                      info jsonb
+                    ) {{db.tablespace.search_data}};
+                GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
+
+                DROP SEQUENCE IF EXISTS seq_{{table_name}};
+                CREATE SEQUENCE seq_{{table_name}} start 1;
+                GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
+                """, table_name=table_name)
+
+
+    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
+        """ Set up the base indexes for the word table: the word_token
+            index plus the partial indexes used during data insertion.
+        """
+        with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
+            sqlp.run_string(conn,
+                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
+                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
+                            table_name=table_name)
+            for name, ctype in WORD_TYPES:
+                sqlp.run_string(conn,
+                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
+                                   USING BTREE (word) {{db.tablespace.address_index}}
+                                   WHERE type = '{{column_type}}'
+                                """,
+                                table_name=table_name, idx_name=name,
+                                column_type=ctype)
+
+
+    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
+        """ Create additional indexes used when running the API.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            # Index required for details lookup.
+            sqlp.run_string(conn, """
+                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
+                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
+            """,
+                            table_name=table_name)
+
+
+    def _move_temporary_word_table(self, old: str) -> None:
+        """ Rename all tables and indexes used by the tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
+                cur.execute(f"ALTER TABLE {old} RENAME TO word")
+                for idx in ('word_token', 'word_id'):
+                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
+                                      RENAME TO idx_word_{idx}""")
+                for name, _ in WORD_TYPES:
+                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
+                                      RENAME TO idx_word_{name}""")
+            conn.commit()
 
 
 class ICUNameAnalyzer(AbstractAnalyzer):
     """ The ICU analyzer uses the ICU library for splitting names.
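Note that `_move_temporary_word_table()` also renames all indexes back to the `idx_word_*` scheme before the single `conn.commit()`. That keeps the naming stable, so a later statistics run can build `idx_tmp_word_*` again without collisions; and since PostgreSQL DDL is transactional, concurrent sessions see either the old table or the fully indexed new one, never an in-between state. The essence of the swap as a sketch (hypothetical cursor `cur`, one index shown):

```python
# The whole swap is one transaction: drop old, rename table, rename indexes.
cur.execute("DROP TABLE IF EXISTS word")
cur.execute("ALTER TABLE tmp_word RENAME TO word")
cur.execute("ALTER INDEX idx_tmp_word_word_token RENAME TO idx_word_word_token")
conn.commit()  # the new table becomes visible to other sessions only now
```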
@@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)
 
 
-    def update_statistics(self) -> None:
+    def update_statistics(self, _: Configuration) -> None:
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:
@@ -38,10 +38,10 @@ class DummyTokenizer:
     def finalize_import(self, *args):
         self.finalize_import_called = True
 
-    def update_statistics(self):
+    def update_statistics(self, *args):
         self.update_statistics_called = True
 
-    def update_word_tokens(self):
+    def update_word_tokens(self, *args):
         self.update_word_tokens_called = True
 
@@ -7,7 +7,6 @@
 """
 Tests for ICU tokenizer.
 """
-import shutil
 import yaml
 import itertools
 
@@ -32,8 +31,6 @@ def test_config(project_env, tmp_path):
     sqldir.mkdir()
     (sqldir / 'tokenizer').mkdir()
     (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
-    shutil.copy(str(project_env.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
-                str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
 
     project_env.lib_dir.sql = sqldir
 
@@ -204,16 +201,14 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
 
 def test_finalize_import(tokenizer_factory, temp_db_conn,
                          temp_db_cursor, test_config, sql_preprocessor_cfg):
-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_indices.sql'
-    func_file.write_text("""CREATE FUNCTION test() RETURNS TEXT
-                            AS $$ SELECT 'b'::text $$ LANGUAGE SQL""")
-
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
+    assert not temp_db_conn.index_exists('idx_word_word_id')
+
     tok.finalize_import(test_config)
 
-    temp_db_cursor.scalar('SELECT test()') == 'b'
+    assert temp_db_conn.index_exists('idx_word_word_id')
 
 
 def test_check_database(test_config, tokenizer_factory,
@@ -224,19 +219,20 @@ def test_check_database(test_config, tokenizer_factory,
     assert tok.check_database(test_config) is None
 
 
-def test_update_statistics_reverse_only(word_table, tokenizer_factory):
+def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
     tok = tokenizer_factory()
-    tok.update_statistics()
+    tok.update_statistics(test_config)
 
 
-def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
+def test_update_statistics(word_table, table_factory, temp_db_cursor,
+                           tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
     table_factory('search_name',
                   'place_id BIGINT, name_vector INT[]',
                   [(12, [1000])])
     tok = tokenizer_factory()
 
-    tok.update_statistics()
+    tok.update_statistics(test_config)
 
     assert temp_db_cursor.scalar("""SELECT count(*) FROM word
                                     WHERE type = 'W' and
@@ -238,19 +238,19 @@ def test_check_database_bad_setup(test_config, tokenizer_factory, monkeypatch,
     assert tok.check_database(False) is not None
 
 
-def test_update_statistics_reverse_only(word_table, tokenizer_factory):
+def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
     tok = tokenizer_factory()
-    tok.update_statistics()
+    tok.update_statistics(test_config)
 
 
-def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
+def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
     table_factory('search_name',
                   'place_id BIGINT, name_vector INT[]',
                   [(12, [1000])])
     tok = tokenizer_factory()
 
-    tok.update_statistics()
+    tok.update_statistics(test_config)
 
     assert temp_db_cursor.scalar("""SELECT count(*) FROM word
                                     WHERE word_token like ' %' and