mirror of https://github.com/osm-search/Nominatim.git (synced 2024-12-23 21:14:11 +03:00)
move word table and normalisation SQL into tokenizer
Creating and populating the word table is now the responsibility of the tokenizer. The get_maxwordfreq() function has been replaced with a simple template parameter that is substituted into the SQL when the functions are installed. The value is taken from the property table in the database to ensure that it does not change after installation.
parent b5540dc35c
commit fbbdd31399
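To illustrate the mechanism this commit introduces, here is a minimal, self-contained sketch of how a frequency limit stored as a database property can be rendered into the SQL at function-installation time. It assumes only the Jinja2 library (which the SQLPreprocessor in this diff uses for the {% include %} and {{ }} syntax); get_max_word_freq, render_sql and the abbreviated SQL snippet are illustrative stand-ins, not the actual Nominatim code.

# Sketch only: stands in for SQLPreprocessor.run_sql_file(conn, ...,
# max_word_freq=...) as called by LegacyTokenizer.update_sql_functions below.
import jinja2

# Abbreviated from the getorcreate_word_id change in this diff: the former
# get_maxwordfreq() call becomes a template placeholder.
SQL_TEMPLATE = """\
IF count > {{ max_word_freq }} THEN
    return_word_id := NULL;
END IF;
"""

def get_max_word_freq() -> int:
    # Illustrative stand-in for properties.get_property(conn, DBCFG_MAXWORDFREQ),
    # which reads the value saved in the database at import time.
    return 50000

def render_sql() -> str:
    """Bake the frequency limit into the SQL before installing the function."""
    return jinja2.Template(SQL_TEMPLATE).render(max_word_freq=get_max_word_freq())

if __name__ == '__main__':
    print(render_sql())

Because the value is rendered into the function body rather than looked up via get_maxwordfreq() at query time, reinstalling the functions with an unchanged property leaves behaviour identical, and the limit cannot drift after the initial import.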
@@ -1,5 +1,4 @@
 {% include('functions/utils.sql') %}
-{% include('functions/normalization.sql') %}
 {% include('functions/ranking.sql') %}
 {% include('functions/importance.sql') %}
 {% include('functions/address_lookup.sql') %}
@@ -43,22 +43,6 @@ CREATE TABLE nominatim_properties (
 );
 GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";
 
-drop table IF EXISTS word;
-CREATE TABLE word (
-  word_id INTEGER,
-  word_token text,
-  word text,
-  class text,
-  type text,
-  country_code varchar(2),
-  search_name_count INTEGER,
-  operator TEXT
-  ) {{db.tablespace.search_data}};
-CREATE INDEX idx_word_word_token on word USING BTREE (word_token) {{db.tablespace.search_index}};
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;
-
 drop table IF EXISTS location_area CASCADE;
 CREATE TABLE location_area (
   place_id BIGINT,
@@ -178,7 +162,6 @@ DROP SEQUENCE IF EXISTS seq_place;
 CREATE SEQUENCE seq_place start 1;
 GRANT SELECT on placex to "{{config.DATABASE_WEBUSER}}";
 GRANT SELECT on place_addressline to "{{config.DATABASE_WEBUSER}}";
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
 GRANT SELECT ON planet_osm_ways to "{{config.DATABASE_WEBUSER}}";
 GRANT SELECT ON planet_osm_rels to "{{config.DATABASE_WEBUSER}}";
 GRANT SELECT on location_area to "{{config.DATABASE_WEBUSER}}";
@@ -38,7 +38,7 @@ BEGIN
     return_word_id := nextval('seq_word');
     INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
   ELSE
-    IF count > get_maxwordfreq() THEN
+    IF count > {{ max_word_freq }} THEN
      return_word_id := NULL;
    END IF;
  END IF;
lib-sql/tokenizer/legacy_tokenizer_tables.sql (new file, 19 lines)
@@ -0,0 +1,19 @@
+DROP TABLE IF EXISTS word;
+CREATE TABLE word (
+  word_id INTEGER,
+  word_token text NOT NULL,
+  word text,
+  class text,
+  type text,
+  country_code varchar(2),
+  search_name_count INTEGER,
+  operator TEXT
+) {{db.tablespace.search_data}};
+
+CREATE INDEX idx_word_word_token ON word
+    USING BTREE (word_token) {{db.tablespace.search_index}};
+GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+DROP SEQUENCE IF EXISTS seq_word;
+CREATE SEQUENCE seq_word start 1;
+GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
@@ -46,6 +46,7 @@ class UpdateRefresh:
     @staticmethod
     def run(args):
         from ..tools import refresh
+        from ..tokenizer import factory as tokenizer_factory
 
         if args.postcodes:
             LOG.warning("Update postcodes centroid")
@@ -66,6 +67,8 @@ class UpdateRefresh:
         with connect(args.config.get_libpq_dsn()) as conn:
             refresh.create_functions(conn, args.config,
                                      args.diffs, args.enable_debug_statements)
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+            tokenizer.update_sql_functions(args.config)
 
         if args.wiki_data:
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH
@@ -100,15 +100,19 @@ class SetupAll:
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Initialise tables')
             with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
+                database_import.truncate_data_tables(conn)
 
             LOG.warning('Load data into placex table')
             database_import.load_data(args.config.get_libpq_dsn(),
-                                      args.data_dir,
                                       args.threads or psutil.cpu_count() or 1)
 
         LOG.warning("Setting up tokenizer")
-        tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        if args.continue_at is None or args.continue_at == 'load-data':
+            # (re)initialise the tokenizer data
+            tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        else:
+            # just load the tokenizer
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
 
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Calculate postcodes')
@@ -89,8 +89,6 @@ class SQLPreprocessor:
         self.env.globals['db'] = db_info
         self.env.globals['sql'] = _setup_postgres_sql(conn)
         self.env.globals['postgres'] = _setup_postgresql_features(conn)
-        self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
-                                         str((config.project_dir / 'module').resolve())
 
 
     def run_sql_file(self, conn, name, **kwargs):
@@ -8,9 +8,12 @@ import psycopg2
 
 from nominatim.db.connection import connect
 from nominatim.db import properties
+from nominatim.db import utils as db_utils
+from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
 
 DBCFG_NORMALIZATION = "tokenizer_normalization"
+DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 
 LOG = logging.getLogger()
 
@@ -53,6 +56,9 @@ def _install_module(config_module_path, src_dir, module_dir):
 
 
 def _check_module(module_dir, conn):
+    """ Try to use the PostgreSQL module to confirm that it is correctly
+        installed and accessible from PostgreSQL.
+    """
     with conn.cursor() as cur:
         try:
             cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
@@ -91,7 +97,11 @@ class LegacyTokenizer:
 
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
-            self._save_config(conn)
+            self._save_config(conn, config)
             conn.commit()
 
+        self.update_sql_functions(config)
+        self._init_db_tables(config)
+
 
     def init_from_project(self):
@@ -101,6 +111,19 @@ class LegacyTokenizer:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
 
+    def update_sql_functions(self, config):
+        """ Reimport the SQL functions for this tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
+            modulepath = config.DATABASE_MODULE_PATH or \
+                         str((config.project_dir / 'module').resolve())
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
+                              max_word_freq=max_word_freq,
+                              modulepath=modulepath)
+
+
     def migrate_database(self, config):
         """ Initialise the project directory of an existing database for
             use with this tokenizer.
@@ -114,11 +137,25 @@ class LegacyTokenizer:
 
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
-            self._save_config(conn)
+            self._save_config(conn, config)
 
 
-    def _save_config(self, conn):
+    def _init_db_tables(self, config):
+        """ Set up the word table and fill it with pre-computed word
+            frequencies.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            conn.commit()
+
+        LOG.warning("Precomputing word tokens")
+        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
+
+
+    def _save_config(self, conn, config):
         """ Save the configuration that needs to remain stable for the given
             database as database properties.
         """
         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
@@ -160,11 +160,10 @@ def create_partition_tables(conn, config):
     sql.run_sql_file(conn, 'partition-tables.src.sql')
 
 
-def truncate_data_tables(conn, max_word_frequency=None):
+def truncate_data_tables(conn):
    """ Truncate all data tables to prepare for a fresh load.
    """
    with conn.cursor() as cur:
-        cur.execute('TRUNCATE word')
        cur.execute('TRUNCATE placex')
        cur.execute('TRUNCATE place_addressline')
        cur.execute('TRUNCATE location_area')
@@ -183,23 +182,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
         for table in [r[0] for r in list(cur)]:
             cur.execute('TRUNCATE ' + table)
 
-        if max_word_frequency is not None:
-            # Used by getorcreate_word_id to ignore frequent partial words.
-            cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
-                           RETURNS integer AS $$
-                             SELECT {} as maxwordfreq;
-                           $$ LANGUAGE SQL IMMUTABLE
-                        """.format(max_word_frequency))
-        conn.commit()
+    conn.commit()
 
 _COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
 
-def load_data(dsn, data_dir, threads):
+def load_data(dsn, threads):
     """ Copy data into the word and placex table.
     """
-    # Pre-calculate the most important terms in the word list.
-    db_utils.execute_file(dsn, data_dir / 'words.sql')
-
     sel = selectors.DefaultSelector()
     # Then copy data from place to placex in <threads - 1> chunks.
     place_threads = max(1, threads - 1)
@@ -49,6 +49,8 @@ def migrate(config, paths):
         if has_run_migration:
             LOG.warning('Updating SQL functions.')
             refresh.create_functions(conn, config)
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+            tokenizer.update_sql_functions(config)
 
         properties.set_property(conn, 'database_version',
                                 '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
@@ -286,7 +286,6 @@ def osm2pgsql_options(temp_db):
 
 @pytest.fixture
 def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
     table_factory('country_name', 'partition INT', (0, 1, 2))
     cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
     cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
@@ -139,7 +139,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
-        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
+        mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
@@ -161,7 +161,7 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
-        mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'),
+        mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
 
@@ -242,7 +242,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
                           ('postcodes', 'update_postcodes'),
                           ('word-counts', 'recompute_word_counts'),
                           ('address-levels', 'load_address_levels_from_file'),
-                          ('functions', 'create_functions'),
                           ('wiki-data', 'import_wikipedia_articles'),
                           ('importance', 'recompute_importance'),
                           ('website', 'setup_website'),
@@ -254,6 +253,22 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
     assert func_mock.called == 1
 
 
+def test_refresh_create_functions(mock_func_factory, monkeypatch, temp_db):
+    class DummyTokenizer:
+        def update_sql_functions(self, *args):
+            self.called = True
+
+    func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
+    tok = DummyTokenizer()
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
+                        lambda *args: tok)
+
+
+    assert 0 == call_nominatim('refresh', '--functions')
+    assert func_mock.called == 1
+    assert hasattr(tok, 'called')
+
+
 def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
     calls = []
     monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
@@ -24,7 +24,6 @@ def sql_factory(tmp_path):
                          ("'{{db.partitions|join}}'", '012'),
                          ("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
                          ("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
-                         ("'{{config.DATABASE_MODULE_PATH}}'", '.')
                          ])
 def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
     sqlfile = sql_factory("RETURN {};".format(expr))
@@ -1,6 +1,8 @@
 """
 Test for legacy tokenizer.
 """
+import shutil
+
 import pytest
 
 from nominatim.tokenizer import legacy_tokenizer
@@ -18,6 +20,18 @@ def test_config(def_config, tmp_path):
 
     def_config.lib_dir.module = module_dir
 
+    sqldir = tmp_path / 'sql'
+    sqldir.mkdir()
+    (sqldir / 'tokenizer').mkdir()
+    (sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'words.sql').write_text("SELECT 'a'")
+    shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
+                str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
+
+    def_config.lib_dir.sql = sqldir
+    def_config.lib_dir.data = sqldir
+
+
     return def_config
 
 
@@ -30,13 +44,15 @@ def tokenizer_factory(dsn, tmp_path, monkeypatch):
     return _maker
 
 @pytest.fixture
-def tokenizer_setup(tokenizer_factory, test_config, property_table, monkeypatch):
+def tokenizer_setup(tokenizer_factory, test_config, property_table,
+                    monkeypatch, sql_preprocessor):
     monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
 
-def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
+def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch,
+                  temp_db_conn, sql_preprocessor):
     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
     monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
 
@@ -52,7 +68,8 @@ def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, t
     assert outfile.stat().st_mode == 33261
 
 
-def test_init_module_load_failed(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn):
+def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
+                                 monkeypatch, temp_db_conn):
     tok = tokenizer_factory()
 
     with pytest.raises(UsageError):
@@ -60,7 +77,7 @@ def test_init_module_load_failed(tokenizer_factory, test_config, property_table,
 
 
 def test_init_module_custom(tokenizer_factory, test_config, property_table,
-                            monkeypatch, tmp_path):
+                            monkeypatch, tmp_path, sql_preprocessor):
     module_dir = (tmp_path / 'custom').resolve()
     module_dir.mkdir()
     (module_dir / 'nominatim.so').write_text('CUSTOM nomiantim.so')
@@ -138,14 +138,14 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
 
 
 def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
-    tables = ('word', 'placex', 'place_addressline', 'location_area',
+    tables = ('placex', 'place_addressline', 'location_area',
               'location_area_country',
              'location_property_tiger', 'location_property_osmline',
              'location_postcode', 'search_name', 'location_road_23')
    for table in tables:
        table_factory(table, content=(1, 2, 3))
 
-    database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)
+    database_import.truncate_data_tables(temp_db_conn)
 
    for table in tables:
        assert temp_db_cursor.table_rows(table) == 0
@@ -163,7 +163,7 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
     place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
               geom='SRID=4326;LINESTRING(0 0, 10 10)')
 
-    database_import.load_data(dsn, src_dir / 'data', threads)
+    database_import.load_data(dsn, threads)
 
     assert temp_db_cursor.table_rows('placex') == 30
     assert temp_db_cursor.table_rows('location_property_osmline') == 1