Merge pull request #2641 from lonvia/reinit-tokenizer-dir

Transparently reinitialize tokenizer directory when necessary
Sarah Hoffmann 2022-03-20 21:46:07 +01:00 committed by GitHub
commit d33c82cb66
9 changed files with 62 additions and 37 deletions


@@ -198,11 +198,10 @@ target machine.
 of a full database.

 Next install Nominatim on the target machine by following the standard installation
-instructions. Again make sure to use the same version as the source machine.
+instructions. Again, make sure to use the same version as the source machine.

-You can now copy the project directory from the source machine to the new machine.
-If necessary, edit the `.env` file to point it to the restored database.
-Finally run
+Create a project directory on your destination machine and set up the `.env`
+file to match the configuration on the source machine. Finally run

     nominatim refresh --website
@@ -210,6 +209,8 @@ to make sure that the local installation of Nominatim will be used.
 If you are using the legacy tokenizer you might also have to switch to the
 PostgreSQL module that was compiled on your target machine. If you get errors
-that PostgreSQL cannot find or access `nominatim.so` then copy the installed
-version into the `module` directory of your project directory. The installed
-copy can usually be found under `/usr/local/lib/nominatim/module/nominatim.so`.
+that PostgreSQL cannot find or access `nominatim.so` then rerun
+
+    nominatim refresh --functions
+
+on the target machine to update the location of the module.


@@ -117,6 +117,10 @@ class UpdateRefresh:
         if args.website:
             webdir = args.project_dir / 'website'
             LOG.warning('Setting up website directory at %s', webdir)
+            # This is a little bit hacky: call the tokenizer setup, so that
+            # the tokenizer directory gets repopulated as well, in case it
+            # wasn't there yet.
+            self._get_tokenizer(args.config)
             with connect(args.config.get_libpq_dsn()) as conn:
                 refresh.setup_website(webdir, args.config, conn)
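The call to `_get_tokenizer(args.config)` is made purely for its side effect: looking up the tokenizer recreates a missing tokenizer directory before the website setup needs it. A minimal, self-contained sketch of that call-for-the-side-effect pattern, with `ensure_backend` as a hypothetical stand-in rather than Nominatim API:

    from pathlib import Path

    def ensure_backend(project_dir: Path) -> Path:
        # Hypothetical stand-in for _get_tokenizer(): callers may ignore the
        # return value; the point is that missing on-disk state is
        # transparently recreated before the next setup step runs.
        datadir = project_dir / 'tokenizer'
        datadir.mkdir(exist_ok=True)
        return datadir

    if __name__ == '__main__':
        project = Path('/tmp/demo-project')
        project.mkdir(exist_ok=True)
        ensure_backend(project)                  # called for the side effect
        assert (project / 'tokenizer').is_dir()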


@@ -18,7 +18,7 @@ from dotenv import dotenv_values
 from nominatim.errors import UsageError

 LOG = logging.getLogger()
+CONFIG_CACHE = {}

 def flatten_config_list(content, section=''):
     """ Flatten YAML configuration lists that contain include sections
@@ -181,14 +181,19 @@ class Configuration:
         """
         configfile = self.find_config_file(filename, config)

+        if str(configfile) in CONFIG_CACHE:
+            return CONFIG_CACHE[str(configfile)]
+
         if configfile.suffix in ('.yaml', '.yml'):
-            return self._load_from_yaml(configfile)
-        if configfile.suffix == '.json':
+            result = self._load_from_yaml(configfile)
+        elif configfile.suffix == '.json':
             with configfile.open('r') as cfg:
-                return json.load(cfg)
+                result = json.load(cfg)
+        else:
-        raise UsageError(f"Config file '{configfile}' has unknown format.")
+            raise UsageError(f"Config file '{configfile}' has unknown format.")
+
+        CONFIG_CACHE[str(configfile)] = result
+        return result

     def find_config_file(self, filename, config=None):
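`CONFIG_CACHE` turns repeated loads of the same configuration file into a dictionary lookup; only successfully parsed files are cached, so the error path is unchanged. A generic sketch of the same path-keyed memoization, with illustrative names rather than the Nominatim API:

    import json
    from pathlib import Path

    _CACHE = {}  # module-level memo, keyed by the stringified path

    def load_config(path: Path):
        key = str(path)
        if key in _CACHE:
            return _CACHE[key]        # repeat loads skip the parse entirely

        if path.suffix == '.json':
            result = json.loads(path.read_text())
        else:
            raise ValueError(f"Config file '{path}' has unknown format.")

        _CACHE[key] = result          # cache only successfully parsed files
        return result

One caveat of the pattern: all callers share the same mutable object, so a caller that modifies a returned configuration silently changes what later loads receive.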


@@ -27,6 +27,9 @@ def get_property(conn, name):
     """ Return the current value of the given property or None if the property
         is not set.
     """
+    if not conn.table_exists('nominatim_properties'):
+        return None
+
     with conn.cursor() as cur:
         cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                     (name, ))
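The guard lets `get_property()` run against a database whose schema has not been created yet: a missing `nominatim_properties` table now means "property unset" instead of a database error. `table_exists()` is Nominatim's own connection helper; a sketch of the same defensive check in plain psycopg2, using `to_regclass()` as one assumed way to probe for the table:

    def get_property_safe(conn, name):
        """ Return the property value, or None when the table is absent
            (for example on a freshly created, still empty database).
        """
        with conn.cursor() as cur:
            # to_regclass() yields NULL for unknown relations instead of
            # raising UndefinedTable, so the probe itself cannot fail.
            cur.execute("SELECT to_regclass('nominatim_properties')")
            if cur.fetchone()[0] is None:
                return None

            cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                        (name, ))
            row = cur.fetchone()
            return row[0] if row is not None else None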


@@ -78,8 +78,8 @@ def get_tokenizer_for_db(config):
     """
     basedir = config.project_dir / 'tokenizer'
     if not basedir.is_dir():
-        LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
-        raise UsageError('Cannot initialize tokenizer.')
+        # Directory will be repopulated by tokenizer below.
+        basedir.mkdir()

     with connect(config.get_libpq_dsn()) as conn:
         name = properties.get_property(conn, 'tokenizer')
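This is the heart of the change: a missing tokenizer directory is no longer fatal. The factory recreates it and relies on the tokenizer's `init_from_project()` (see the tokenizer diffs below) to repopulate the contents. A condensed, runnable sketch of that flow with a faked tokenizer; the real factory also resolves the tokenizer class from the `tokenizer` property read here:

    from pathlib import Path

    class FakeTokenizer:
        # Stand-in for the real tokenizer classes; it models only the
        # repopulation behaviour that this diff relies on.
        def __init__(self, basedir: Path):
            self.basedir = basedir

        def init_from_project(self):
            (self.basedir / 'tokenizer.php').touch()   # rebuild lost files

    def get_tokenizer_for_db(project_dir: Path) -> FakeTokenizer:
        basedir = project_dir / 'tokenizer'
        if not basedir.is_dir():
            basedir.mkdir()      # before this commit: LOG.fatal + UsageError
        tokenizer = FakeTokenizer(basedir)
        tokenizer.init_from_project()
        return tokenizer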


@@ -51,7 +51,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """
         self.loader = ICURuleLoader(config)

-        self._install_php(config.lib_dir.php)
+        self._install_php(config.lib_dir.php, overwrite=True)
         self._save_config()

         if init_db:
@@ -67,6 +67,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             self.loader.load_config_from_db(conn)

+        self._install_php(config.lib_dir.php, overwrite=False)
+
     def finalize_import(self, config):
         """ Do any required postprocessing to make the tokenizer data ready
@@ -174,16 +176,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                      self.loader.make_token_analysis())

-    def _install_php(self, phpdir):
+    def _install_php(self, phpdir, overwrite=True):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent(f"""\
-            <?php
-            @define('CONST_Max_Word_Frequency', 10000000);
-            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent(f"""\
+                <?php
+                @define('CONST_Max_Word_Frequency', 10000000);
+                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

     def _save_config(self):
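The new `overwrite` flag gives `_install_php()` two modes: a fresh setup always rewrites `tokenizer.php`, while loading an existing project only writes the file when it is missing. That keeps repeated initialisation idempotent without clobbering an existing script. A generic sketch of the write-if-missing idiom:

    from pathlib import Path

    def install_file(target: Path, content: str, overwrite: bool = True) -> bool:
        # With overwrite=False an existing file is left untouched, so
        # repeated setup calls are idempotent. Returns True when written.
        if target.exists() and not overwrite:
            return False
        target.write_text(content)
        return True

The two call sites above map directly onto this: `overwrite=True` on the import path, `overwrite=False` when loading from the project directory.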


@@ -107,7 +107,7 @@ class LegacyTokenizer(AbstractTokenizer):
         self.normalization = config.TERM_NORMALIZATION

-        self._install_php(config)
+        self._install_php(config, overwrite=True)

         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
@@ -119,12 +119,18 @@ class LegacyTokenizer(AbstractTokenizer):
             self._init_db_tables(config)

-    def init_from_project(self, _):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

+        if not (config.project_dir / 'module' / 'nominatim.so').exists():
+            _install_module(config.DATABASE_MODULE_PATH,
+                            config.lib_dir.module,
+                            config.project_dir / 'module')
+
+        self._install_php(config, overwrite=False)
+
     def finalize_import(self, config):
         """ Do any required postprocessing to make the tokenizer data ready
@@ -238,16 +244,18 @@ class LegacyTokenizer(AbstractTokenizer):
         return LegacyNameAnalyzer(self.dsn, normalizer)

-    def _install_php(self, config):
+    def _install_php(self, config, overwrite=True):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
-            <?php
-            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
-            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-            """.format(config)))
+
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent("""\
+                <?php
+                @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+                @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+                require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+                """.format(config)))

     def _init_db_tables(self, config):
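`init_from_project()` now repairs both artifacts that a copied project directory may be missing: the compiled PostgreSQL module and the PHP stub. Together with the documentation change above, this is presumably why rerunning `nominatim refresh --functions` on the target machine suffices. A sketch of the copy-if-absent step, with `shutil.copy` standing in for `_install_module()`, whose implementation this diff does not show:

    import shutil
    from pathlib import Path

    def ensure_module(project_dir: Path, installed_module: Path) -> Path:
        # Copy nominatim.so into the project directory only when the
        # project's own copy is missing (e.g. after moving machines).
        target = project_dir / 'module' / 'nominatim.so'
        if not target.exists():
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(installed_module, target)
        return target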


@@ -217,7 +217,7 @@ class NominatimEnvironment:
                 self.db_drop_database(self.api_test_db)
                 raise

-        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
+        tokenizer_factory.get_tokenizer_for_db(self.get_test_config())

     def setup_unknown_db(self):


@@ -63,13 +63,13 @@ class TestFactory:
         assert tokenizer.init_state == "loaded"

-    def test_load_no_tokenizer_dir(self):
+    def test_load_repopulate_tokenizer_dir(self):
         factory.create_tokenizer(self.config)

-        self.config.project_dir = self.config.project_dir / 'foo'
+        self.config.project_dir = self.config.project_dir

-        with pytest.raises(UsageError):
-            factory.get_tokenizer_for_db(self.config)
+        factory.get_tokenizer_for_db(self.config)
+
+        assert (self.config.project_dir / 'tokenizer').exists()

     def test_load_missing_property(self, temp_db_cursor):