Merge pull request #2641 from lonvia/reinit-tokenizer-dir

Transparently reinitialize tokenizer directory when necessary
Sarah Hoffmann 2022-03-20 21:46:07 +01:00 committed by GitHub
commit d33c82cb66
9 changed files with 62 additions and 37 deletions


@@ -198,11 +198,10 @@ target machine.
 of a full database.

 Next install Nominatim on the target machine by following the standard installation
-instructions. Again make sure to use the same version as the source machine.
+instructions. Again, make sure to use the same version as the source machine.

-You can now copy the project directory from the source machine to the new machine.
-If necessary, edit the `.env` file to point it to the restored database.
-Finally run
+Create a project directory on your destination machine and set up the `.env`
+file to match the configuration on the source machine. Finally run

     nominatim refresh --website
@@ -210,6 +209,8 @@ to make sure that the local installation of Nominatim will be used.
 If you are using the legacy tokenizer you might also have to switch to the
 PostgreSQL module that was compiled on your target machine. If you get errors
-that PostgreSQL cannot find or access `nominatim.so` then copy the installed
-version into the `module` directory of your project directory. The installed
-copy can usually be found under `/usr/local/lib/nominatim/module/nominatim.so`.
+that PostgreSQL cannot find or access `nominatim.so` then rerun
+
+    nominatim refresh --functions
+
+on the target machine to update the location of the module.


@@ -117,6 +117,10 @@ class UpdateRefresh:
         if args.website:
             webdir = args.project_dir / 'website'
             LOG.warning('Setting up website directory at %s', webdir)
+            # This is a little bit hacky: call the tokenizer setup, so that
+            # the tokenizer directory gets repopulated as well, in case it
+            # wasn't there yet.
+            self._get_tokenizer(args.config)
             with connect(args.config.get_libpq_dsn()) as conn:
                 refresh.setup_website(webdir, args.config, conn)
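The call to `_get_tokenizer(args.config)` is made purely for its side effect: looking up the tokenizer recreates a missing tokenizer directory before the website setup needs it. A minimal, self-contained sketch of that call-for-the-side-effect pattern, with `ensure_backend` as a hypothetical stand-in rather than Nominatim API:

    from pathlib import Path

    def ensure_backend(project_dir: Path) -> Path:
        # Hypothetical stand-in for _get_tokenizer(): callers may ignore the
        # return value; the point is that missing on-disk state is
        # transparently recreated before the next setup step runs.
        datadir = project_dir / 'tokenizer'
        datadir.mkdir(exist_ok=True)
        return datadir

    if __name__ == '__main__':
        project = Path('/tmp/demo-project')
        project.mkdir(exist_ok=True)
        ensure_backend(project)                  # called for the side effect
        assert (project / 'tokenizer').is_dir()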


@@ -18,7 +18,7 @@ from dotenv import dotenv_values
 from nominatim.errors import UsageError

 LOG = logging.getLogger()
+CONFIG_CACHE = {}

 def flatten_config_list(content, section=''):
     """ Flatten YAML configuration lists that contain include sections
@@ -181,14 +181,19 @@ class Configuration:
         """
         configfile = self.find_config_file(filename, config)

+        if str(configfile) in CONFIG_CACHE:
+            return CONFIG_CACHE[str(configfile)]
+
         if configfile.suffix in ('.yaml', '.yml'):
-            return self._load_from_yaml(configfile)
-        if configfile.suffix == '.json':
+            result = self._load_from_yaml(configfile)
+        elif configfile.suffix == '.json':
             with configfile.open('r') as cfg:
-                return json.load(cfg)
+                result = json.load(cfg)
+        else:
-        raise UsageError(f"Config file '{configfile}' has unknown format.")
+            raise UsageError(f"Config file '{configfile}' has unknown format.")
+
+        CONFIG_CACHE[str(configfile)] = result
+        return result

     def find_config_file(self, filename, config=None):
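`CONFIG_CACHE` turns repeated loads of the same configuration file into a dictionary lookup; only successfully parsed files are cached, so the error path is unchanged. A generic sketch of the same path-keyed memoization, with illustrative names rather than the Nominatim API:

    import json
    from pathlib import Path

    _CACHE = {}  # module-level memo, keyed by the stringified path

    def load_config(path: Path):
        key = str(path)
        if key in _CACHE:
            return _CACHE[key]        # repeat loads skip the parse entirely

        if path.suffix == '.json':
            result = json.loads(path.read_text())
        else:
            raise ValueError(f"Config file '{path}' has unknown format.")

        _CACHE[key] = result          # cache only successfully parsed files
        return result

One caveat of the pattern: all callers share the same mutable object, so a caller that modifies a returned configuration silently changes what later loads receive.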


@@ -27,6 +27,9 @@ def get_property(conn, name):
     """ Return the current value of the given property or None if the property
         is not set.
     """
+    if not conn.table_exists('nominatim_properties'):
+        return None
+
     with conn.cursor() as cur:
         cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                     (name, ))
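The guard lets `get_property()` run against a database whose schema has not been created yet: a missing `nominatim_properties` table now means "property unset" instead of a database error. `table_exists()` is Nominatim's own connection helper; a sketch of the same defensive check in plain psycopg2, using `to_regclass()` as one assumed way to probe for the table:

    def get_property_safe(conn, name):
        """ Return the property value, or None when the table is absent
            (for example on a freshly created, still empty database).
        """
        with conn.cursor() as cur:
            # to_regclass() yields NULL for unknown relations instead of
            # raising UndefinedTable, so the probe itself cannot fail.
            cur.execute("SELECT to_regclass('nominatim_properties')")
            if cur.fetchone()[0] is None:
                return None

            cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                        (name, ))
            row = cur.fetchone()
            return row[0] if row is not None else None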


@@ -78,8 +78,8 @@ def get_tokenizer_for_db(config):
     """
     basedir = config.project_dir / 'tokenizer'
     if not basedir.is_dir():
-        LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
-        raise UsageError('Cannot initialize tokenizer.')
+        # Directory will be repopulated by tokenizer below.
+        basedir.mkdir()

     with connect(config.get_libpq_dsn()) as conn:
         name = properties.get_property(conn, 'tokenizer')
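This is the heart of the change: a missing tokenizer directory is no longer fatal. The factory recreates it and relies on the tokenizer's `init_from_project()` (see the tokenizer diffs below) to repopulate the contents. A condensed, runnable sketch of that flow with a faked tokenizer; the real factory also resolves the tokenizer class from the `tokenizer` property read here:

    from pathlib import Path

    class FakeTokenizer:
        # Stand-in for the real tokenizer classes; it models only the
        # repopulation behaviour that this diff relies on.
        def __init__(self, basedir: Path):
            self.basedir = basedir

        def init_from_project(self):
            (self.basedir / 'tokenizer.php').touch()   # rebuild lost files

    def get_tokenizer_for_db(project_dir: Path) -> FakeTokenizer:
        basedir = project_dir / 'tokenizer'
        if not basedir.is_dir():
            basedir.mkdir()      # before this commit: LOG.fatal + UsageError
        tokenizer = FakeTokenizer(basedir)
        tokenizer.init_from_project()
        return tokenizer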


@@ -51,7 +51,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """
         self.loader = ICURuleLoader(config)

-        self._install_php(config.lib_dir.php)
+        self._install_php(config.lib_dir.php, overwrite=True)
         self._save_config()

         if init_db:
@@ -67,6 +67,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             self.loader.load_config_from_db(conn)

+        self._install_php(config.lib_dir.php, overwrite=False)
+
     def finalize_import(self, config):
         """ Do any required postprocessing to make the tokenizer data ready
@@ -174,16 +176,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                      self.loader.make_token_analysis())

-    def _install_php(self, phpdir):
+    def _install_php(self, phpdir, overwrite=True):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent(f"""\
-            <?php
-            @define('CONST_Max_Word_Frequency', 10000000);
-            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent(f"""\
+                <?php
+                @define('CONST_Max_Word_Frequency', 10000000);
+                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

     def _save_config(self):
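The new `overwrite` flag gives `_install_php()` two modes: a fresh setup always rewrites `tokenizer.php`, while loading an existing project only writes the file when it is missing. That keeps repeated initialisation idempotent without clobbering an existing script. A generic sketch of the write-if-missing idiom:

    from pathlib import Path

    def install_file(target: Path, content: str, overwrite: bool = True) -> bool:
        # With overwrite=False an existing file is left untouched, so
        # repeated setup calls are idempotent. Returns True when written.
        if target.exists() and not overwrite:
            return False
        target.write_text(content)
        return True

The two call sites above map directly onto this: `overwrite=True` on the import path, `overwrite=False` when loading from the project directory.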


@@ -107,7 +107,7 @@ class LegacyTokenizer(AbstractTokenizer):
         self.normalization = config.TERM_NORMALIZATION

-        self._install_php(config)
+        self._install_php(config, overwrite=True)

         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
@@ -119,12 +119,18 @@ class LegacyTokenizer(AbstractTokenizer):
             self._init_db_tables(config)

-    def init_from_project(self, _):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

+        if not (config.project_dir / 'module' / 'nominatim.so').exists():
+            _install_module(config.DATABASE_MODULE_PATH,
+                            config.lib_dir.module,
+                            config.project_dir / 'module')
+
+        self._install_php(config, overwrite=False)
+
     def finalize_import(self, config):
         """ Do any required postprocessing to make the tokenizer data ready
@@ -238,16 +244,18 @@ class LegacyTokenizer(AbstractTokenizer):
         return LegacyNameAnalyzer(self.dsn, normalizer)

-    def _install_php(self, config):
+    def _install_php(self, config, overwrite=True):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
-            <?php
-            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
-            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-            """.format(config)))
+
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent("""\
+                <?php
+                @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+                @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+                require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+                """.format(config)))

     def _init_db_tables(self, config):
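`init_from_project()` now repairs both artifacts that a copied project directory may be missing: the compiled PostgreSQL module and the PHP stub. Together with the documentation change above, this is presumably why rerunning `nominatim refresh --functions` on the target machine suffices. A sketch of the copy-if-absent step, with `shutil.copy` standing in for `_install_module()`, whose implementation this diff does not show:

    import shutil
    from pathlib import Path

    def ensure_module(project_dir: Path, installed_module: Path) -> Path:
        # Copy nominatim.so into the project directory only when the
        # project's own copy is missing (e.g. after moving machines).
        target = project_dir / 'module' / 'nominatim.so'
        if not target.exists():
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(installed_module, target)
        return target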


@@ -217,7 +217,7 @@ class NominatimEnvironment:
                 self.db_drop_database(self.api_test_db)
                 raise

-        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
+        tokenizer_factory.get_tokenizer_for_db(self.get_test_config())

     def setup_unknown_db(self):


@@ -63,13 +63,13 @@ class TestFactory:
         assert tokenizer.init_state == "loaded"

-    def test_load_no_tokenizer_dir(self):
+    def test_load_repopulate_tokenizer_dir(self):
         factory.create_tokenizer(self.config)

-        self.config.project_dir = self.config.project_dir / 'foo'
+        self.config.project_dir = self.config.project_dir

-        with pytest.raises(UsageError):
-            factory.get_tokenizer_for_db(self.config)
+        factory.get_tokenizer_for_db(self.config)
+
+        assert (self.config.project_dir / 'tokenizer').exists()

     def test_load_missing_property(self, temp_db_cursor):