2022-01-03 18:23:58 +03:00
|
|
|
# SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
#
|
|
|
|
# This file is part of Nominatim. (https://nominatim.org)
|
|
|
|
#
|
|
|
|
# Copyright (C) 2022 by the Nominatim developer community.
|
|
|
|
# For a full list of authors see the git log.
|
2021-04-21 10:57:17 +03:00
|
|
|
"""
|
|
|
|
Functions for creating a tokenizer or initialising the right one for an
|
|
|
|
existing database.
|
|
|
|
|
|
|
|
A tokenizer is something that is bound to the lifetime of a database. It
|
|
|
|
can be chosen and configured before the initial import but then needs to
|
|
|
|
be used consistently when querying and updating the database.
|
|
|
|
|
|
|
|
This module provides the functions to create and configure a new tokenizer
|
|
|
|
as well as instantiating the appropriate tokenizer for updating an existing
|
|
|
|
database.
|
|
|
|
|
|
|
|
A tokenizer usually also includes PHP code for querying. The appropriate PHP
|
|
|
|
normalizer module is installed, when the tokenizer is created.
|
|
|
|
"""
|
2022-07-13 23:55:40 +03:00
|
|
|
from typing import Optional
|
2021-04-21 10:57:17 +03:00
|
|
|
import logging
|
|
|
|
import importlib
|
2021-05-18 17:28:21 +03:00
|
|
|
from pathlib import Path
|
2021-04-21 10:57:17 +03:00
|
|
|
|
2022-07-13 23:55:40 +03:00
|
|
|
from nominatim.errors import UsageError
|
|
|
|
from nominatim.db import properties
|
|
|
|
from nominatim.db.connection import connect
|
|
|
|
from nominatim.config import Configuration
|
|
|
|
from nominatim.tokenizer.base import AbstractTokenizer, TokenizerModule
|
2021-04-21 10:57:17 +03:00
|
|
|
|
|
|
|
# Module-wide logger. Calling getLogger() without a name returns the root logger.
LOG = logging.getLogger()
|
|
|
|
|
2022-07-13 23:55:40 +03:00
|
|
|
def _import_tokenizer(name: str) -> TokenizerModule:
    """ Import the tokenizer implementation with the given name.

        The implementation is expected in a file '<name>_tokenizer.py'
        located in the same directory as this source file. It is imported
        as the module 'nominatim.tokenizer.<name>_tokenizer'.

        Raises a UsageError when there is no such file.
    """
    module_stem = name + '_tokenizer'
    candidate = Path(__file__).parent.joinpath(module_stem + '.py')

    if candidate.is_file():
        return importlib.import_module('nominatim.tokenizer.' + module_stem)

    # Nothing to import: tell the user which setting to look at.
    LOG.fatal("No tokenizer named '%s' available. "
              "Check the setting of NOMINATIM_TOKENIZER.", name)
    raise UsageError('Tokenizer not found')
|
2021-04-21 10:57:17 +03:00
|
|
|
|
|
|
|
|
2022-07-13 23:55:40 +03:00
|
|
|
def create_tokenizer(config: Configuration, init_db: bool = True,
                     module_name: Optional[str] = None) -> AbstractTokenizer:
    """ Set up a fresh tokenizer and register it with the database.

        The tokenizer implementation is selected by `module_name`. When no
        name is given, the TOKENIZER setting of the configuration is used.
        A 'tokenizer' directory is created inside the project directory if
        it does not exist yet and is handed to the tokenizer. Once the
        tokenizer has run `init_new_db()`, the name of its module is saved
        in the database property 'tokenizer'.

        Raises a UsageError when the path of the tokenizer directory exists
        but is not a directory or when the tokenizer module cannot be found.
    """
    selected = config.TOKENIZER if module_name is None else module_name

    # Make sure there is a directory for the tokenizer data.
    tokenizer_dir = config.project_dir / 'tokenizer'
    if tokenizer_dir.exists():
        if not tokenizer_dir.is_dir():
            LOG.fatal("Tokenizer directory '%s' cannot be created.", tokenizer_dir)
            raise UsageError("Tokenizer setup failed.")
    else:
        tokenizer_dir.mkdir()

    # Load the implementation, then build and initialise the tokenizer.
    module = _import_tokenizer(selected)
    new_tokenizer = module.create(config.get_libpq_dsn(), tokenizer_dir)
    new_tokenizer.init_new_db(config, init_db=init_db)

    # Remember the choice so that it can be looked up in the database later.
    with connect(config.get_libpq_dsn()) as conn:
        properties.set_property(conn, 'tokenizer', selected)

    return new_tokenizer
|
|
|
|
|
|
|
|
|
2022-07-13 23:55:40 +03:00
|
|
|
def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
    """ Instantiate a tokenizer for an existing database.

        The name of the tokenizer is read from the database property
        'tokenizer'. The matching tokenizer module is imported, the
        tokenizer is created on top of the 'tokenizer' directory of the
        project directory and then initialised with `init_from_project()`.

        Raises a UsageError when the path of the tokenizer directory exists
        but is not a directory, when the database property is missing or
        when no tokenizer module with the stored name can be found.
    """
    basedir = config.project_dir / 'tokenizer'
    if not basedir.exists():
        # Directory will be repopulated by tokenizer below.
        basedir.mkdir()
    elif not basedir.is_dir():
        # Something other than a directory occupies the path. Report it the
        # same way create_tokenizer() does instead of letting mkdir() fail
        # with a bare FileExistsError.
        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
        raise UsageError("Tokenizer setup failed.")

    with connect(config.get_libpq_dsn()) as conn:
        name = properties.get_property(conn, 'tokenizer')

    if name is None:
        LOG.fatal("Tokenizer was not set up properly. Database property missing.")
        raise UsageError('Cannot initialize tokenizer.')

    tokenizer_module = _import_tokenizer(name)

    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
    tokenizer.init_from_project(config)

    return tokenizer
|