# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path

from nominatim.config import Configuration
from nominatim.db.connection import Connection
from nominatim.data.place_info import PlaceInfo
from nominatim.typing import Protocol


class AbstractAnalyzer(ABC):
    """ The analyzer provides the functions for analysing names and building
        the token database.

        Analyzers are instantiated on a per-thread basis. Access to global data
        structures must be synchronised accordingly.
    """

    def __enter__(self) -> 'AbstractAnalyzer':
        return self


    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.close()


    @abstractmethod
    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """


    @abstractmethod
    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            The function is used for testing and debugging only
            and does not need to be particularly efficient.

            Arguments:
                words: A list of words to look up the tokens for.
                       If a word starts with '#', it is assumed to be a full
                       name, otherwise a partial term.

            Returns:
                The function returns the list of all tuples that could be
                found for the given words. Each list entry is a tuple of
                (original word, word token, word id).
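
            An illustrative call (a sketch only; the exact tokens and ids
            depend on the concrete analyzer implementation):

            ```
            analyzer.get_word_token_info(['#Main Street', 'main'])
            # e.g. [('#Main Street', 'main street', 1234),
            #       ('main', 'main', 567)]
            ```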
        """


    @abstractmethod
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to its standardized form.

            This function must yield exactly the same result as the SQL function
            `token_normalized_postcode()`.

            Arguments:
                postcode: The postcode to be normalized.

            Returns:
                The given postcode after normalization.
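
            Example (the exact output depends on the implementation's
            normalization rules):

            ```
            analyzer.normalize_postcode(' ab 1234 ')  # e.g. 'AB 1234'
            ```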
        """


    @abstractmethod
    def update_postcodes_from_db(self) -> None:
        """ Update the tokenizer's postcode tokens from the current content
            of the `location_postcode` table.
        """


    @abstractmethod
    def update_special_phrases(self,
                               phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Update the tokenizer's special phrase tokens from the given
            list of special phrases.

            Arguments:
                phrases: The new list of special phrases. Each entry is
                         a tuple of (phrase, class, type, operator).
                should_replace: If true, replace the current list of phrases.
                                When false, just add the given phrases to the
                                ones that already exist.
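
            Illustrative sketch (the concrete values are made up for
            demonstration):

            ```
            phrases = [('Hotel', 'tourism', 'hotel', '-'),
                       ('Restaurants', 'amenity', 'restaurant', 'in')]
            analyzer.update_special_phrases(phrases, should_replace=True)
            ```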
        """


    @abstractmethod
    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
        """ Add the given names to the tokenizer's list of country tokens.

            Arguments:
                country_code: Two-letter country code for the country the names
                              refer to.
                names: Dictionary of name type to name.
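
            Example (hypothetical values; the keys follow the OSM name-tag
            convention):

            ```
            analyzer.add_country_names('de', {'name': 'Deutschland',
                                              'name:en': 'Germany'})
            ```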
        """


    @abstractmethod
    def process_place(self, place: PlaceInfo) -> Any:
        """ Extract tokens for the given place and compute the
            information to be handed to the PL/pgSQL processor for building
            the search index.

            Arguments:
                place: Place information retrieved from the database.

            Returns:
                A JSON-serialisable structure that will be handed into
                the database via the `token_info` field.
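
            The exact structure is tokenizer-specific; a sketch of the
            call pattern:

            ```
            token_info = analyzer.process_place(place)
            # token_info is stored with the place and later interpreted
            # by the tokenizer's own SQL functions.
            ```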
        """


class AbstractTokenizer(ABC):
    """ The tokenizer instance is the central instance of the tokenizer in
        the system. There will only be a single instance of the tokenizer
        active at any time.
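
        A typical import-time lifecycle might look as follows (a sketch
        only; `tokenizer` stands for a concrete implementation):

        ```
        tokenizer.init_new_db(config)       # set up a fresh database
        # ... data import and indexing ...
        tokenizer.finalize_import(config)   # build query-time structures
        ```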
    """

    @abstractmethod
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            The function should copy all necessary data into the project
            directory or save it in the property table to make sure that
            the tokenizer remains stable over updates.

            Arguments:
                config: Read-only object with configuration options.
                init_db: When set to False, initialisation of database
                         tables should be skipped. This option is only
                         required for migration purposes and can be safely
                         ignored by custom tokenizers.
        """


    @abstractmethod
    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from an existing database setup.

            The function should load all previously saved configuration from
            the project directory and/or the property table.

            Arguments:
                config: Read-only object with configuration options.
        """


    @abstractmethod
    def finalize_import(self, config: Configuration) -> None:
        """ This function is called at the very end of an import when all
            data has been imported and indexed. The tokenizer may at this
            point create any additional indexes and data structures needed
            during query time.

            Arguments:
                config: Read-only object with configuration options.
        """


    @abstractmethod
    def update_sql_functions(self, config: Configuration) -> None:
        """ Update the SQL part of the tokenizer. This function is called
            automatically on migrations or may be called explicitly by the
            user through the `nominatim refresh --functions` command.

            The tokenizer must only update the code of the tokenizer. The
            data structures or data itself must not be changed by this function.

            Arguments:
                config: Read-only object with configuration options.
        """


    @abstractmethod
    def check_database(self, config: Configuration) -> Optional[str]:
        """ Check that the database is set up correctly and ready to be
            queried.

            Arguments:
                config: Read-only object with configuration options.

            Returns:
                If an issue was found, return an error message describing
                the issue as well as hints for the user on how to resolve
                it. If everything is okay, return `None`.
        """


    @abstractmethod
    def update_statistics(self, config: Configuration) -> None:
        """ Recompute any tokenizer statistics necessary for efficient lookup.

            This function is meant to be called from time to time by the user
            to improve performance. However, the tokenizer must not depend on
            it being called in order to work.
        """


    @abstractmethod
    def update_word_tokens(self) -> None:
        """ Do house-keeping on the tokenizer's internal data structures.
            Remove unused word tokens, resort data, etc.
        """


    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must make sure
            to call the close() function before the analyzer is destroyed.
        """


    @abstractmethod
    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the most frequent full words in the database.

            Arguments:
                conn: Open connection to the database which may be used to
                      retrieve the words.
                num: Maximum number of words to return.
        """


class TokenizerModule(Protocol):
    """ Interface that must be exported by modules that implement their
        own tokenizer.
    """

    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
        """ Factory for new tokenizers.
        """