2022-01-03 18:23:58 +03:00
|
|
|
# SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
#
|
|
|
|
# This file is part of Nominatim. (https://nominatim.org)
|
|
|
|
#
|
|
|
|
# Copyright (C) 2022 by the Nominatim developer community.
|
|
|
|
# For a full list of authors see the git log.
|
2021-08-10 15:51:35 +03:00
|
|
|
"""
|
2022-07-20 17:05:25 +03:00
|
|
|
Abstract class definitions for tokenizers. These base classes are here
|
2021-08-10 15:51:35 +03:00
|
|
|
mainly for documentation purposes.
|
|
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
2022-07-17 11:46:59 +03:00
|
|
|
from typing import List, Tuple, Dict, Any, Optional, Iterable
|
2022-07-13 23:55:40 +03:00
|
|
|
from pathlib import Path
|
|
|
|
|
2021-08-10 15:51:35 +03:00
|
|
|
from nominatim.config import Configuration
|
2023-07-16 21:12:53 +03:00
|
|
|
from nominatim.db.connection import Connection
|
2022-07-06 11:54:47 +03:00
|
|
|
from nominatim.data.place_info import PlaceInfo
|
2022-07-18 00:18:55 +03:00
|
|
|
from nominatim.typing import Protocol
|
2021-08-10 15:51:35 +03:00
|
|
|
|
|
|
|
class AbstractAnalyzer(ABC):
    """ Interface of the analyzer, which offers the functions for analysing
        names and for building up the token database.

        One analyzer is instantiated per thread. Any access to global data
        structures must therefore be synchronised accordingly.
    """

    def __enter__(self) -> 'AbstractAnalyzer':
        # The analyzer itself is the object bound by the `with` statement.
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        # Release the resources whenever the `with` block is left.
        self.close()

    @abstractmethod
    def close(self) -> None:
        """ Release every resource that the analyzer holds.
        """

    @abstractmethod
    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
        """ Look up the token information for a list of words.

            This function exists for testing and debugging only.
            It is not required to be particularly efficient.

            Arguments:
                words: The words for which tokens should be looked up.
                    A word with a leading # is taken to be a full name,
                    any other word is taken to be a partial term.

            Returns:
                A list with all the tuples that could be found for the
                    given words. Every entry of the list is a tuple of
                    (original word, word token, word id).
        """

    @abstractmethod
    def normalize_postcode(self, postcode: str) -> str:
        """ Bring the postcode into its standardized form.

            The result must be exactly the same as the one produced by
            the SQL function `token_normalized_postcode()`.

            Arguments:
                postcode: The postcode that should be normalized.

            Returns:
                The normalized form of the given postcode.
        """

    @abstractmethod
    def update_postcodes_from_db(self) -> None:
        """ Bring the postcode tokens of the tokenizer in line with the
            current content of the `location_postcode` table.
        """

    @abstractmethod
    def update_special_phrases(
            self,
            phrases: Iterable[Tuple[str, str, str, str]],
            should_replace: bool) -> None:
        """ Bring the special phrase tokens of the tokenizer up to date
            using the given list of special phrases.

            Arguments:
                phrases: The new special phrases. Every entry is
                    a tuple of (phrase, class, type, operator).
                should_replace: If true, the given phrases replace the
                    current list of phrases. When false, the
                    given phrases are only added to the ones
                    that already exist.
        """

    @abstractmethod
    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
        """ Extend the tokenizer's list of country tokens by the given names.

            Arguments:
                country_code: two-letter country code of the country that
                    the names belong to.
                names: Dictionary mapping name type to name.
        """

    @abstractmethod
    def process_place(self, place: PlaceInfo) -> Any:
        """ Compute the tokens for the given place together with the
            information that is passed on to the PL/pgSQL processor
            which builds the search index.

            Arguments:
                place: Information about the place as retrieved from
                    the database.

            Returns:
                A JSON-serialisable structure which is handed into
                    the database through the `token_info` field.
        """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AbstractTokenizer(ABC):
    """ The tokenizer instance is the central instance of the tokenizer in
        the system. There will only be a single instance of the tokenizer
        active at any time.
    """

    @abstractmethod
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            The function should copy all necessary data into the project
            directory or save it in the property table to make sure that
            the tokenizer remains stable over updates.

            Arguments:
                config: Read-only object with configuration options.

                init_db: When set to False, then initialisation of database
                    tables should be skipped. This option is only required
                    for migration purposes and can be safely ignored by
                    custom tokenizers.
        """

    @abstractmethod
    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from an existing database setup.

            The function should load all previously saved configuration from
            the project directory and/or the property table.

            Arguments:
                config: Read-only object with configuration options.
        """

    @abstractmethod
    def finalize_import(self, config: Configuration) -> None:
        """ This function is called at the very end of an import when all
            data has been imported and indexed. At this point the tokenizer
            may create any additional indexes and data structures needed
            during query time.

            Arguments:
                config: Read-only object with configuration options.
        """

    @abstractmethod
    def update_sql_functions(self, config: Configuration) -> None:
        """ Update the SQL part of the tokenizer. This function is called
            automatically on migrations or may be called explicitly by the
            user through the `nominatim refresh --functions` command.

            The tokenizer must only update the code of the tokenizer. The
            data structures or data itself must not be changed by this
            function.

            Arguments:
                config: Read-only object with configuration options.
        """

    @abstractmethod
    def check_database(self, config: Configuration) -> Optional[str]:
        """ Check that the database is set up correctly and ready for being
            queried.

            Arguments:
                config: Read-only object with configuration options.

            Returns:
                If an issue was found, return an error message with the
                    description of the issue as well as hints for the user
                    on how to resolve the issue. If everything is okay,
                    return `None`.
        """

    @abstractmethod
    def update_statistics(self) -> None:
        """ Recompute any tokenizer statistics necessary for efficient lookup.
            This function is meant to be called from time to time by the user
            to improve performance. However, the tokenizer must not depend on
            it to be called in order to work.
        """

    @abstractmethod
    def update_word_tokens(self) -> None:
        """ Do house-keeping on the tokenizer's internal data structures.
            Remove unused word tokens, resort data etc.
        """

    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                token_info = analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.
        """

    @abstractmethod
    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the most frequent full words in the database.

            Arguments:
                conn: Open connection to the database which may be used to
                    retrieve the words.
                num: Maximum number of words to return.

            Returns:
                A list of words containing at most `num` entries.
        """
|
|
|
|
|
|
|
|
|
2022-07-13 23:55:40 +03:00
|
|
|
class TokenizerModule(Protocol):
    """ Protocol describing what a module has to export when it
        implements a tokenizer of its own.
    """

    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
        """ Factory function that produces a new tokenizer.
        """
|