From 3746befd8859cbf1a1f86459b09db57c21d26c75 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 28 Jul 2022 22:00:29 +0200
Subject: [PATCH] add documentation for sanitizer interface

Also switches mkdocstrings to 0.18 with the rather unfortunate
consequence that now mkdocstrings-python-legacy is needed as well.
---
 docs/develop/Development-Environment.md  |  3 +-
 docs/develop/ICU-Tokenizer-Modules.md    | 82 ++++++++++++++++++++++++
 docs/extra.css                           | 13 ++--
 docs/mkdocs.yml                          |  3 +-
 nominatim/data/place_info.py             | 33 ++++++----
 nominatim/tokenizer/sanitizers/base.py   | 30 ++++++---
 nominatim/tokenizer/sanitizers/config.py | 71 +++++++++++++-------
 7 files changed, 185 insertions(+), 50 deletions(-)
 create mode 100644 docs/develop/ICU-Tokenizer-Modules.md

diff --git a/docs/develop/Development-Environment.md b/docs/develop/Development-Environment.md
index 6bb33f00..58f802f1 100644
--- a/docs/develop/Development-Environment.md
+++ b/docs/develop/Development-Environment.md
@@ -40,7 +40,8 @@ It has the following additional requirements:
 The documentation is built with mkdocs:
 
 * [mkdocs](https://www.mkdocs.org/) >= 1.1.2
-* [mkdocstrings](https://mkdocstrings.github.io/)
+* [mkdocstrings](https://mkdocstrings.github.io/) >= 0.16
+* [mkdocstrings-python-legacy](https://mkdocstrings.github.io/python-legacy/)
 
 ### Installing prerequisites on Ubuntu/Debian
 
diff --git a/docs/develop/ICU-Tokenizer-Modules.md b/docs/develop/ICU-Tokenizer-Modules.md
new file mode 100644
index 00000000..0578026c
--- /dev/null
+++ b/docs/develop/ICU-Tokenizer-Modules.md
@@ -0,0 +1,82 @@
+# Writing custom sanitizer and token analysis modules for the ICU tokenizer
+
+The [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer) provides a
+highly customizable method to pre-process and normalize the name information
+of the input data before it is added to the search index. It comes with a
+selection of sanitizers and token analyzers which you can use to adapt your
+installation to your needs. If the provided modules are not enough, you can
+also provide your own implementations. This section describes how to do that.
+
+## Using non-standard sanitizers and token analyzers
+
+Sanitizer names (in the `step` property) and token analysis names (in the
+`analyzer` property) may refer to externally supplied modules. There are two
+ways to include external modules: through a library or from the project
+directory.
+
+To include a module from a library, use the absolute import path as the name
+and make sure the library can be found in your PYTHONPATH.
+
+To use a custom module without creating a library, you can put the module
+somewhere in your project directory and then use the relative path to the
+file. Include the whole name of the file including the `.py` ending.
+
+## Custom sanitizer modules
+
+A sanitizer module must export a single factory function `create` with the
+following signature:
+
+``` python
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]
+```
+
+The function receives the custom configuration for the sanitizer and must
+return a callable (function or class) that transforms the name and address
+terms of a place. When a place is processed, a `ProcessInfo` object is
+created from the information that was queried from the database. This object
+is handed sequentially to each configured sanitizer, so that each sanitizer
+receives the result of processing from the previous sanitizer. After the
+last sanitizer is finished, the resulting name and address lists are
+forwarded to the token analysis module.
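+
+The following is a minimal sketch of such a module. The shown behaviour
+(stripping surrounding whitespace from every name) is made up purely for
+illustration; only the `create` entry point and the way the place object is
+accessed follow the interface described above.
+
+``` python
+def create(config):
+    """ Set up a sanitizer that removes leading and trailing whitespace
+        from all name entries of a place.
+    """
+    def _process(obj):
+        # obj.names is a list of PlaceName objects (see below); a
+        # sanitizer may modify the entries or replace the list entirely.
+        for name in obj.names:
+            name.name = name.name.strip()
+
+    return _process
+```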
+
+Sanitizer functions are instantiated once and then called for each place
+that is imported or updated. They don't need to be thread-safe.
+If multi-threading is used, each thread creates its own instance of
+the function.
+
+### Sanitizer configuration
+
+::: nominatim.tokenizer.sanitizers.config.SanitizerConfig
+    rendering:
+        show_source: no
+        heading_level: 6
+
+### The sanitizer function
+
+The sanitizer function receives a single object with three members:
+
+ * `place`: read-only information about the place being processed.
+   See PlaceInfo below.
+ * `names`: The current list of names for the place. Each name is a
+   PlaceName object.
+ * `address`: The current list of address names for the place. Each name
+   is a PlaceName object.
+
+While the `place` member is provided for information only, the `names` and
+`address` lists are meant to be manipulated by the sanitizer. It may add and
+remove entries, change information within a single entry (for example by
+adding extra attributes) or completely replace the list with a different one.
+
+#### PlaceInfo - information about the place
+
+::: nominatim.data.place_info.PlaceInfo
+    rendering:
+        show_source: no
+        heading_level: 6
+
+
+#### PlaceName - extended naming information
+
+::: nominatim.tokenizer.sanitizers.base.PlaceName
+    rendering:
+        show_source: no
+        heading_level: 6
diff --git a/docs/extra.css b/docs/extra.css
index 9289c1d3..3aecf2ef 100644
--- a/docs/extra.css
+++ b/docs/extra.css
@@ -14,10 +14,11 @@ th {
   background-color: #eee;
 }
 
-/* Indentation for mkdocstrings.
-div.doc-contents:not(.first) {
-    padding-left: 25px;
-    border-left: 4px solid rgba(230, 230, 230);
-    margin-bottom: 60px;
-}*/
+.doc-object h6 {
+    margin-bottom: 0.8em;
+    font-size: 120%;
+}
 
+.doc-object {
+    margin-bottom: 1.3em;
+}
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 48fe1d0d..43bb533d 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -39,6 +39,7 @@ nav:
         - 'Database Layout' : 'develop/Database-Layout.md'
         - 'Indexing' : 'develop/Indexing.md'
         - 'Tokenizers' : 'develop/Tokenizers.md'
+        - 'Custom modules for ICU tokenizer': 'develop/ICU-Tokenizer-Modules.md'
         - 'Setup for Development' : 'develop/Development-Environment.md'
         - 'Testing' : 'develop/Testing.md'
         - 'External Data Sources': 'develop/data-sources.md'
@@ -58,7 +59,7 @@ plugins:
     - search
     - mkdocstrings:
         handlers:
-          python:
+          python-legacy:
            rendering:
              show_source: false
              show_signature_annotations: false
diff --git a/nominatim/data/place_info.py b/nominatim/data/place_info.py
index 96912a61..ab895352 100644
--- a/nominatim/data/place_info.py
+++ b/nominatim/data/place_info.py
@@ -11,8 +11,8 @@ the tokenizer.
 from typing import Optional, Mapping, Any
 
 class PlaceInfo:
-    """ Data class containing all information the tokenizer gets about a
-        place it should process the names for.
+    """ This data class contains all information the tokenizer can access
+        about a place.
     """
 
     def __init__(self, info: Mapping[str, Any]) -> None:
@@ -21,16 +21,25 @@ class PlaceInfo:
 
     @property
     def name(self) -> Optional[Mapping[str, str]]:
-        """ A dictionary with the names of the place or None if the place
-            has no names.
+        """ A dictionary with the names of the place. Keys and values represent
+            the full key and value of the corresponding OSM tag. Which tags
+            are saved as names is determined by the import style.
+            The property may be None if the place has no names.
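+
+            For example, a place tagged `name=Main Street` and
+            `name:de=Hauptstraße` would yield
+            `{'name': 'Main Street', 'name:de': 'Hauptstraße'}`,
+            assuming the import style keeps both tags.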
""" return self._info.get('name') @property def address(self) -> Optional[Mapping[str, str]]: - """ A dictionary with the address elements of the place - or None if no address information is available. + """ A dictionary with the address elements of the place. They key + usually corresponds to the suffix part of the key of an OSM + 'addr:*' or 'isin:*' tag. There are also some special keys like + `country` or `country_code` which merge OSM keys that contain + the same information. See [Import Styles][1] for details. + + The property may be None if the place has no address information. + + [1]: ../customize/Import-Styles.md """ return self._info.get('address') @@ -38,28 +47,30 @@ class PlaceInfo: @property def country_code(self) -> Optional[str]: """ The country code of the country the place is in. Guaranteed - to be a two-letter lower-case string or None, if no country - could be found. + to be a two-letter lower-case string. If the place is not inside + any country, the property is set to None. """ return self._info.get('country_code') @property def rank_address(self) -> int: - """ The computed rank address before rank correction. + """ The [rank address][1] before ant rank correction is applied. + + [1]: ../customize/Ranking.md#address-rank """ return self._info.get('rank_address', 0) def is_a(self, key: str, value: str) -> bool: - """ Check if the place's primary tag corresponds to the given + """ Set to True when the place's primary tag corresponds to the given key and value. """ return self._info.get('class') == key and self._info.get('type') == value def is_country(self) -> bool: - """ Check if the place is a valid country boundary. + """ Set to True when the place is a valid country boundary. """ return self.rank_address == 4 \ and self.is_a('boundary', 'administrative') \ diff --git a/nominatim/tokenizer/sanitizers/base.py b/nominatim/tokenizer/sanitizers/base.py index 692c6d5f..09ea2dae 100644 --- a/nominatim/tokenizer/sanitizers/base.py +++ b/nominatim/tokenizer/sanitizers/base.py @@ -14,14 +14,20 @@ from nominatim.data.place_info import PlaceInfo from nominatim.typing import Protocol, Final class PlaceName: - """ A searchable name for a place together with properties. - Every name object saves the name proper and two basic properties: - * 'kind' describes the name of the OSM key used without any suffixes + """ Each name and address part of a place is encapsulated in an object of + this class. It saves not only the name proper but also describes the + kind of name with two properties: + + * `kind` describes the name of the OSM key used without any suffixes (i.e. the part after the colon removed) - * 'suffix' contains the suffix of the OSM tag, if any. The suffix + * `suffix` contains the suffix of the OSM tag, if any. The suffix is the part of the key after the first colon. - In addition to that, the name may have arbitrary additional attributes. - Which attributes are used, depends on the token analyser. + + In addition to that, a name may have arbitrary additional attributes. + How attributes are used, depends on the sanatizers and token analysers. + The exception is is the 'analyzer' attribute. This apptribute determines + which token analysis module will be used to finalize the treatment of + names. """ def __init__(self, name: str, kind: str, suffix: Optional[str]): @@ -113,7 +119,13 @@ class SanitizerHandler(Protocol): def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]: """ - A sanitizer must define a single function `create`. 
-        dictionary with the configuration information for the sanitizer and
-        returns a function that transforms name and address.
+        Create a function for sanitizing a place.
+
+        Arguments:
+            config: A dictionary with the additional configuration options
+                    specified in the tokenizer configuration
+
+        Return:
+            The result must be a callable that takes a place description
+            and transforms name and address as required.
         """
diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py
index f6abf20c..52cb2c04 100644
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -21,8 +21,8 @@ else:
     _BaseUserDict = UserDict
 
 class SanitizerConfig(_BaseUserDict):
-    """ Dictionary with configuration options for a sanitizer.
-
+    """ The `SanitizerConfig` class is a read-only dictionary
+        with configuration options for the sanitizer.
         In addition to the usual dictionary function, the class provides
         accessors to standard sanatizer options that are used by many of the
         sanitizers.
@@ -30,10 +30,16 @@
 
     def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
         """ Extract a configuration parameter as a string list.
-            If the parameter value is a simple string, it is returned as a
-            one-item list. If the parameter value does not exist, the given
-            default is returned. If the parameter value is a list, it is checked
-            to contain only strings before being returned.
+
+            Arguments:
+                param: Name of the configuration parameter.
+                default: Value to return when the parameter is missing.
+
+            Returns:
+                If the parameter value is a simple string, it is returned as a
+                one-item list. If the parameter value does not exist, the given
+                default is returned. If the parameter value is a list, it is
+                checked to contain only strings before being returned.
         """
         values = self.data.get(param, None)
 
@@ -54,9 +60,16 @@
 
     def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
         """ Extract a configuration parameter as a boolean.
-            The parameter must be one of the yaml boolean values or an
-            user error will be raised. If `default` is given, then the parameter
-            may also be missing or empty.
+
+            Arguments:
+                param: Name of the configuration parameter. The parameter must
+                       contain one of the yaml boolean values or a
+                       UsageError will be raised.
+                default: Value to return when the parameter is missing.
+                         When set to `None`, the parameter must be defined.
+
+            Returns:
+                Boolean value of the given parameter.
         """
         value = self.data.get(param, default)
 
@@ -67,15 +80,20 @@
 
     def get_delimiter(self, default: str = ',;') -> Pattern[str]:
-        """ Return the 'delimiter' parameter in the configuration as a
-            compiled regular expression that can be used to split the names on the
-            delimiters. The regular expression makes sure that the resulting names
-            are stripped and that repeated delimiters
-            are ignored but it will still create empty fields on occasion. The
-            code needs to filter those.
+        """ Return the 'delimiters' parameter in the configuration as a
+            compiled regular expression that can be used to split names on these
+            delimiters.
 
-            The 'default' parameter defines the delimiter set to be used when
-            not explicitly configured.
+            Arguments:
+                default: Delimiters to be used when the 'delimiters' parameter
+                         is not explicitly configured.
+
+            Returns:
+                A regular expression pattern, which can be used to
+                split a string. The regular expression makes sure that the
+                resulting names are stripped and that repeated delimiters
+                are ignored. It may still create empty fields on occasion. The
+                code needs to filter those.
         """
         delimiter_set = set(self.data.get('delimiters', default))
         if not delimiter_set:
@@ -86,13 +104,22 @@
 
     def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
         """ Return a filter function for the name kind from the 'filter-kind'
-            config parameter. The filter functions takes a name item and returns
-            True when the item passes the filter.
+            config parameter.
 
-            If the parameter is empty, the filter lets all items pass. If the
-            parameter is a string, it is interpreted as a single regular expression
-            that must match the full kind string. If the parameter is a list then
+            If the 'filter-kind' parameter is empty, the filter lets all items
+            pass. If the parameter is a string, it is interpreted as a single
+            regular expression that must match the full kind string.
+            If the parameter is a list then
             any of the regular expressions in the list must match to pass.
+
+            Arguments:
+                default: Filters to be used when the 'filter-kind' parameter
+                         is not specified. If omitted, the default is to
+                         let all names pass.
+
+            Returns:
+                A filter function that takes the kind string of a name and
+                returns True when the name passes the filter.
         """
         filters = self.get_string_list('filter-kind', default)