diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 9c2a67a1..8ae8cf39 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -1,30 +1,33 @@ -- Trigger functions for the placex table. +-- Information returned by update preparation. +DROP TYPE IF EXISTS prepare_update_info CASCADE; +CREATE TYPE prepare_update_info AS ( + name HSTORE, + address HSTORE, + rank_address SMALLINT, + country_code TEXT, + class TEXT, + type TEXT, + linked_place_id BIGINT +); + -- Retrieve the data needed by the indexer for updating the place. --- --- Return parameters: --- name list of names --- address list of address tags, either from the object or a surrounding --- building --- country_feature If the place is a country feature, this contains the --- country code, otherwise it is null. -CREATE OR REPLACE FUNCTION placex_prepare_update(p placex, - OUT name HSTORE, - OUT address HSTORE, - OUT country_feature VARCHAR, - OUT linked_place_id BIGINT) +CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex) + RETURNS prepare_update_info AS $$ DECLARE location RECORD; + result prepare_update_info; BEGIN -- For POI nodes, check if the address should be derived from a surrounding -- building. IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN - address := p.address; + result.address := p.address; ELSE -- The additional && condition works around the misguided query -- planner of postgis 3.0. - SELECT placex.address || hstore('_inherited', '') INTO address + SELECT placex.address || hstore('_inherited', '') INTO result.address FROM placex WHERE ST_Covers(geometry, p.centroid) and geometry && p.centroid @@ -34,27 +37,26 @@ BEGIN LIMIT 1; END IF; - address := address - '_unlisted_place'::TEXT; - name := p.name; + result.address := result.address - '_unlisted_place'::TEXT; + result.name := p.name; + result.class := p.class; + result.type := p.type; + result.country_code := p.country_code; + result.rank_address := p.rank_address; -- Names of linked places need to be merged in, so search for a linkable -- place already here. SELECT * INTO location FROM find_linked_place(p); IF location.place_id is not NULL THEN - linked_place_id := location.place_id; + result.linked_place_id := location.place_id; IF NOT location.name IS NULL THEN - name := location.name || name; + result.name := location.name || result.name; END IF; END IF; - country_feature := CASE WHEN p.admin_level = 2 - and p.class = 'boundary' and p.type = 'administrative' - and p.osm_type = 'R' - THEN p.country_code - ELSE null - END; + RETURN result; END; $$ LANGUAGE plpgsql STABLE; diff --git a/nominatim/indexer/place_info.py b/nominatim/indexer/place_info.py new file mode 100644 index 00000000..06d730e0 --- /dev/null +++ b/nominatim/indexer/place_info.py @@ -0,0 +1,68 @@ +""" +Wrapper around place information the indexer gets from the database and hands to +the tokenizer. +""" + +import psycopg2.extras + +class PlaceInfo: + """ Data class containing all information the tokenizer gets about a + place it should process the names for. + """ + + def __init__(self, info): + self._info = info + + + def analyze(self, analyzer): + """ Process this place with the given tokenizer and return the + result in psycopg2-compatible Json. + """ + return psycopg2.extras.Json(analyzer.process_place(self)) + + + @property + def name(self): + """ A dictionary with the names of the place or None if the place + has no names. 
+ """ + return self._info.get('name') + + + @property + def address(self): + """ A dictionary with the address elements of the place + or None if no address information is available. + """ + return self._info.get('address') + + + @property + def country_code(self): + """ The country code of the country the place is in. Guaranteed + to be a two-letter lower-case string or None, if no country + could be found. + """ + return self._info.get('country_code') + + + @property + def rank_address(self): + """ The computed rank address before rank correction. + """ + return self._info.get('rank_address') + + + def is_a(self, key, value): + """ Check if the place's primary tag corresponds to the given + key and value. + """ + return self._info.get('class') == key and self._info.get('type') == value + + + def is_country(self): + """ Check if the place is a valid country boundary. + """ + return self.rank_address == 4 \ + and self.is_a('boundary', 'administrative') \ + and self.country_code is not None diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py index 29261ee5..70536a71 100644 --- a/nominatim/indexer/runners.py +++ b/nominatim/indexer/runners.py @@ -4,14 +4,16 @@ tasks. """ import functools -import psycopg2.extras from psycopg2 import sql as pysql +from nominatim.indexer.place_info import PlaceInfo + # pylint: disable=C0111 def _mk_valuelist(template, num): return pysql.SQL(',').join([pysql.SQL(template)] * num) + class AbstractPlacexRunner: """ Returns SQL commands for indexing of the placex table. """ @@ -37,7 +39,7 @@ class AbstractPlacexRunner: @staticmethod def get_place_details(worker, ids): - worker.perform("""SELECT place_id, (placex_prepare_update(placex)).* + worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).* FROM placex WHERE place_id IN %s""", (tuple((p[0] for p in ids)), )) @@ -47,7 +49,7 @@ class AbstractPlacexRunner: for place in places: for field in ('place_id', 'name', 'address', 'linked_place_id'): values.append(place[field]) - values.append(psycopg2.extras.Json(self.analyzer.process_place(place))) + values.append(PlaceInfo(place).analyze(self.analyzer)) worker.perform(self._index_sql(len(places)), values) @@ -141,7 +143,7 @@ class InterpolationRunner: values = [] for place in places: values.extend((place[x] for x in ('place_id', 'address'))) - values.append(psycopg2.extras.Json(self.analyzer.process_place(place))) + values.append(PlaceInfo(place).analyze(self.analyzer)) worker.perform(self._index_sql(len(places)), values) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index 00ecae44..53289c78 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod from typing import List, Tuple, Dict, Any from nominatim.config import Configuration +from nominatim.indexer.place_info import PlaceInfo # pylint: disable=unnecessary-pass @@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC): @abstractmethod - def process_place(self, place: Dict) -> Any: + def process_place(self, place: PlaceInfo) -> Any: """ Extract tokens for the given place and compute the information to be handed to the PL/pgSQL processor for building the search index. Arguments: - place: Dictionary with the information about the place. Currently - the following fields may be present: - - - *name* is a dictionary of names for the place together - with the designation of the name. - - *address* is a dictionary of address terms. 
- - *country_feature* is set to a country code when the - place describes a country. + place: Place information retrived from the database. Returns: A JSON-serialisable structure that will be handed into @@ -142,7 +136,7 @@ class AbstractTokenizer(ABC): the tokenizer remains stable over updates. Arguments: - config: Read-only object with configuration obtions. + config: Read-only object with configuration options. init_db: When set to False, then initialisation of database tables should be skipped. This option is only required for @@ -155,11 +149,14 @@ class AbstractTokenizer(ABC): @abstractmethod - def init_from_project(self) -> None: + def init_from_project(self, config: Configuration) -> None: """ Initialise the tokenizer from an existing database setup. The function should load all previously saved configuration from the project directory and/or the property table. + + Arguments: + config: Read-only object with configuration options. """ pass @@ -172,7 +169,7 @@ class AbstractTokenizer(ABC): during query time. Arguments: - config: Read-only object with configuration obtions. + config: Read-only object with configuration options. """ pass @@ -187,13 +184,13 @@ class AbstractTokenizer(ABC): data structures or data itself must not be changed by this function. Arguments: - config: Read-only object with configuration obtions. + config: Read-only object with configuration options. """ pass @abstractmethod - def check_database(self) -> str: + def check_database(self, config: Configuration) -> str: """ Check that the database is set up correctly and ready for being queried. @@ -202,6 +199,9 @@ class AbstractTokenizer(ABC): description of the issue as well as hints for the user on how to resolve the issue. + Arguments: + config: Read-only object with configuration options. + Return `None`, if no issue was found. """ pass diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py index 069672d4..dc3e7411 100644 --- a/nominatim/tokenizer/factory.py +++ b/nominatim/tokenizer/factory.py @@ -85,6 +85,6 @@ def get_tokenizer_for_db(config): tokenizer_module = _import_tokenizer(name) tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir) - tokenizer.init_from_project() + tokenizer.init_from_project(config) return tokenizer diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py index 93d2b0ff..544f5ebc 100644 --- a/nominatim/tokenizer/icu_name_processor.py +++ b/nominatim/tokenizer/icu_name_processor.py @@ -8,67 +8,25 @@ import itertools from icu import Transliterator import datrie -from nominatim.db.properties import set_property, get_property -from nominatim.tokenizer import icu_variants as variants - -DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" -DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" -DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements" -DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization" - - -class ICUNameProcessorRules: - """ Data object that saves the rules needed for the name processor. - - The rules can either be initialised through an ICURuleLoader or - be loaded from a database when a connection is given. 
- """ - def __init__(self, loader=None, conn=None): - if loader is not None: - self.norm_rules = loader.get_normalization_rules() - self.trans_rules = loader.get_transliteration_rules() - self.replacements = loader.get_replacement_pairs() - self.search_rules = loader.get_search_rules() - elif conn is not None: - self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) - self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) - self.replacements = \ - variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) - self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES) - else: - assert False, "Parameter loader or conn required." - - - def save_rules(self, conn): - """ Save the rules in the property table of the given database. - the rules can be loaded again by handing in a connection into - the constructor of the class. - """ - set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules) - set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules) - set_property(conn, DBCFG_IMPORT_REPLACEMENTS, - variants.pickle_variant_set(self.replacements)) - set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules) - class ICUNameProcessor: """ Collects the different transformation rules for normalisation of names - and provides the functions to aply the transformations. + and provides the functions to apply the transformations. """ - def __init__(self, rules): + def __init__(self, norm_rules, trans_rules, replacements): self.normalizer = Transliterator.createFromRules("icu_normalization", - rules.norm_rules) + norm_rules) self.to_ascii = Transliterator.createFromRules("icu_to_ascii", - rules.trans_rules + + trans_rules + ";[:Space:]+ > ' '") self.search = Transliterator.createFromRules("icu_search", - rules.search_rules) + norm_rules + trans_rules) # Intermediate reorder by source. Also compute required character set. immediate = defaultdict(list) chars = set() - for variant in rules.replacements: + for variant in replacements: if variant.source[-1] == ' ' and variant.replacement[-1] == ' ': replstr = variant.replacement[:-1] else: diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py index 0e6e40b4..330179bb 100644 --- a/nominatim/tokenizer/icu_rule_loader.py +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -2,17 +2,26 @@ Helper class to create ICU rules from a configuration file. """ import io +import json import logging import itertools import re from icu import Transliterator +from nominatim.db.properties import set_property, get_property from nominatim.errors import UsageError +from nominatim.tokenizer.icu_name_processor import ICUNameProcessor +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer import nominatim.tokenizer.icu_variants as variants LOG = logging.getLogger() +DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" +DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" +DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules" + + def _flatten_config_list(content): if not content: return [] @@ -46,12 +55,52 @@ class ICURuleLoader: """ Compiler for ICU rules from a tokenizer configuration file. 
""" - def __init__(self, rules): + def __init__(self, config): + rules = config.load_sub_configuration('icu_tokenizer.yaml', + config='TOKENIZER_CONFIG') + self.variants = set() self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration') - self._parse_variant_list(self._get_section(rules, 'variants')) + self.analysis_rules = self._get_section(rules, 'variants') + self._parse_variant_list() + + # Load optional sanitizer rule set. + self.sanitizer_rules = rules.get('sanitizers', []) + + + def load_config_from_db(self, conn): + """ Get previously saved parts of the configuration from the + database. + """ + self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) + self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) + self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)) + self._parse_variant_list() + + + def save_config_to_db(self, conn): + """ Save the part of the configuration that cannot be changed into + the database. + """ + set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules) + set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules) + set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules)) + + + def make_sanitizer(self): + """ Create a place sanitizer from the configured rules. + """ + return PlaceSanitizer(self.sanitizer_rules) + + + def make_token_analysis(self): + """ Create a token analyser from the reviouly loaded rules. + """ + return ICUNameProcessor(self.normalization_rules, + self.transliteration_rules, + self.variants) def get_search_rules(self): @@ -112,7 +161,9 @@ class ICURuleLoader: return ';'.join(_flatten_config_list(content)) + ';' - def _parse_variant_list(self, rules): + def _parse_variant_list(self): + rules = self.analysis_rules + self.variants.clear() if not rules: diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 5768fd35..2ece10f2 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -13,8 +13,8 @@ from nominatim.db.connection import connect from nominatim.db.properties import set_property, get_property from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.indexer.place_info import PlaceInfo from nominatim.tokenizer.icu_rule_loader import ICURuleLoader -from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" @@ -36,7 +36,7 @@ class LegacyICUTokenizer(AbstractTokenizer): def __init__(self, dsn, data_dir): self.dsn = dsn self.data_dir = data_dir - self.naming_rules = None + self.loader = None self.term_normalization = None @@ -46,9 +46,8 @@ class LegacyICUTokenizer(AbstractTokenizer): This copies all necessary data in the project directory to make sure the tokenizer remains stable even over updates. 
""" - loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml', - config='TOKENIZER_CONFIG')) - self.naming_rules = ICUNameProcessorRules(loader=loader) + self.loader = ICURuleLoader(config) + self.term_normalization = config.TERM_NORMALIZATION self._install_php(config.lib_dir.php) @@ -59,11 +58,13 @@ class LegacyICUTokenizer(AbstractTokenizer): self._init_db_tables(config) - def init_from_project(self): + def init_from_project(self, config): """ Initialise the tokenizer from the project directory. """ + self.loader = ICURuleLoader(config) + with connect(self.dsn) as conn: - self.naming_rules = ICUNameProcessorRules(conn=conn) + self.loader.load_config_from_db(conn) self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) @@ -81,12 +82,12 @@ class LegacyICUTokenizer(AbstractTokenizer): sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql') - def check_database(self): + def check_database(self, config): """ Check that the tokenizer is set up correctly. """ - self.init_from_project() + self.init_from_project(config) - if self.naming_rules is None: + if self.term_normalization is None: return "Configuration for tokenizer 'icu' are missing." return None @@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer): Analyzers are not thread-safe. You need to instantiate one per thread. """ - return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) + return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(), + self.loader.make_token_analysis()) def _install_php(self, phpdir): @@ -118,7 +120,7 @@ class LegacyICUTokenizer(AbstractTokenizer): = 0: - full_names.add(name[:brace_idx].strip()) - - return full_names - - def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ @@ -511,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): postcode = self.normalize_postcode(postcode) if postcode not in self._cache.postcodes: - term = self.name_processor.get_search_normalized(postcode) + term = self._search_normalized(postcode) if not term: return diff --git a/nominatim/tokenizer/icu_variants.py b/nominatim/tokenizer/icu_variants.py index 9ebe3684..93272f58 100644 --- a/nominatim/tokenizer/icu_variants.py +++ b/nominatim/tokenizer/icu_variants.py @@ -2,7 +2,6 @@ Data structures for saving variant expansions for ICU tokenizer. """ from collections import namedtuple -import json _ICU_VARIANT_PORPERTY_FIELDS = ['lang'] @@ -24,34 +23,3 @@ class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORP ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties']) - - -def pickle_variant_set(variants): - """ Serializes an iterable of variant rules to a string. - """ - # Create a list of property sets. So they don't need to be duplicated - properties = {} - pid = 1 - for variant in variants: - if variant.properties not in properties: - properties[variant.properties] = pid - pid += 1 - - # Convert the variants into a simple list. - variants = [(v.source, v.replacement, properties[v.properties]) for v in variants] - - # Convert everythin to json. - return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()}, - 'variants': variants}) - - -def unpickle_variant_set(variant_string): - """ Deserializes a variant string that was previously created with - pickle_variant_set() into a set of ICUVariants. 
- """ - data = json.loads(variant_string) - - properties = {int(k): ICUVariantProperties.from_rules(v) - for k, v in data['properties'].items()} - - return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants'])) diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 8957426b..c935f20d 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -113,7 +113,7 @@ class LegacyTokenizer(AbstractTokenizer): self._init_db_tables(config) - def init_from_project(self): + def init_from_project(self, _): """ Initialise the tokenizer from the project directory. """ with connect(self.dsn) as conn: @@ -142,7 +142,7 @@ class LegacyTokenizer(AbstractTokenizer): modulepath=modulepath) - def check_database(self): + def check_database(self, _): """ Check that the tokenizer is set up correctly. """ hint = """\ @@ -405,16 +405,15 @@ class LegacyNameAnalyzer(AbstractAnalyzer): """ token_info = _TokenInfo(self._cache) - names = place.get('name') + names = place.name if names: token_info.add_names(self.conn, names) - country_feature = place.get('country_feature') - if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), names) + if place.is_country(): + self.add_country_names(place.country_code, names) - address = place.get('address') + address = place.address if address: self._process_place_address(token_info, address) diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py new file mode 100644 index 00000000..5961dcf0 --- /dev/null +++ b/nominatim/tokenizer/place_sanitizer.py @@ -0,0 +1,127 @@ +""" +Handler for cleaning name and address tags in place information before it +is handed to the token analysis. +""" +import importlib + +from nominatim.errors import UsageError + +class PlaceName: + """ A searchable name for a place together with properties. + Every name object saves the name proper and two basic properties: + * 'kind' describes the name of the OSM key used without any suffixes + (i.e. the part after the colon removed) + * 'suffix' contains the suffix of the OSM tag, if any. The suffix + is the part of the key after the first colon. + In addition to that, the name may have arbitrary additional attributes. + Which attributes are used, depends on the token analyser. + """ + + def __init__(self, name, kind, suffix): + self.name = name + self.kind = kind + self.suffix = suffix + self.attr = {} + + + def __repr__(self): + return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')" + + + def clone(self, name=None, kind=None, suffix=None, attr=None): + """ Create a deep copy of the place name, optionally with the + given parameters replaced. In the attribute list only the given + keys are updated. The list is not replaced completely. + In particular, the function cannot to be used to remove an + attribute from a place name. + """ + newobj = PlaceName(name or self.name, + kind or self.kind, + suffix or self.suffix) + + newobj.attr.update(self.attr) + if attr: + newobj.attr.update(attr) + + return newobj + + + def set_attr(self, key, value): + """ Add the given property to the name. If the property was already + set, then the value is overwritten. + """ + self.attr[key] = value + + + def get_attr(self, key, default=None): + """ Return the given property or the value of 'default' if it + is not set. 
+ """ + return self.attr.get(key, default) + + + def has_attr(self, key): + """ Check if the given attribute is set. + """ + return key in self.attr + + +class _ProcessInfo: + """ Container class for information handed into to handler functions. + The 'names' and 'address' members are mutable. A handler must change + them by either modifying the lists place or replacing the old content + with a new list. + """ + + def __init__(self, place): + self.place = place + self.names = self._convert_name_dict(place.name) + self.address = self._convert_name_dict(place.address) + + + @staticmethod + def _convert_name_dict(names): + """ Convert a dictionary of names into a list of PlaceNames. + The dictionary key is split into the primary part of the key + and the suffix (the part after an optional colon). + """ + out = [] + + if names: + for key, value in names.items(): + parts = key.split(':', 1) + out.append(PlaceName(value.strip(), + parts[0].strip(), + parts[1].strip() if len(parts) > 1 else None)) + + return out + + +class PlaceSanitizer: + """ Controller class which applies sanitizer functions on the place + names and address before they are used by the token analysers. + """ + + def __init__(self, rules): + self.handlers = [] + + if rules: + for func in rules: + if 'step' not in func: + raise UsageError("Sanitizer rule is missing the 'step' attribute.") + module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_') + handler_module = importlib.import_module(module_name) + self.handlers.append(handler_module.create(func)) + + + def process_names(self, place): + """ Extract a sanitized list of names and address parts from the + given place. The function returns a tuple + (list of names, list of address names) + """ + obj = _ProcessInfo(place) + + for func in self.handlers: + func(obj) + + return obj.names, obj.address diff --git a/nominatim/tokenizer/sanitizers/__init__.py b/nominatim/tokenizer/sanitizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py new file mode 100644 index 00000000..f1514203 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/split_name_list.py @@ -0,0 +1,35 @@ +""" +Name processor that splits name values with multiple values into their components. +""" +import re + +from nominatim.errors import UsageError + +def create(func): + """ Create a name processing function that splits name values with + multiple values into their components. The optional parameter + 'delimiters' can be used to define the characters that should be used + for splitting. The default is ',;'. 
+ """ + delimiter_set = set(func.get('delimiters', ',;')) + if not delimiter_set: + raise UsageError("Set of delimiters in split-name-list sanitizer is empty.") + + regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set))) + + def _process(obj): + if not obj.names: + return + + new_names = [] + for name in obj.names: + split_names = regexp.split(name.name) + print(split_names) + if len(split_names) == 1: + new_names.append(name) + else: + new_names.extend(name.clone(name=n) for n in split_names if n) + + obj.names = new_names + + return _process diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py new file mode 100644 index 00000000..ec91bac9 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py @@ -0,0 +1,22 @@ +""" +Sanitizer handling names with addendums in braces. +""" + +def create(_): + """ Create a name processing function that creates additional name variants + when a name has an addendum in brackets (e.g. "Halle (Saale)"). The + additional variant only contains the main name without the bracket part. + """ + def _process(obj): + """ Add variants for names that have a bracket extension. + """ + if obj.names: + new_names = [] + for name in (n for n in obj.names if '(' in n.name): + new_name = name.name.split('(')[0].strip() + if new_name: + new_names.append(name.clone(name=new_name)) + + obj.names.extend(new_names) + + return _process diff --git a/nominatim/tools/check_database.py b/nominatim/tools/check_database.py index d116554f..30b27d1f 100644 --- a/nominatim/tools/check_database.py +++ b/nominatim/tools/check_database.py @@ -166,7 +166,7 @@ def check_tokenizer(_, config): return CheckState.FAIL, dict(msg="""\ Cannot load tokenizer. Did the import finish sucessfully?""") - result = tokenizer.check_database() + result = tokenizer.check_database(config) if result is None: return CheckState.OK diff --git a/nominatim/tools/tiger_data.py b/nominatim/tools/tiger_data.py index ff498f77..19a12682 100644 --- a/nominatim/tools/tiger_data.py +++ b/nominatim/tools/tiger_data.py @@ -7,12 +7,11 @@ import logging import os import tarfile -import psycopg2.extras - from nominatim.db.connection import connect from nominatim.db.async_connection import WorkerPool from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.errors import UsageError +from nominatim.indexer.place_info import PlaceInfo LOG = logging.getLogger() @@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer): address = dict(street=row['street'], postcode=row['postcode']) args = ('SRID=4326;' + row['geometry'], int(row['from']), int(row['to']), row['interpolation'], - psycopg2.extras.Json(analyzer.process_place(dict(address=address))), + PlaceInfo({'address': address}).analyze(analyzer), analyzer.normalize_postcode(row['postcode'])) except ValueError: continue diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index c0c8c043..08b7a7ff 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -24,6 +24,9 @@ transliteration: - "[^[:Ascii:]] >" - ":: lower ()" - ":: NFC ()" +sanitizers: + - step: split-name-list + - step: strip-brace-terms variants: - !include icu-rules/variants-bg.yaml - !include icu-rules/variants-ca.yaml diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py index 69202bc3..6029eb7c 100644 --- a/test/python/dummy_tokenizer.py +++ b/test/python/dummy_tokenizer.py @@ -1,6 +1,8 @@ """ Tokenizer for testing. 
""" +from nominatim.indexer.place_info import PlaceInfo +from nominatim.config import Configuration def create(dsn, data_dir): """ Create a new instance of the tokenizer provided by this module. @@ -21,7 +23,8 @@ class DummyTokenizer: self.init_state = "new" - def init_from_project(self): + def init_from_project(self, config): + assert isinstance(config, Configuration) assert self.init_state is None self.init_state = "loaded" @@ -68,4 +71,5 @@ class DummyNameAnalyzer: @staticmethod def process_place(place): + assert isinstance(place, PlaceInfo) return {} diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index 41978e59..00c29a43 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -100,6 +100,6 @@ def test_get_pg_env_overwrite_variable(monkeypatch): def test_get_pg_env_ignore_unknown(): - env = get_pg_env('tty=stuff', base_env={}) + env = get_pg_env('client_encoding=stuff', base_env={}) assert env == {} diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py index 60ad0bc4..4c9d940d 100644 --- a/test/python/test_indexing.py +++ b/test/python/test_indexing.py @@ -29,6 +29,7 @@ class IndexerTestDB: indexed_date TIMESTAMP, partition SMALLINT, admin_level SMALLINT, + country_code TEXT, address HSTORE, token_info JSONB, geometry_sector INTEGER)""") @@ -54,15 +55,26 @@ class IndexerTestDB: END IF; RETURN NEW; END; $$ LANGUAGE plpgsql;""") - cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex, - OUT name HSTORE, - OUT address HSTORE, - OUT country_feature VARCHAR, - OUT linked_place_id BIGINT) + cur.execute("DROP TYPE IF EXISTS prepare_update_info CASCADE") + cur.execute("""CREATE TYPE prepare_update_info AS ( + name HSTORE, + address HSTORE, + rank_address SMALLINT, + country_code TEXT, + class TEXT, + type TEXT, + linked_place_id BIGINT + )""") + cur.execute("""CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex, + OUT result prepare_update_info) AS $$ BEGIN - address := p.address; - name := p.name; + result.address := p.address; + result.name := p.name; + result.class := p.class; + result.type := p.type; + result.country_code := p.country_code; + result.rank_address := p.rank_address; END; $$ LANGUAGE plpgsql STABLE; """) diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py index ed079269..9a6f5a94 100644 --- a/test/python/test_tokenizer_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -7,10 +7,10 @@ import yaml import pytest from nominatim.tokenizer import icu_tokenizer -from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.db import properties from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.indexer.place_info import PlaceInfo from mock_icu_word_table import MockIcuWordTable @@ -67,11 +67,14 @@ def analyzer(tokenizer_factory, test_config, monkeypatch, monkeypatch.undo() def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',), - variants=('~gasse -> gasse', 'street => st', )): + variants=('~gasse -> gasse', 'street => st', ), + sanitizers=[]): cfgstr = {'normalization' : list(norm), - 'transliteration' : list(trans), - 'variants' : [ {'words': list(variants)}]} - tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr)) + 'sanitizers' : sanitizers, + 'transliteration' : list(trans), + 'variants' : [ {'words': list(variants)}]} + (test_config.project_dir / 
'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr)) + tok.loader = ICURuleLoader(test_config) return tok.name_analyzer() @@ -177,9 +180,9 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory): monkeypatch.undo() tok = tokenizer_factory() - tok.init_from_project() + tok.init_from_project(test_config) - assert tok.naming_rules is not None + assert tok.loader is not None assert tok.term_normalization == ':: lower();' @@ -308,44 +311,54 @@ class TestPlaceNames: @pytest.fixture(autouse=True) def setup(self, analyzer, sql_functions): - with analyzer() as anl: + sanitizers = [{'step': 'split-name-list'}, + {'step': 'strip-brace-terms'}] + with analyzer(sanitizers=sanitizers) as anl: self.analyzer = anl yield anl def expect_name_terms(self, info, *expected_terms): tokens = self.analyzer.get_word_token_info(expected_terms) - print (tokens) for token in tokens: assert token[2] is not None, "No token for {0}".format(token) assert eval(info['names']) == set((t[2] for t in tokens)) + def process_named_place(self, names): + return self.analyzer.process_place(PlaceInfo({'name': names})) + + def test_simple_names(self): - info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}}) + info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'}) self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34') @pytest.mark.parametrize('sep', [',' , ';']) def test_names_with_separator(self, sep): - info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}}) + info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))}) self.expect_name_terms(info, '#New York', '#Big Apple', 'new', 'york', 'big', 'apple') def test_full_names_with_bracket(self): - info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}}) + info = self.process_named_place({'name': 'Houseboat (left)'}) self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat', 'houseboat', 'left') def test_country_name(self, word_table): - info = self.analyzer.process_place({'name': {'name': 'Norge'}, - 'country_feature': 'no'}) + place = PlaceInfo({'name' : {'name': 'Norge'}, + 'country_code': 'no', + 'rank_address': 4, + 'class': 'boundary', + 'type': 'administrative'}) + + info = self.analyzer.process_place(place) self.expect_name_terms(info, '#norge', 'norge') assert word_table.get_country() == {('no', 'NORGE')} @@ -361,7 +374,7 @@ class TestPlaceAddress: def process_address(self, **kwargs): - return self.analyzer.process_place({'address': kwargs}) + return self.analyzer.process_place(PlaceInfo({'address': kwargs})) def name_token_set(self, *expected_terms): diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py index ae05988a..d0ed21ec 100644 --- a/test/python/test_tokenizer_icu_name_processor.py +++ b/test/python/test_tokenizer_icu_name_processor.py @@ -4,15 +4,17 @@ Tests for import name normalisation and variant generation. 
from textwrap import dedent import pytest -import yaml from nominatim.tokenizer.icu_rule_loader import ICURuleLoader -from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules from nominatim.errors import UsageError @pytest.fixture -def cfgfile(): +def cfgfile(def_config, tmp_path): + project_dir = tmp_path / 'project_dir' + project_dir.mkdir() + def_config.project_dir = project_dir + def _create_config(*variants, **kwargs): content = dedent("""\ normalization: @@ -30,7 +32,9 @@ def cfgfile(): content += '\n'.join((" - " + s for s in variants)) + '\n' for k, v in kwargs: content += " {}: {}\n".format(k, v) - return yaml.safe_load(content) + (project_dir / 'icu_tokenizer.yaml').write_text(content) + + return def_config return _create_config @@ -40,10 +44,9 @@ def get_normalized_variants(proc, name): def test_variants_empty(cfgfile): - fpath = cfgfile('saint -> 🜵', 'street -> st') + config = cfgfile('saint -> 🜵', 'street -> st') - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) + proc = ICURuleLoader(config).make_token_analysis() assert get_normalized_variants(proc, '🜵') == [] assert get_normalized_variants(proc, '🜳') == [] @@ -83,8 +86,8 @@ VARIANT_TESTS = [ @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS) def test_variants(cfgfile, rules, name, variants): - fpath = cfgfile(*rules) - proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath))) + config = cfgfile(*rules) + proc = ICURuleLoader(config).make_token_analysis() result = get_normalized_variants(proc, name) @@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants): def test_search_normalized(cfgfile): - fpath = cfgfile('~street => s,st', 'master => mstr') - - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) + config = cfgfile('~street => s,st', 'master => mstr') + proc = ICURuleLoader(config).make_token_analysis() assert proc.get_search_normalized('Master Street') == 'master street' assert proc.get_search_normalized('Earnes St') == 'earnes st' diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py index c3480de8..6ec53edc 100644 --- a/test/python/test_tokenizer_icu_rule_loader.py +++ b/test/python/test_tokenizer_icu_rule_loader.py @@ -12,7 +12,16 @@ from nominatim.errors import UsageError from icu import Transliterator @pytest.fixture -def cfgrules(): +def test_config(def_config, tmp_path): + project_dir = tmp_path / 'project_dir' + project_dir.mkdir() + def_config.project_dir = project_dir + + return def_config + + +@pytest.fixture +def cfgrules(test_config): def _create_config(*variants, **kwargs): content = dedent("""\ normalization: @@ -29,19 +38,21 @@ def cfgrules(): content += '\n'.join((" - " + s for s in variants)) + '\n' for k, v in kwargs: content += " {}: {}\n".format(k, v) - return yaml.safe_load(content) + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content) + + return test_config return _create_config -def test_empty_rule_set(): - rule_cfg = yaml.safe_load(dedent("""\ +def test_empty_rule_set(test_config): + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\ normalization: transliteration: variants: """)) - rules = ICURuleLoader(rule_cfg) + rules = ICURuleLoader(test_config) assert rules.get_search_rules() == '' assert rules.get_normalization_rules() == '' assert rules.get_transliteration_rules() == '' @@ -50,11 +61,12 @@ def test_empty_rule_set(): CONFIG_SECTIONS = 
('normalization', 'transliteration', 'variants') @pytest.mark.parametrize("section", CONFIG_SECTIONS) -def test_missing_section(section): +def test_missing_section(section, test_config): rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section} + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg)) with pytest.raises(UsageError): - ICURuleLoader(rule_cfg) + ICURuleLoader(test_config) def test_get_search_rules(cfgrules): @@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules): assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt " -def test_transliteration_rules_from_file(def_config, tmp_path): - def_config.project_dir = tmp_path - cfgpath = tmp_path / ('test_config.yaml') +def test_transliteration_rules_from_file(test_config): + cfgpath = test_config.project_dir / ('icu_tokenizer.yaml') cfgpath.write_text(dedent("""\ normalization: transliteration: @@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path): - !include transliteration.yaml variants: """)) - transpath = tmp_path / ('transliteration.yaml') + transpath = test_config.project_dir / ('transliteration.yaml') transpath.write_text('- "x > y"') - loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml')) + loader = ICURuleLoader(test_config) rules = loader.get_transliteration_rules() trans = Transliterator.createFromRules("test", rules) diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py index 4dd3a141..53d45c1c 100644 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@ -5,6 +5,7 @@ import shutil import pytest +from nominatim.indexer.place_info import PlaceInfo from nominatim.tokenizer import legacy_tokenizer from nominatim.db import properties from nominatim.errors import UsageError @@ -131,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config, assert not (test_config.project_dir / 'module').exists() -def test_init_from_project(tokenizer_setup, tokenizer_factory): +def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config): tok = tokenizer_factory() - tok.init_from_project() + tok.init_from_project(test_config) assert tok.normalization is not None @@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name): def test_process_place_names(analyzer, make_keywords): - info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}}) + info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}})) assert info['names'] == '{1,2,3}' @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345']) def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode): - analyzer.process_place({'address': {'postcode' : pcode}}) + analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}})) assert word_table.get_postcodes() == {pcode, } @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836']) def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode): - analyzer.process_place({'address': {'postcode' : pcode}}) + analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}})) assert not word_table.get_postcodes() @@ -319,7 +320,7 @@ class TestHousenumberName: @staticmethod @pytest.mark.parametrize('hnr', ['123a', '1', '101']) def test_process_place_housenumbers_simple(analyzer, hnr): - info = analyzer.process_place({'address': {'housenumber' : hnr}}) + info = analyzer.process_place(PlaceInfo({'address': 
{'housenumber' : hnr}})) assert info['hnr'] == hnr assert info['hnr_tokens'].startswith("{") @@ -327,15 +328,15 @@ class TestHousenumberName: @staticmethod def test_process_place_housenumbers_lists(analyzer): - info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}}) + info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}})) assert set(info['hnr'].split(';')) == set(('1', '2', '3')) @staticmethod def test_process_place_housenumbers_duplicates(analyzer): - info = analyzer.process_place({'address': {'housenumber' : '134', + info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134', 'conscriptionnumber' : '134', - 'streetnumber' : '99a'}}) + 'streetnumber' : '99a'}})) assert set(info['hnr'].split(';')) == set(('134', '99a')) diff --git a/test/python/test_tools_check_database.py b/test/python/test_tools_check_database.py index aed5cb7e..edba3236 100644 --- a/test/python/test_tools_check_database.py +++ b/test/python/test_tools_check_database.py @@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch, check_result, state): class _TestTokenizer: @staticmethod - def check_database(): + def check_database(_): return check_result monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db', diff --git a/test/python/tokenizer/sanitizers/test_split_name_list.py b/test/python/tokenizer/sanitizers/test_split_name_list.py new file mode 100644 index 00000000..ee745469 --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_split_name_list.py @@ -0,0 +1,65 @@ +""" +Tests for the sanitizer that splitts multivalue lists. +""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.indexer.place_info import PlaceInfo + +from nominatim.errors import UsageError + +def run_sanitizer_on(**kwargs): + place = PlaceInfo({'name': kwargs}) + name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place) + + return sorted([(p.name, p.kind, p.suffix) for p in name]) + + +def sanitize_with_delimiter(delimiter, name): + place = PlaceInfo({'name': {'name': name}}) + san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}]) + name, _ = san.process_names(place) + + return sorted([p.name for p in name]) + + +def test_simple(): + assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)] + assert run_sanitizer_on(name='') == [('', 'name', None)] + + +def test_splits(): + assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None), + ('B', 'name', None), + ('C', 'name', None)] + assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None), + ('boat', 'short_name', None)] + + +def test_empty_fields(): + assert run_sanitizer_on(name='A;;B') == [('A', 'name', None), + ('B', 'name', None)] + assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None), + ('B', 'name', None)] + assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)] + assert run_sanitizer_on(name='B,') == [('B', 'name', None)] + + +def test_custom_delimiters(): + assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3'] + assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@'] + assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to'] + assert sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun'] + + +def test_empty_delimiter_set(): + with pytest.raises(UsageError): + sanitize_with_delimiter('', 'abc') + + +def test_no_name_list(): + place = PlaceInfo({'address': {'housenumber': '3'}}) + name, address = 
PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place) + + assert not name + assert len(address) == 1 diff --git a/test/python/tokenizer/sanitizers/test_strip_brace_terms.py b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py new file mode 100644 index 00000000..50af2449 --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py @@ -0,0 +1,44 @@ +""" +Tests for the sanitizer that handles braced suffixes. +""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.indexer.place_info import PlaceInfo + +def run_sanitizer_on(**kwargs): + place = PlaceInfo({'name': kwargs}) + name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place) + + return sorted([(p.name, p.kind, p.suffix) for p in name]) + + +def test_no_braces(): + assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None), + ('foo', 'name', None)] + + +def test_simple_braces(): + assert run_sanitizer_on(name='Halle (Saale)', ref='3')\ + == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)] + assert run_sanitizer_on(name='ack ( bar')\ + == [('ack', 'name', None), ('ack ( bar', 'name', None)] + + +def test_only_braces(): + assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)] + + +def test_double_braces(): + assert run_sanitizer_on(name='a((b))') == [('a', 'name', None), + ('a((b))', 'name', None)] + assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None), + ('a (b) (c)', 'name', None)] + + +def test_no_names(): + place = PlaceInfo({'address': {'housenumber': '3'}}) + name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place) + + assert not name + assert len(address) == 1 diff --git a/test/python/tokenizer/test_place_sanitizer.py b/test/python/tokenizer/test_place_sanitizer.py new file mode 100644 index 00000000..389b068c --- /dev/null +++ b/test/python/tokenizer/test_place_sanitizer.py @@ -0,0 +1,71 @@ +""" +Tests for execution of the sanitztion step. 
+""" +import pytest + +from nominatim.errors import UsageError +import nominatim.tokenizer.place_sanitizer as sanitizer +from nominatim.indexer.place_info import PlaceInfo + + +def test_placeinfo_clone_new_name(): + place = sanitizer.PlaceName('foo', 'ki', 'su') + + newplace = place.clone(name='bar') + + assert place.name == 'foo' + assert newplace.name == 'bar' + assert newplace.kind == 'ki' + assert newplace.suffix == 'su' + + +def test_placeinfo_clone_merge_attr(): + place = sanitizer.PlaceName('foo', 'ki', 'su') + place.set_attr('a1', 'v1') + place.set_attr('a2', 'v2') + + newplace = place.clone(attr={'a2': 'new', 'b2': 'foo'}) + + assert place.get_attr('a2') == 'v2' + assert place.get_attr('b2') is None + assert newplace.get_attr('a1') == 'v1' + assert newplace.get_attr('a2') == 'new' + assert newplace.get_attr('b2') == 'foo' + + +def test_placeinfo_has_attr(): + place = sanitizer.PlaceName('foo', 'ki', 'su') + place.set_attr('a1', 'v1') + + assert place.has_attr('a1') + assert not place.has_attr('whatever') + + +def test_sanitizer_default(): + san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}]) + + name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}, + 'address': {'street': 'Bald'}})) + + assert len(name) == 3 + assert all(isinstance(n, sanitizer.PlaceName) for n in name) + assert all(n.kind == 'name' for n in name) + assert all(n.suffix == 'de:de' for n in name) + + assert len(address) == 1 + assert all(isinstance(n, sanitizer.PlaceName) for n in address) + + +@pytest.mark.parametrize('rules', [None, []]) +def test_sanitizer_empty_list(rules): + san = sanitizer.PlaceSanitizer(rules) + + name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}})) + + assert len(name) == 1 + assert all(isinstance(n, sanitizer.PlaceName) for n in name) + + +def test_sanitizer_missing_step_definition(): + with pytest.raises(UsageError): + san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}])