Merge pull request #2458 from lonvia/add-tokenizer-preprocessing

Add a "sanitation" step for name and address tags before token processing
Sarah Hoffmann 2021-10-01 21:53:34 +02:00 committed by GitHub
commit 19d4e047f6
28 changed files with 726 additions and 262 deletions

View File

@ -1,30 +1,33 @@
-- Trigger functions for the placex table.
-- Information returned by update preparation.
DROP TYPE IF EXISTS prepare_update_info CASCADE;
CREATE TYPE prepare_update_info AS (
name HSTORE,
address HSTORE,
rank_address SMALLINT,
country_code TEXT,
class TEXT,
type TEXT,
linked_place_id BIGINT
);
-- Retrieve the data needed by the indexer for updating the place.
--
-- Return parameters:
-- name list of names
-- address list of address tags, either from the object or a surrounding
-- building
-- country_feature If the place is a country feature, this contains the
-- country code, otherwise it is null.
CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
OUT name HSTORE,
OUT address HSTORE,
OUT country_feature VARCHAR,
OUT linked_place_id BIGINT)
CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex)
RETURNS prepare_update_info
AS $$
DECLARE
location RECORD;
result prepare_update_info;
BEGIN
-- For POI nodes, check if the address should be derived from a surrounding
-- building.
IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
address := p.address;
result.address := p.address;
ELSE
-- The additional && condition works around the misguided query
-- planner of postgis 3.0.
SELECT placex.address || hstore('_inherited', '') INTO address
SELECT placex.address || hstore('_inherited', '') INTO result.address
FROM placex
WHERE ST_Covers(geometry, p.centroid)
and geometry && p.centroid
@ -34,27 +37,26 @@ BEGIN
LIMIT 1;
END IF;
address := address - '_unlisted_place'::TEXT;
name := p.name;
result.address := result.address - '_unlisted_place'::TEXT;
result.name := p.name;
result.class := p.class;
result.type := p.type;
result.country_code := p.country_code;
result.rank_address := p.rank_address;
-- Names of linked places need to be merged in, so search for a linkable
-- place already here.
SELECT * INTO location FROM find_linked_place(p);
IF location.place_id is not NULL THEN
linked_place_id := location.place_id;
result.linked_place_id := location.place_id;
IF NOT location.name IS NULL THEN
name := location.name || name;
result.name := location.name || result.name;
END IF;
END IF;
country_feature := CASE WHEN p.admin_level = 2
and p.class = 'boundary' and p.type = 'administrative'
and p.osm_type = 'R'
THEN p.country_code
ELSE null
END;
RETURN result;
END;
$$
LANGUAGE plpgsql STABLE;

View File

@ -0,0 +1,68 @@
"""
Wrapper around place information the indexer gets from the database and hands to
the tokenizer.
"""
import psycopg2.extras
class PlaceInfo:
""" Data class containing all information the tokenizer gets about a
place it should process the names for.
"""
def __init__(self, info):
self._info = info
def analyze(self, analyzer):
""" Process this place with the given tokenizer and return the
result in psycopg2-compatible Json.
"""
return psycopg2.extras.Json(analyzer.process_place(self))
@property
def name(self):
""" A dictionary with the names of the place or None if the place
has no names.
"""
return self._info.get('name')
@property
def address(self):
""" A dictionary with the address elements of the place
or None if no address information is available.
"""
return self._info.get('address')
@property
def country_code(self):
""" The country code of the country the place is in. Guaranteed
to be a two-letter lower-case string or None if no country
could be found.
"""
return self._info.get('country_code')
@property
def rank_address(self):
""" The computed rank address before rank correction.
"""
return self._info.get('rank_address')
def is_a(self, key, value):
""" Check if the place's primary tag corresponds to the given
key and value.
"""
return self._info.get('class') == key and self._info.get('type') == value
def is_country(self):
""" Check if the place is a valid country boundary.
"""
return self.rank_address == 4 \
and self.is_a('boundary', 'administrative') \
and self.country_code is not None

View File

@ -4,14 +4,16 @@ tasks.
"""
import functools
import psycopg2.extras
from psycopg2 import sql as pysql
from nominatim.indexer.place_info import PlaceInfo
# pylint: disable=C0111
def _mk_valuelist(template, num):
return pysql.SQL(',').join([pysql.SQL(template)] * num)
class AbstractPlacexRunner:
""" Returns SQL commands for indexing of the placex table.
"""
@ -37,7 +39,7 @@ class AbstractPlacexRunner:
@staticmethod
def get_place_details(worker, ids):
worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).*
FROM placex WHERE place_id IN %s""",
(tuple((p[0] for p in ids)), ))
@ -47,7 +49,7 @@ class AbstractPlacexRunner:
for place in places:
for field in ('place_id', 'name', 'address', 'linked_place_id'):
values.append(place[field])
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
values.append(PlaceInfo(place).analyze(self.analyzer))
worker.perform(self._index_sql(len(places)), values)
@ -141,7 +143,7 @@ class InterpolationRunner:
values = []
for place in places:
values.extend((place[x] for x in ('place_id', 'address')))
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
values.append(PlaceInfo(place).analyze(self.analyzer))
worker.perform(self._index_sql(len(places)), values)

View File

@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any
from nominatim.config import Configuration
from nominatim.indexer.place_info import PlaceInfo
# pylint: disable=unnecessary-pass
@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC):
@abstractmethod
def process_place(self, place: Dict) -> Any:
def process_place(self, place: PlaceInfo) -> Any:
""" Extract tokens for the given place and compute the
information to be handed to the PL/pgSQL processor for building
the search index.
Arguments:
place: Dictionary with the information about the place. Currently
the following fields may be present:
- *name* is a dictionary of names for the place together
with the designation of the name.
- *address* is a dictionary of address terms.
- *country_feature* is set to a country code when the
place describes a country.
place: Place information retrieved from the database.
Returns:
A JSON-serialisable structure that will be handed into
@ -142,7 +136,7 @@ class AbstractTokenizer(ABC):
the tokenizer remains stable over updates.
Arguments:
config: Read-only object with configuration obtions.
config: Read-only object with configuration options.
init_db: When set to False, then initialisation of database
tables should be skipped. This option is only required for
@ -155,11 +149,14 @@ class AbstractTokenizer(ABC):
@abstractmethod
def init_from_project(self) -> None:
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
The function should load all previously saved configuration from
the project directory and/or the property table.
Arguments:
config: Read-only object with configuration options.
"""
pass
@ -172,7 +169,7 @@ class AbstractTokenizer(ABC):
during query time.
Arguments:
config: Read-only object with configuration obtions.
config: Read-only object with configuration options.
"""
pass
@ -187,13 +184,13 @@ class AbstractTokenizer(ABC):
data structures or data itself must not be changed by this function.
Arguments:
config: Read-only object with configuration obtions.
config: Read-only object with configuration options.
"""
pass
@abstractmethod
def check_database(self) -> str:
def check_database(self, config: Configuration) -> str:
""" Check that the database is set up correctly and ready for being
queried.
@ -202,6 +199,9 @@ class AbstractTokenizer(ABC):
description of the issue as well as hints for the user on
how to resolve the issue.
Arguments:
config: Read-only object with configuration options.
Return `None` if no issue was found.
"""
pass

View File

@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_from_project()
tokenizer.init_from_project(config)
return tokenizer
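
As a hedged sketch of the call chain this signature change affects (the 'config' object and the placex 'row' are assumed to come from the caller; analyzers support the context-manager protocol via AbstractAnalyzer):

from nominatim.indexer.place_info import PlaceInfo

tokenizer = get_tokenizer_for_db(config)        # now takes the config object
with tokenizer.name_analyzer() as analyzer:     # one analyzer per thread
    token_info = PlaceInfo(row).analyze(analyzer)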

View File

@ -8,67 +8,25 @@ import itertools
from icu import Transliterator
import datrie
from nominatim.db.properties import set_property, get_property
from nominatim.tokenizer import icu_variants as variants
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
class ICUNameProcessorRules:
""" Data object that saves the rules needed for the name processor.
The rules can either be initialised through an ICURuleLoader or
be loaded from a database when a connection is given.
"""
def __init__(self, loader=None, conn=None):
if loader is not None:
self.norm_rules = loader.get_normalization_rules()
self.trans_rules = loader.get_transliteration_rules()
self.replacements = loader.get_replacement_pairs()
self.search_rules = loader.get_search_rules()
elif conn is not None:
self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.replacements = \
variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
else:
assert False, "Parameter loader or conn required."
def save_rules(self, conn):
""" Save the rules in the property table of the given database.
the rules can be loaded again by handing in a connection into
the constructor of the class.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
variants.pickle_variant_set(self.replacements))
set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
class ICUNameProcessor:
""" Collects the different transformation rules for normalisation of names
and provides the functions to aply the transformations.
and provides the functions to apply the transformations.
"""
def __init__(self, rules):
def __init__(self, norm_rules, trans_rules, replacements):
self.normalizer = Transliterator.createFromRules("icu_normalization",
rules.norm_rules)
norm_rules)
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
rules.trans_rules +
trans_rules +
";[:Space:]+ > ' '")
self.search = Transliterator.createFromRules("icu_search",
rules.search_rules)
norm_rules + trans_rules)
# Intermediate reorder by source. Also compute required character set.
immediate = defaultdict(list)
chars = set()
for variant in rules.replacements:
for variant in replacements:
if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
replstr = variant.replacement[:-1]
else:

View File

@ -2,17 +2,26 @@
Helper class to create ICU rules from a configuration file.
"""
import io
import json
import logging
import itertools
import re
from icu import Transliterator
from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
import nominatim.tokenizer.icu_variants as variants
LOG = logging.getLogger()
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
def _flatten_config_list(content):
if not content:
return []
@ -46,12 +55,52 @@ class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, rules):
def __init__(self, config):
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
self.variants = set()
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self._parse_variant_list(self._get_section(rules, 'variants'))
self.analysis_rules = self._get_section(rules, 'variants')
self._parse_variant_list()
# Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', [])
def load_config_from_db(self, conn):
""" Get previously saved parts of the configuration from the
database.
"""
self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
self._parse_variant_list()
def save_config_to_db(self, conn):
""" Save the part of the configuration that cannot be changed into
the database.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_sanitizer(self):
""" Create a place sanitizer from the configured rules.
"""
return PlaceSanitizer(self.sanitizer_rules)
def make_token_analysis(self):
""" Create a token analyser from the reviouly loaded rules.
"""
return ICUNameProcessor(self.normalization_rules,
self.transliteration_rules,
self.variants)
def get_search_rules(self):
@ -112,7 +161,9 @@ class ICURuleLoader:
return ';'.join(_flatten_config_list(content)) + ';'
def _parse_variant_list(self, rules):
def _parse_variant_list(self):
rules = self.analysis_rules
self.variants.clear()
if not rules:

View File

@ -13,8 +13,8 @@ from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@ -36,7 +36,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.naming_rules = None
self.loader = None
self.term_normalization = None
@ -46,9 +46,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
This copies all necessary data in the project directory to make
sure the tokenizer remains stable even over updates.
"""
loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG'))
self.naming_rules = ICUNameProcessorRules(loader=loader)
self.loader = ICURuleLoader(config)
self.term_normalization = config.TERM_NORMALIZATION
self._install_php(config.lib_dir.php)
@ -59,11 +58,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
self._init_db_tables(config)
def init_from_project(self):
def init_from_project(self, config):
""" Initialise the tokenizer from the project directory.
"""
self.loader = ICURuleLoader(config)
with connect(self.dsn) as conn:
self.naming_rules = ICUNameProcessorRules(conn=conn)
self.loader.load_config_from_db(conn)
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
@ -81,12 +82,12 @@ class LegacyICUTokenizer(AbstractTokenizer):
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self):
def check_database(self, config):
""" Check that the tokenizer is set up correctly.
"""
self.init_from_project()
self.init_from_project(config)
if self.naming_rules is None:
if self.term_normalization is None:
return "Configuration for tokenizer 'icu' are missing."
return None
@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
self.loader.make_token_analysis())
def _install_php(self, phpdir):
@ -118,7 +120,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
<?php
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
@define('CONST_Transliteration', "{self.naming_rules.search_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
@ -127,8 +129,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
database as database properties.
"""
with connect(self.dsn) as conn:
self.naming_rules.save_rules(conn)
self.loader.save_config_to_db(conn)
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
@ -163,7 +164,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
""" Count the partial terms from the names in the place table.
"""
words = Counter()
name_proc = ICUNameProcessor(self.naming_rules)
name_proc = self.loader.make_token_analysis()
with conn.cursor(name="words") as cur:
cur.execute(""" SELECT v, count(*) FROM
@ -188,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
normalization.
"""
def __init__(self, dsn, name_proc):
def __init__(self, dsn, sanitizer, token_analysis):
self.conn = connect(dsn).connection
self.conn.autocommit = True
self.name_processor = name_proc
self.sanitizer = sanitizer
self.token_analysis = token_analysis
self._cache = _TokenCache()
@ -204,6 +206,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
self.conn = None
def _search_normalized(self, name):
""" Return the search token transliteration of the given name.
"""
return self.token_analysis.get_search_normalized(name)
def _normalized(self, name):
""" Return the normalized version of the given name with all
non-relevant information removed.
"""
return self.token_analysis.get_normalized(name)
def get_word_token_info(self, words):
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name
@ -219,9 +234,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
partial_tokens = {}
for word in words:
if word.startswith('#'):
full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
full_tokens[word] = self._search_normalized(word[1:])
else:
partial_tokens[word] = self.name_processor.get_search_normalized(word)
partial_tokens[word] = self._search_normalized(word)
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
@ -252,7 +267,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
This function takes minor shortcuts on transliteration.
"""
return self.name_processor.get_search_normalized(hnr)
return self._search_normalized(hnr)
def update_postcodes_from_db(self):
""" Update postcode tokens in the word table from the location_postcode
@ -275,7 +290,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
if postcode is None:
to_delete.append(word)
else:
copystr.add(self.name_processor.get_search_normalized(postcode),
copystr.add(self._search_normalized(postcode),
'P', postcode)
if to_delete:
@ -293,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
completely replaced. Otherwise the phrases are added to the
already existing ones.
"""
norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
@ -323,7 +338,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
added = 0
with CopyBuffer() as copystr:
for word, cls, typ, oper in to_add:
term = self.name_processor.get_search_normalized(word)
term = self._search_normalized(word)
if term:
copystr.add(term, 'S', word,
json.dumps({'class': cls, 'type': typ,
@ -357,9 +372,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
# Make sure any name preprocessing for country names applies.
info = PlaceInfo({'name': names, 'country_code': country_code,
'rank_address': 4, 'class': 'boundary',
'type': 'administrative'})
self._add_country_full_names(country_code,
self.sanitizer.process_names(info)[0])
def _add_country_full_names(self, country_code, names):
""" Add names for the given country from an already sanitized
name list.
"""
word_tokens = set()
for name in self._compute_full_names(names):
norm_name = self.name_processor.get_search_normalized(name)
for name in names:
norm_name = self._search_normalized(name.name)
if norm_name:
word_tokens.add(norm_name)
@ -385,23 +412,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def process_place(self, place):
""" Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into
Returns a JSON-serializable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo(self._cache)
names = place.get('name')
names, address = self.sanitizer.process_names(place)
if names:
fulls, partials = self._compute_name_tokens(names)
token_info.add_names(fulls, partials)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self.add_country_names(country_feature.lower(), names)
if place.is_country():
self._add_country_full_names(place.country_code, names)
address = place.get('address')
if address:
self._process_place_address(token_info, address)
@ -411,18 +436,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
def _process_place_address(self, token_info, address):
hnrs = []
addr_terms = []
for key, value in address.items():
if key == 'postcode':
self._add_postcode(value)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(self._compute_partial_tokens(value))
elif key == 'place':
token_info.add_place(self._compute_partial_tokens(value))
elif not key.startswith('_') and \
key not in ('country', 'full'):
addr_terms.append((key, self._compute_partial_tokens(value)))
for item in address:
if item.kind == 'postcode':
self._add_postcode(item.name)
elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(item.name)
elif item.kind == 'street':
token_info.add_street(self._compute_partial_tokens(item.name))
elif item.kind == 'place':
token_info.add_place(self._compute_partial_tokens(item.name))
elif not item.kind.startswith('_') and \
item.kind not in ('country', 'full'):
addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
if hnrs:
hnrs = self._split_housenumbers(hnrs)
@ -435,7 +460,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
""" Normalize the given term, split it into partial words and return
the token list for them.
"""
norm_name = self.name_processor.get_search_normalized(name)
norm_name = self._search_normalized(name)
tokens = []
need_lookup = []
@ -458,19 +483,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
return tokens
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
list of PlaceName objects.
"""
full_names = self._compute_full_names(names)
full_tokens = set()
partial_tokens = set()
for name in full_names:
norm_name = self.name_processor.get_normalized(name)
for name in names:
norm_name = self._normalized(name.name)
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
variants = self.name_processor.get_variants_ascii(norm_name)
variants = self.token_analysis.get_variants_ascii(norm_name)
if not variants:
continue
@ -487,23 +512,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
return full_tokens, partial_tokens
@staticmethod
def _compute_full_names(names):
""" Return the set of all full name word ids to be used with the
given dictionary of names.
"""
full_names = set()
for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
if name:
full_names.add(name)
brace_idx = name.find('(')
if brace_idx >= 0:
full_names.add(name[:brace_idx].strip())
return full_names
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.
"""
@ -511,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes:
term = self.name_processor.get_search_normalized(postcode)
term = self._search_normalized(postcode)
if not term:
return

View File

@ -2,7 +2,6 @@
Data structures for saving variant expansions for ICU tokenizer.
"""
from collections import namedtuple
import json
_ICU_VARIANT_PORPERTY_FIELDS = ['lang']
@ -24,34 +23,3 @@ class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORP
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
def pickle_variant_set(variants):
""" Serializes an iterable of variant rules to a string.
"""
# Create a list of property sets. So they don't need to be duplicated
properties = {}
pid = 1
for variant in variants:
if variant.properties not in properties:
properties[variant.properties] = pid
pid += 1
# Convert the variants into a simple list.
variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
# Convert everythin to json.
return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
'variants': variants})
def unpickle_variant_set(variant_string):
""" Deserializes a variant string that was previously created with
pickle_variant_set() into a set of ICUVariants.
"""
data = json.loads(variant_string)
properties = {int(k): ICUVariantProperties.from_rules(v)
for k, v in data['properties'].items()}
return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))

View File

@ -113,7 +113,7 @@ class LegacyTokenizer(AbstractTokenizer):
self._init_db_tables(config)
def init_from_project(self):
def init_from_project(self, _):
""" Initialise the tokenizer from the project directory.
"""
with connect(self.dsn) as conn:
@ -142,7 +142,7 @@ class LegacyTokenizer(AbstractTokenizer):
modulepath=modulepath)
def check_database(self):
def check_database(self, _):
""" Check that the tokenizer is set up correctly.
"""
hint = """\
@ -405,16 +405,15 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
"""
token_info = _TokenInfo(self._cache)
names = place.get('name')
names = place.name
if names:
token_info.add_names(self.conn, names)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self.add_country_names(country_feature.lower(), names)
if place.is_country():
self.add_country_names(place.country_code, names)
address = place.get('address')
address = place.address
if address:
self._process_place_address(token_info, address)

View File

@ -0,0 +1,127 @@
"""
Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
import importlib
from nominatim.errors import UsageError
class PlaceName:
""" A searchable name for a place together with properties.
Every name object saves the name proper and two basic properties:
* 'kind' describes the name of the OSM key used without any suffixes
(i.e. the part after the colon removed)
* 'suffix' contains the suffix of the OSM tag, if any. The suffix
is the part of the key after the first colon.
In addition to that, the name may have arbitrary additional attributes.
Which attributes are used, depends on the token analyser.
"""
def __init__(self, name, kind, suffix):
self.name = name
self.kind = kind
self.suffix = suffix
self.attr = {}
def __repr__(self):
return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
def clone(self, name=None, kind=None, suffix=None, attr=None):
""" Create a deep copy of the place name, optionally with the
given parameters replaced. In the attribute list only the given
keys are updated. The list is not replaced completely.
In particular, the function cannot be used to remove an
attribute from a place name.
"""
newobj = PlaceName(name or self.name,
kind or self.kind,
suffix or self.suffix)
newobj.attr.update(self.attr)
if attr:
newobj.attr.update(attr)
return newobj
def set_attr(self, key, value):
""" Add the given property to the name. If the property was already
set, then the value is overwritten.
"""
self.attr[key] = value
def get_attr(self, key, default=None):
""" Return the given property or the value of 'default' if it
is not set.
"""
return self.attr.get(key, default)
def has_attr(self, key):
""" Check if the given attribute is set.
"""
return key in self.attr
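
A short usage sketch of the attribute and clone API above (names and attribute values are illustrative only):

name = PlaceName('Halle (Saale)', 'name', None)
name.set_attr('analyzer', 'de')           # arbitrary, analyser-defined attribute

variant = name.clone(name='Halle')        # kind, suffix and attributes carry over
assert variant.kind == 'name'
assert variant.get_attr('analyzer') == 'de'
assert not variant.has_attr('unset-key')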
class _ProcessInfo:
Container class for information handed to the handler functions.
The 'names' and 'address' members are mutable. A handler must change
them by either modifying the lists in place or replacing the old content
with a new list.
"""
def __init__(self, place):
self.place = place
self.names = self._convert_name_dict(place.name)
self.address = self._convert_name_dict(place.address)
@staticmethod
def _convert_name_dict(names):
""" Convert a dictionary of names into a list of PlaceNames.
The dictionary key is split into the primary part of the key
and the suffix (the part after an optional colon).
"""
out = []
if names:
for key, value in names.items():
parts = key.split(':', 1)
out.append(PlaceName(value.strip(),
parts[0].strip(),
parts[1].strip() if len(parts) > 1 else None))
return out
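
Note that the key is split only at the first colon, so a language-qualified key keeps everything after it as the suffix; for illustration (compare test_sanitizer_default further down):

out = _ProcessInfo._convert_name_dict({'name:de:de': 'Sachsen'})
# -> [PlaceName(name='Sachsen', kind='name', suffix='de:de')]
assert out[0].kind == 'name' and out[0].suffix == 'de:de'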
class PlaceSanitizer:
""" Controller class which applies sanitizer functions on the place
names and address before they are used by the token analysers.
"""
def __init__(self, rules):
self.handlers = []
if rules:
for func in rules:
if 'step' not in func:
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
handler_module = importlib.import_module(module_name)
self.handlers.append(handler_module.create(func))
def process_names(self, place):
""" Extract a sanitized list of names and address parts from the
given place. The function returns a tuple
(list of names, list of address names)
"""
obj = _ProcessInfo(place)
for func in self.handlers:
func(obj)
return obj.names, obj.address
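
Putting the pieces together, a minimal sketch of the whole sanitation step, using the two sanitizers shipped with this commit (mirroring the default ICU tokenizer configuration):

from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer

san = PlaceSanitizer([{'step': 'split-name-list'},
                      {'step': 'strip-brace-terms'}])
names, address = san.process_names(
    PlaceInfo({'name': {'name': 'New York; Big Apple'},
               'address': {'street': 'Broadway'}}))
# names   -> PlaceName('New York') and PlaceName('Big Apple'), both kind 'name'
# address -> a single PlaceName with kind 'street'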

View File

@ -0,0 +1,35 @@
"""
Name processor that splits multi-value names into their components.
"""
import re
from nominatim.errors import UsageError
def create(func):
""" Create a name processing function that splits name values with
multiple values into their components. The optional parameter
'delimiters' can be used to define the characters that should be used
for splitting. The default is ',;'.
"""
delimiter_set = set(func.get('delimiters', ',;'))
if not delimiter_set:
raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
def _process(obj):
if not obj.names:
return
new_names = []
for name in obj.names:
split_names = regexp.split(name.name)
if len(split_names) == 1:
new_names.append(name)
else:
new_names.extend(name.clone(name=n) for n in split_names if n)
obj.names = new_names
return _process
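
Each configured delimiter is backslash-escaped into a regex character class, so even regex metacharacters work as delimiters; a small sketch matching the tests added below:

from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer

san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': '[]'}])
names, _ = san.process_names(PlaceInfo({'name': {'name': 'foo[to]be'}}))
assert sorted(n.name for n in names) == ['be', 'foo', 'to']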

View File

@ -0,0 +1,22 @@
"""
Sanitizer handling names with addendums in braces.
"""
def create(_):
""" Create a name processing function that creates additional name variants
when a name has an addendum in brackets (e.g. "Halle (Saale)"). The
additional variant only contains the main name without the bracket part.
"""
def _process(obj):
""" Add variants for names that have a bracket extension.
"""
if obj.names:
new_names = []
for name in (n for n in obj.names if '(' in n.name):
new_name = name.name.split('(')[0].strip()
if new_name:
new_names.append(name.clone(name=new_name))
obj.names.extend(new_names)
return _process
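
For illustration, the two edge cases covered by the tests below: a braced suffix produces an extra variant, while a name consisting only of a braced term stays unchanged because the stripped variant would be empty:

from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer

san = PlaceSanitizer([{'step': 'strip-brace-terms'}])

names, _ = san.process_names(PlaceInfo({'name': {'name': 'Halle (Saale)'}}))
assert sorted(n.name for n in names) == ['Halle', 'Halle (Saale)']

names, _ = san.process_names(PlaceInfo({'name': {'name': '(maybe)'}}))
assert [n.name for n in names] == ['(maybe)']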

View File

@ -166,7 +166,7 @@ def check_tokenizer(_, config):
return CheckState.FAIL, dict(msg="""\
Cannot load tokenizer. Did the import finish successfully?""")
result = tokenizer.check_database()
result = tokenizer.check_database(config)
if result is None:
return CheckState.OK

View File

@ -7,12 +7,11 @@ import logging
import os
import tarfile
import psycopg2.extras
from nominatim.db.connection import connect
from nominatim.db.async_connection import WorkerPool
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.indexer.place_info import PlaceInfo
LOG = logging.getLogger()
@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer):
address = dict(street=row['street'], postcode=row['postcode'])
args = ('SRID=4326;' + row['geometry'],
int(row['from']), int(row['to']), row['interpolation'],
psycopg2.extras.Json(analyzer.process_place(dict(address=address))),
PlaceInfo({'address': address}).analyze(analyzer),
analyzer.normalize_postcode(row['postcode']))
except ValueError:
continue

View File

@ -24,6 +24,9 @@ transliteration:
- "[^[:Ascii:]] >"
- ":: lower ()"
- ":: NFC ()"
sanitizers:
- step: split-name-list
- step: strip-brace-terms
variants:
- !include icu-rules/variants-bg.yaml
- !include icu-rules/variants-ca.yaml

View File

@ -1,6 +1,8 @@
"""
Tokenizer for testing.
"""
from nominatim.indexer.place_info import PlaceInfo
from nominatim.config import Configuration
def create(dsn, data_dir):
""" Create a new instance of the tokenizer provided by this module.
@ -21,7 +23,8 @@ class DummyTokenizer:
self.init_state = "new"
def init_from_project(self):
def init_from_project(self, config):
assert isinstance(config, Configuration)
assert self.init_state is None
self.init_state = "loaded"
@ -68,4 +71,5 @@ class DummyNameAnalyzer:
@staticmethod
def process_place(place):
assert isinstance(place, PlaceInfo)
return {}

View File

@ -100,6 +100,6 @@ def test_get_pg_env_overwrite_variable(monkeypatch):
def test_get_pg_env_ignore_unknown():
env = get_pg_env('tty=stuff', base_env={})
env = get_pg_env('client_encoding=stuff', base_env={})
assert env == {}

View File

@ -29,6 +29,7 @@ class IndexerTestDB:
indexed_date TIMESTAMP,
partition SMALLINT,
admin_level SMALLINT,
country_code TEXT,
address HSTORE,
token_info JSONB,
geometry_sector INTEGER)""")
@ -54,15 +55,26 @@ class IndexerTestDB:
END IF;
RETURN NEW;
END; $$ LANGUAGE plpgsql;""")
cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
OUT name HSTORE,
OUT address HSTORE,
OUT country_feature VARCHAR,
OUT linked_place_id BIGINT)
cur.execute("DROP TYPE IF EXISTS prepare_update_info CASCADE")
cur.execute("""CREATE TYPE prepare_update_info AS (
name HSTORE,
address HSTORE,
rank_address SMALLINT,
country_code TEXT,
class TEXT,
type TEXT,
linked_place_id BIGINT
)""")
cur.execute("""CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex,
OUT result prepare_update_info)
AS $$
BEGIN
address := p.address;
name := p.name;
result.address := p.address;
result.name := p.name;
result.class := p.class;
result.type := p.type;
result.country_code := p.country_code;
result.rank_address := p.rank_address;
END;
$$ LANGUAGE plpgsql STABLE;
""")

View File

@ -7,10 +7,10 @@ import yaml
import pytest
from nominatim.tokenizer import icu_tokenizer
from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.db import properties
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from mock_icu_word_table import MockIcuWordTable
@ -67,11 +67,14 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
monkeypatch.undo()
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
variants=('~gasse -> gasse', 'street => st', )):
variants=('~gasse -> gasse', 'street => st', ),
sanitizers=[]):
cfgstr = {'normalization' : list(norm),
'transliteration' : list(trans),
'variants' : [ {'words': list(variants)}]}
tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
'sanitizers' : sanitizers,
'transliteration' : list(trans),
'variants' : [ {'words': list(variants)}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)
return tok.name_analyzer()
@ -177,9 +180,9 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
monkeypatch.undo()
tok = tokenizer_factory()
tok.init_from_project()
tok.init_from_project(test_config)
assert tok.naming_rules is not None
assert tok.loader is not None
assert tok.term_normalization == ':: lower();'
@ -308,44 +311,54 @@ class TestPlaceNames:
@pytest.fixture(autouse=True)
def setup(self, analyzer, sql_functions):
with analyzer() as anl:
sanitizers = [{'step': 'split-name-list'},
{'step': 'strip-brace-terms'}]
with analyzer(sanitizers=sanitizers) as anl:
self.analyzer = anl
yield anl
def expect_name_terms(self, info, *expected_terms):
tokens = self.analyzer.get_word_token_info(expected_terms)
print (tokens)
for token in tokens:
assert token[2] is not None, "No token for {0}".format(token)
assert eval(info['names']) == set((t[2] for t in tokens))
def process_named_place(self, names):
return self.analyzer.process_place(PlaceInfo({'name': names}))
def test_simple_names(self):
info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}})
info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'})
self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
@pytest.mark.parametrize('sep', [',' , ';'])
def test_names_with_separator(self, sep):
info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}})
info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))})
self.expect_name_terms(info, '#New York', '#Big Apple',
'new', 'york', 'big', 'apple')
def test_full_names_with_bracket(self):
info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}})
info = self.process_named_place({'name': 'Houseboat (left)'})
self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
'houseboat', 'left')
def test_country_name(self, word_table):
info = self.analyzer.process_place({'name': {'name': 'Norge'},
'country_feature': 'no'})
place = PlaceInfo({'name' : {'name': 'Norge'},
'country_code': 'no',
'rank_address': 4,
'class': 'boundary',
'type': 'administrative'})
info = self.analyzer.process_place(place)
self.expect_name_terms(info, '#norge', 'norge')
assert word_table.get_country() == {('no', 'NORGE')}
@ -361,7 +374,7 @@ class TestPlaceAddress:
def process_address(self, **kwargs):
return self.analyzer.process_place({'address': kwargs})
return self.analyzer.process_place(PlaceInfo({'address': kwargs}))
def name_token_set(self, *expected_terms):

View File

@ -4,15 +4,17 @@ Tests for import name normalisation and variant generation.
from textwrap import dedent
import pytest
import yaml
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.errors import UsageError
@pytest.fixture
def cfgfile():
def cfgfile(def_config, tmp_path):
project_dir = tmp_path / 'project_dir'
project_dir.mkdir()
def_config.project_dir = project_dir
def _create_config(*variants, **kwargs):
content = dedent("""\
normalization:
@ -30,7 +32,9 @@ def cfgfile():
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
content += " {}: {}\n".format(k, v)
return yaml.safe_load(content)
(project_dir / 'icu_tokenizer.yaml').write_text(content)
return def_config
return _create_config
@ -40,10 +44,9 @@ def get_normalized_variants(proc, name):
def test_variants_empty(cfgfile):
fpath = cfgfile('saint -> 🜵', 'street -> st')
config = cfgfile('saint -> 🜵', 'street -> st')
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
proc = ICURuleLoader(config).make_token_analysis()
assert get_normalized_variants(proc, '🜵') == []
assert get_normalized_variants(proc, '🜳') == []
@ -83,8 +86,8 @@ VARIANT_TESTS = [
@pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
def test_variants(cfgfile, rules, name, variants):
fpath = cfgfile(*rules)
proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
config = cfgfile(*rules)
proc = ICURuleLoader(config).make_token_analysis()
result = get_normalized_variants(proc, name)
@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants):
def test_search_normalized(cfgfile):
fpath = cfgfile('~street => s,st', 'master => mstr')
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
config = cfgfile('~street => s,st', 'master => mstr')
proc = ICURuleLoader(config).make_token_analysis()
assert proc.get_search_normalized('Master Street') == 'master street'
assert proc.get_search_normalized('Earnes St') == 'earnes st'

View File

@ -12,7 +12,16 @@ from nominatim.errors import UsageError
from icu import Transliterator
@pytest.fixture
def cfgrules():
def test_config(def_config, tmp_path):
project_dir = tmp_path / 'project_dir'
project_dir.mkdir()
def_config.project_dir = project_dir
return def_config
@pytest.fixture
def cfgrules(test_config):
def _create_config(*variants, **kwargs):
content = dedent("""\
normalization:
@ -29,19 +38,21 @@ def cfgrules():
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
content += " {}: {}\n".format(k, v)
return yaml.safe_load(content)
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
return test_config
return _create_config
def test_empty_rule_set():
rule_cfg = yaml.safe_load(dedent("""\
def test_empty_rule_set(test_config):
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
normalization:
transliteration:
variants:
"""))
rules = ICURuleLoader(rule_cfg)
rules = ICURuleLoader(test_config)
assert rules.get_search_rules() == ''
assert rules.get_normalization_rules() == ''
assert rules.get_transliteration_rules() == ''
@ -50,11 +61,12 @@ def test_empty_rule_set():
CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
def test_missing_section(section):
def test_missing_section(section, test_config):
rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
with pytest.raises(UsageError):
ICURuleLoader(rule_cfg)
ICURuleLoader(test_config)
def test_get_search_rules(cfgrules):
@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules):
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
def test_transliteration_rules_from_file(def_config, tmp_path):
def_config.project_dir = tmp_path
cfgpath = tmp_path / ('test_config.yaml')
def test_transliteration_rules_from_file(test_config):
cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
cfgpath.write_text(dedent("""\
normalization:
transliteration:
@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path):
- !include transliteration.yaml
variants:
"""))
transpath = tmp_path / ('transliteration.yaml')
transpath = test_config.project_dir / ('transliteration.yaml')
transpath.write_text('- "x > y"')
loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
loader = ICURuleLoader(test_config)
rules = loader.get_transliteration_rules()
trans = Transliterator.createFromRules("test", rules)

View File

@ -5,6 +5,7 @@ import shutil
import pytest
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer import legacy_tokenizer
from nominatim.db import properties
from nominatim.errors import UsageError
@ -131,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config,
assert not (test_config.project_dir / 'module').exists()
def test_init_from_project(tokenizer_setup, tokenizer_factory):
def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
tok = tokenizer_factory()
tok.init_from_project()
tok.init_from_project(test_config)
assert tok.normalization is not None
@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name):
def test_process_place_names(analyzer, make_keywords):
info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}}))
assert info['names'] == '{1,2,3}'
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode):
analyzer.process_place({'address': {'postcode' : pcode}})
analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
assert word_table.get_postcodes() == {pcode, }
@pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode):
analyzer.process_place({'address': {'postcode' : pcode}})
analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
assert not word_table.get_postcodes()
@ -319,7 +320,7 @@ class TestHousenumberName:
@staticmethod
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
def test_process_place_housenumbers_simple(analyzer, hnr):
info = analyzer.process_place({'address': {'housenumber' : hnr}})
info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}}))
assert info['hnr'] == hnr
assert info['hnr_tokens'].startswith("{")
@ -327,15 +328,15 @@ class TestHousenumberName:
@staticmethod
def test_process_place_housenumbers_lists(analyzer):
info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}}))
assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
@staticmethod
def test_process_place_housenumbers_duplicates(analyzer):
info = analyzer.process_place({'address': {'housenumber' : '134',
info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134',
'conscriptionnumber' : '134',
'streetnumber' : '99a'}})
'streetnumber' : '99a'}}))
assert set(info['hnr'].split(';')) == set(('134', '99a'))

View File

@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch,
check_result, state):
class _TestTokenizer:
@staticmethod
def check_database():
def check_database(_):
return check_result
monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',

View File

@ -0,0 +1,65 @@
"""
Tests for the sanitizer that splits multi-value lists.
"""
import pytest
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.indexer.place_info import PlaceInfo
from nominatim.errors import UsageError
def run_sanitizer_on(**kwargs):
place = PlaceInfo({'name': kwargs})
name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
return sorted([(p.name, p.kind, p.suffix) for p in name])
def sanitize_with_delimiter(delimiter, name):
place = PlaceInfo({'name': {'name': name}})
san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}])
name, _ = san.process_names(place)
return sorted([p.name for p in name])
def test_simple():
assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
assert run_sanitizer_on(name='') == [('', 'name', None)]
def test_splits():
assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
('B', 'name', None),
('C', 'name', None)]
assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
('boat', 'short_name', None)]
def test_empty_fields():
assert run_sanitizer_on(name='A;;B') == [('A', 'name', None),
('B', 'name', None)]
assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
('B', 'name', None)]
assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
assert run_sanitizer_on(name='B,') == [('B', 'name', None)]
def test_custom_delimiters():
assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
assert sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun']
def test_empty_delimiter_set():
with pytest.raises(UsageError):
sanitize_with_delimiter('', 'abc')
def test_no_name_list():
place = PlaceInfo({'address': {'housenumber': '3'}})
name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
assert not name
assert len(address) == 1

View File

@ -0,0 +1,44 @@
"""
Tests for the sanitizer that handles braced suffixes.
"""
import pytest
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.indexer.place_info import PlaceInfo
def run_sanitizer_on(**kwargs):
place = PlaceInfo({'name': kwargs})
name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
return sorted([(p.name, p.kind, p.suffix) for p in name])
def test_no_braces():
assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
('foo', 'name', None)]
def test_simple_braces():
assert run_sanitizer_on(name='Halle (Saale)', ref='3')\
== [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
assert run_sanitizer_on(name='ack ( bar')\
== [('ack', 'name', None), ('ack ( bar', 'name', None)]
def test_only_braces():
assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
def test_double_braces():
assert run_sanitizer_on(name='a((b))') == [('a', 'name', None),
('a((b))', 'name', None)]
assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
('a (b) (c)', 'name', None)]
def test_no_names():
place = PlaceInfo({'address': {'housenumber': '3'}})
name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
assert not name
assert len(address) == 1

View File

@ -0,0 +1,71 @@
"""
Tests for execution of the sanitization step.
"""
import pytest
from nominatim.errors import UsageError
import nominatim.tokenizer.place_sanitizer as sanitizer
from nominatim.indexer.place_info import PlaceInfo
def test_placeinfo_clone_new_name():
place = sanitizer.PlaceName('foo', 'ki', 'su')
newplace = place.clone(name='bar')
assert place.name == 'foo'
assert newplace.name == 'bar'
assert newplace.kind == 'ki'
assert newplace.suffix == 'su'
def test_placeinfo_clone_merge_attr():
place = sanitizer.PlaceName('foo', 'ki', 'su')
place.set_attr('a1', 'v1')
place.set_attr('a2', 'v2')
newplace = place.clone(attr={'a2': 'new', 'b2': 'foo'})
assert place.get_attr('a2') == 'v2'
assert place.get_attr('b2') is None
assert newplace.get_attr('a1') == 'v1'
assert newplace.get_attr('a2') == 'new'
assert newplace.get_attr('b2') == 'foo'
def test_placeinfo_has_attr():
place = sanitizer.PlaceName('foo', 'ki', 'su')
place.set_attr('a1', 'v1')
assert place.has_attr('a1')
assert not place.has_attr('whatever')
def test_sanitizer_default():
san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}])
name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
'address': {'street': 'Bald'}}))
assert len(name) == 3
assert all(isinstance(n, sanitizer.PlaceName) for n in name)
assert all(n.kind == 'name' for n in name)
assert all(n.suffix == 'de:de' for n in name)
assert len(address) == 1
assert all(isinstance(n, sanitizer.PlaceName) for n in address)
@pytest.mark.parametrize('rules', [None, []])
def test_sanitizer_empty_list(rules):
san = sanitizer.PlaceSanitizer(rules)
name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))
assert len(name) == 1
assert all(isinstance(n, sanitizer.PlaceName) for n in name)
def test_sanitizer_missing_step_definition():
with pytest.raises(UsageError):
san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}])