unify ICUNameProcessorRules and ICURuleLoader

There is no need for the additional layer of indirection that
the ICUNameProcessorRules class adds. The ICURuleLoader can
fill the database properties directly.
Sarah Hoffmann 2021-09-29 17:37:04 +02:00
parent 5e5addcdbf
commit 16daa57e47
14 changed files with 123 additions and 137 deletions
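In outline, the indirection disappears: where the tokenizer previously built an ICUNameProcessorRules object from a loader (or from a database connection) and handed it to ICUNameProcessor, the ICURuleLoader now reads the configuration, persists it, and constructs the processor itself. A rough before/after sketch, distilled from the diffs below:

    # Before: rules travelled through an intermediate data object.
    loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
                                                         config='TOKENIZER_CONFIG'))
    naming_rules = ICUNameProcessorRules(loader=loader)
    name_proc = ICUNameProcessor(naming_rules)

    # After: the loader takes the config and builds the analyser directly.
    loader = ICURuleLoader(config)
    name_proc = loader.make_token_analysis()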


@@ -149,11 +149,14 @@ class AbstractTokenizer(ABC):
@abstractmethod
def init_from_project(self) -> None:
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
The function should load all previously saved configuration from
the project directory and/or the property table.
Arguments:
config: Read-only object with configuration options.
"""
pass
@@ -187,7 +190,7 @@ class AbstractTokenizer(ABC):
@abstractmethod
def check_database(self) -> str:
def check_database(self, config: Configuration) -> str:
""" Check that the database is set up correctly and ready for being
queried.
@@ -196,6 +199,9 @@ class AbstractTokenizer(ABC):
description of the issue as well as hints for the user on
how to resolve the issue.
Arguments:
config: Read-only object with configuration options.
Return `None` if no issue was found.
"""
pass
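For tokenizer implementations, both hooks now receive the read-only Configuration. A minimal, hypothetical subclass sketch showing only the two methods touched here (the remaining abstract methods are omitted):

    class MyTokenizer(AbstractTokenizer):

        def init_from_project(self, config):
            # Restore previously saved state from the project directory
            # and/or the property table, using the passed-in config.
            ...

        def check_database(self, config):
            # Return a problem description, or None if all is well.
            return None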


@@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_from_project()
tokenizer.init_from_project(config)
return tokenizer


@@ -8,67 +8,25 @@ import itertools
from icu import Transliterator
import datrie
from nominatim.db.properties import set_property, get_property
from nominatim.tokenizer import icu_variants as variants
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
class ICUNameProcessorRules:
""" Data object that saves the rules needed for the name processor.
The rules can either be initialised through an ICURuleLoader or
be loaded from a database when a connection is given.
"""
def __init__(self, loader=None, conn=None):
if loader is not None:
self.norm_rules = loader.get_normalization_rules()
self.trans_rules = loader.get_transliteration_rules()
self.replacements = loader.get_replacement_pairs()
self.search_rules = loader.get_search_rules()
elif conn is not None:
self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.replacements = \
variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
else:
assert False, "Parameter loader or conn required."
def save_rules(self, conn):
""" Save the rules in the property table of the given database.
The rules can be loaded again by handing a connection to
the constructor of the class.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
variants.pickle_variant_set(self.replacements))
set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
class ICUNameProcessor:
""" Collects the different transformation rules for normalisation of names
and provides the functions to aply the transformations.
and provides the functions to apply the transformations.
"""
def __init__(self, rules):
def __init__(self, norm_rules, trans_rules, replacements):
self.normalizer = Transliterator.createFromRules("icu_normalization",
rules.norm_rules)
norm_rules)
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
rules.trans_rules +
trans_rules +
";[:Space:]+ > ' '")
self.search = Transliterator.createFromRules("icu_search",
rules.search_rules)
norm_rules + trans_rules)
# Intermediate reorder by source. Also compute required character set.
immediate = defaultdict(list)
chars = set()
for variant in rules.replacements:
for variant in replacements:
if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
replstr = variant.replacement[:-1]
else:

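With the data object gone, ICUNameProcessor takes the three rule sets as separate arguments; note that the search transliterator is now derived from norm_rules + trans_rules instead of a separately stored search rule string. A usage sketch with placeholder values (illustrative only; real rules come from ICURuleLoader):

    norm_rules = ':: lower ();'    # placeholder ICU rules
    trans_rules = ':: Latin ();'
    replacements = set()           # ICUVariant tuples in the real code

    proc = ICUNameProcessor(norm_rules, trans_rules, replacements)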

@@ -2,17 +2,25 @@
Helper class to create ICU rules from a configuration file.
"""
import io
import json
import logging
import itertools
import re
from icu import Transliterator
from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
import nominatim.tokenizer.icu_variants as variants
LOG = logging.getLogger()
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
def _flatten_config_list(content):
if not content:
return []
@@ -46,12 +54,43 @@ class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, rules):
def __init__(self, config):
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
self.variants = set()
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self._parse_variant_list(self._get_section(rules, 'variants'))
self.analysis_rules = self._get_section(rules, 'variants')
self._parse_variant_list()
def load_config_from_db(self, conn):
""" Get previously saved parts of the configuration from the
database.
"""
self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
self._parse_variant_list()
def save_config_to_db(self, conn):
""" Save the part of the configuration that cannot be changed into
the database.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_token_analysis(self):
""" Create a token analyser from the reviouly loaded rules.
"""
return ICUNameProcessor(self.normalization_rules,
self.transliteration_rules,
self.variants)
def get_search_rules(self):
@@ -112,7 +151,9 @@ class ICURuleLoader:
return ';'.join(_flatten_config_list(content)) + ';'
def _parse_variant_list(self, rules):
def _parse_variant_list(self):
rules = self.analysis_rules
self.variants.clear()
if not rules:

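Persistence thus moves into the loader: the normalisation and transliteration rules are stored verbatim, while the variant section is kept as its raw configuration data and serialised with json. A sketch of the intended round trip, assuming an open database connection conn:

    loader = ICURuleLoader(config)      # parses icu_tokenizer.yaml
    loader.save_config_to_db(conn)      # at import time

    later = ICURuleLoader(config)
    later.load_config_from_db(conn)     # on later runs, e.g. updates
    proc = later.make_token_analysis()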

@@ -14,7 +14,6 @@ from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -36,7 +35,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.naming_rules = None
self.loader = None
self.term_normalization = None
@@ -46,9 +45,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
This copies all necessary data in the project directory to make
sure the tokenizer remains stable even over updates.
"""
loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG'))
self.naming_rules = ICUNameProcessorRules(loader=loader)
self.loader = ICURuleLoader(config)
self.term_normalization = config.TERM_NORMALIZATION
self._install_php(config.lib_dir.php)
@@ -59,11 +57,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
self._init_db_tables(config)
def init_from_project(self):
def init_from_project(self, config):
""" Initialise the tokenizer from the project directory.
"""
self.loader = ICURuleLoader(config)
with connect(self.dsn) as conn:
self.naming_rules = ICUNameProcessorRules(conn=conn)
self.loader.load_config_from_db(conn)
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
@@ -81,12 +81,12 @@ class LegacyICUTokenizer(AbstractTokenizer):
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self):
def check_database(self, config):
""" Check that the tokenizer is set up correctly.
"""
self.init_from_project()
self.init_from_project(config)
if self.naming_rules is None:
if self.term_normalization is None:
return "Configuration for tokenizer 'icu' are missing."
return None
@@ -107,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
def _install_php(self, phpdir):
@@ -118,7 +118,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
<?php
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
@define('CONST_Transliteration', "{self.naming_rules.search_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
@@ -127,8 +127,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
database as database properties.
"""
with connect(self.dsn) as conn:
self.naming_rules.save_rules(conn)
self.loader.save_config_to_db(conn)
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
@@ -163,7 +162,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
""" Count the partial terms from the names in the place table.
"""
words = Counter()
name_proc = ICUNameProcessor(self.naming_rules)
name_proc = self.loader.make_token_analysis()
with conn.cursor(name="words") as cur:
cur.execute(""" SELECT v, count(*) FROM

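As the docstring above notes, analysers are not thread-safe; each call to name_analyzer() now derives a fresh ICUNameProcessor from the shared loader. Expected call pattern, roughly (setup abbreviated):

    tok = LegacyICUTokenizer(dsn, data_dir)
    tok.init_from_project(config)

    analyzer = tok.name_analyzer()   # instantiate one per worker thread
    # ... use the analyzer, then close it when the thread is done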

@@ -2,7 +2,6 @@
Data structures for saving variant expansions for ICU tokenizer.
"""
from collections import namedtuple
import json
_ICU_VARIANT_PORPERTY_FIELDS = ['lang']
@@ -24,34 +23,3 @@ class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORP
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
def pickle_variant_set(variants):
""" Serializes an iterable of variant rules to a string.
"""
# Create a list of property sets, so they don't need to be duplicated.
properties = {}
pid = 1
for variant in variants:
if variant.properties not in properties:
properties[variant.properties] = pid
pid += 1
# Convert the variants into a simple list.
variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
# Convert everything to JSON.
return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
'variants': variants})
def unpickle_variant_set(variant_string):
""" Deserializes a variant string that was previously created with
pickle_variant_set() into a set of ICUVariants.
"""
data = json.loads(variant_string)
properties = {int(k): ICUVariantProperties.from_rules(v)
for k, v in data['properties'].items()}
return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
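The pickle-style helpers can go because ICUVariant sets are no longer stored directly: the loader persists the raw variant configuration and regenerates the variants on load. Conceptually, the replacement is a plain JSON round trip (an illustration, not the exact data):

    import json

    analysis_rules = [{'words': ['saint -> st']}]   # as read from the YAML
    stored = json.dumps(analysis_rules)             # into the property table
    assert json.loads(stored) == analysis_rules     # restored losslessly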


@@ -113,7 +113,7 @@ class LegacyTokenizer(AbstractTokenizer):
self._init_db_tables(config)
def init_from_project(self):
def init_from_project(self, _):
""" Initialise the tokenizer from the project directory.
"""
with connect(self.dsn) as conn:
@@ -142,7 +142,7 @@ class LegacyTokenizer(AbstractTokenizer):
modulepath=modulepath)
def check_database(self):
def check_database(self, _):
""" Check that the tokenizer is set up correctly.
"""
hint = """\


@@ -166,7 +166,7 @@ def check_tokenizer(_, config):
return CheckState.FAIL, dict(msg="""\
Cannot load tokenizer. Did the import finish successfully?""")
result = tokenizer.check_database()
result = tokenizer.check_database(config)
if result is None:
return CheckState.OK


@@ -2,6 +2,7 @@
Tokenizer for testing.
"""
from nominatim.indexer.place_info import PlaceInfo
from nominatim.config import Configuration
def create(dsn, data_dir):
""" Create a new instance of the tokenizer provided by this module.
@@ -22,7 +23,8 @@ class DummyTokenizer:
self.init_state = "new"
def init_from_project(self):
def init_from_project(self, config):
assert isinstance(config, Configuration)
assert self.init_state is None
self.init_state = "loaded"


@@ -7,7 +7,6 @@ import yaml
import pytest
from nominatim.tokenizer import icu_tokenizer
from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.db import properties
from nominatim.db.sql_preprocessor import SQLPreprocessor
@@ -72,7 +71,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
cfgstr = {'normalization' : list(norm),
'transliteration' : list(trans),
'variants' : [ {'words': list(variants)}]}
tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)
return tok.name_analyzer()
@@ -178,9 +178,9 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
monkeypatch.undo()
tok = tokenizer_factory()
tok.init_from_project()
tok.init_from_project(test_config)
assert tok.naming_rules is not None
assert tok.loader is not None
assert tok.term_normalization == ':: lower();'

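The tests mirror the new entry point: instead of handing a parsed dict to ICURuleLoader, they write a real icu_tokenizer.yaml into the project directory and construct the loader from the test configuration. The recurring pattern, in brief (names as used in the fixtures here):

    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
    tok.loader = ICURuleLoader(test_config)
    analyzer = tok.name_analyzer()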

@@ -4,15 +4,17 @@ Tests for import name normalisation and variant generation.
from textwrap import dedent
import pytest
import yaml
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.errors import UsageError
@pytest.fixture
def cfgfile():
def cfgfile(def_config, tmp_path):
project_dir = tmp_path / 'project_dir'
project_dir.mkdir()
def_config.project_dir = project_dir
def _create_config(*variants, **kwargs):
content = dedent("""\
normalization:
@@ -30,7 +32,9 @@ def cfgfile():
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
content += " {}: {}\n".format(k, v)
return yaml.safe_load(content)
(project_dir / 'icu_tokenizer.yaml').write_text(content)
return def_config
return _create_config
@@ -40,10 +44,9 @@ def get_normalized_variants(proc, name):
def test_variants_empty(cfgfile):
fpath = cfgfile('saint -> 🜵', 'street -> st')
config = cfgfile('saint -> 🜵', 'street -> st')
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
proc = ICURuleLoader(config).make_token_analysis()
assert get_normalized_variants(proc, '🜵') == []
assert get_normalized_variants(proc, '🜳') == []
@@ -83,8 +86,8 @@ VARIANT_TESTS = [
@pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
def test_variants(cfgfile, rules, name, variants):
fpath = cfgfile(*rules)
proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
config = cfgfile(*rules)
proc = ICURuleLoader(config).make_token_analysis()
result = get_normalized_variants(proc, name)
@@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants):
def test_search_normalized(cfgfile):
fpath = cfgfile('~street => s,st', 'master => mstr')
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
config = cfgfile('~street => s,st', 'master => mstr')
proc = ICURuleLoader(config).make_token_analysis()
assert proc.get_search_normalized('Master Street') == 'master street'
assert proc.get_search_normalized('Earnes St') == 'earnes st'


@@ -12,7 +12,16 @@ from nominatim.errors import UsageError
from icu import Transliterator
@pytest.fixture
def cfgrules():
def test_config(def_config, tmp_path):
project_dir = tmp_path / 'project_dir'
project_dir.mkdir()
def_config.project_dir = project_dir
return def_config
@pytest.fixture
def cfgrules(test_config):
def _create_config(*variants, **kwargs):
content = dedent("""\
normalization:
@@ -29,19 +38,21 @@ def cfgrules():
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs:
content += " {}: {}\n".format(k, v)
return yaml.safe_load(content)
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
return test_config
return _create_config
def test_empty_rule_set():
rule_cfg = yaml.safe_load(dedent("""\
def test_empty_rule_set(test_config):
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
normalization:
transliteration:
variants:
"""))
rules = ICURuleLoader(rule_cfg)
rules = ICURuleLoader(test_config)
assert rules.get_search_rules() == ''
assert rules.get_normalization_rules() == ''
assert rules.get_transliteration_rules() == ''
@@ -50,11 +61,12 @@ def test_empty_rule_set():
CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
def test_missing_section(section):
def test_missing_section(section, test_config):
rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
with pytest.raises(UsageError):
ICURuleLoader(rule_cfg)
ICURuleLoader(test_config)
def test_get_search_rules(cfgrules):
@@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules):
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
def test_transliteration_rules_from_file(def_config, tmp_path):
def_config.project_dir = tmp_path
cfgpath = tmp_path / ('test_config.yaml')
def test_transliteration_rules_from_file(test_config):
cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
cfgpath.write_text(dedent("""\
normalization:
transliteration:
@@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path):
- !include transliteration.yaml
variants:
"""))
transpath = tmp_path / ('transliteration.yaml')
transpath = test_config.project_dir / ('transliteration.yaml')
transpath.write_text('- "x > y"')
loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
loader = ICURuleLoader(test_config)
rules = loader.get_transliteration_rules()
trans = Transliterator.createFromRules("test", rules)


@@ -132,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config,
assert not (test_config.project_dir / 'module').exists()
def test_init_from_project(tokenizer_setup, tokenizer_factory):
def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
tok = tokenizer_factory()
tok.init_from_project()
tok.init_from_project(test_config)
assert tok.normalization is not None


@@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch,
check_result, state):
class _TestTokenizer:
@staticmethod
def check_database():
def check_database(_):
return check_result
monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',