Mirror of https://github.com/osm-search/Nominatim.git (synced 2024-12-27 15:05:28 +03:00)
unify ICUNameProcessorRules and ICURuleLoader
There is no need for the additional layer of indirection that the ICUNameProcessorRules class adds. The ICURuleLoader can fill the database properties directly.
This commit is contained in:
parent 5e5addcdbf
commit 16daa57e47
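
Taken together, the hunks below collapse the old two-step setup into a single loader object that parses the YAML rules, persists them, and builds the name processor. A before/after sketch assembled from lines in this diff (connection handling elided):

    # Before: a separate rules object sat between the loader and the database.
    loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
                                                         config='TOKENIZER_CONFIG'))
    naming_rules = ICUNameProcessorRules(loader=loader)
    naming_rules.save_rules(conn)              # persist to the property table
    name_proc = ICUNameProcessor(naming_rules)

    # After: the loader owns parsing, persistence and processor construction.
    loader = ICURuleLoader(config)
    loader.save_config_to_db(conn)             # persist to the property table
    name_proc = loader.make_token_analysis()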
@@ -149,11 +149,14 @@ class AbstractTokenizer(ABC):
 
 
     @abstractmethod
-    def init_from_project(self) -> None:
+    def init_from_project(self, config: Configuration) -> None:
         """ Initialise the tokenizer from an existing database setup.
 
             The function should load all previously saved configuration from
             the project directory and/or the property table.
+
+            Arguments:
+                config: Read-only object with configuration options.
         """
         pass
 
@@ -187,7 +190,7 @@ class AbstractTokenizer(ABC):
 
 
     @abstractmethod
-    def check_database(self) -> str:
+    def check_database(self, config: Configuration) -> str:
         """ Check that the database is set up correctly and ready for being
             queried.
 
@@ -196,6 +199,9 @@ class AbstractTokenizer(ABC):
             description of the issue as well as hints for the user on
             how to resolve the issue.
 
+            Arguments:
+                config: Read-only object with configuration options.
+
             Return `None`, if no issue was found.
         """
         pass
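
Both abstract methods now receive the Configuration. A minimal sketch of what an implementation must provide under the new signatures (the subclass is hypothetical; the remaining abstract members of AbstractTokenizer are omitted):

    from nominatim.config import Configuration
    from nominatim.tokenizer.base import AbstractTokenizer

    class MyTokenizer(AbstractTokenizer):   # hypothetical subclass
        def init_from_project(self, config: Configuration) -> None:
            # Restore previously saved rules from the project directory
            # and/or the database property table.
            ...

        def check_database(self, config: Configuration) -> str:
            # Per the docstring above: return a description of the problem,
            # or None when the database is ready for querying.
            self.init_from_project(config)
            return None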
@@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
     tokenizer_module = _import_tokenizer(name)
 
     tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
-    tokenizer.init_from_project()
+    tokenizer.init_from_project(config)
 
     return tokenizer
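
get_tokenizer_for_db() is the path maintenance commands take against an existing database; it now threads the configuration through to init_from_project(). A hedged usage sketch (obtaining the Configuration object is elided):

    tokenizer = get_tokenizer_for_db(config)
    result = tokenizer.check_database(config)   # also takes the config now
    if result is not None:
        print(result)   # human-readable description of the problem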
@@ -8,67 +8,25 @@ import itertools
 from icu import Transliterator
 import datrie
 
-from nominatim.db.properties import set_property, get_property
-from nominatim.tokenizer import icu_variants as variants
-
-DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
-DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
-DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
-DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
-
-
-class ICUNameProcessorRules:
-    """ Data object that saves the rules needed for the name processor.
-
-        The rules can either be initialised through an ICURuleLoader or
-        be loaded from a database when a connection is given.
-    """
-    def __init__(self, loader=None, conn=None):
-        if loader is not None:
-            self.norm_rules = loader.get_normalization_rules()
-            self.trans_rules = loader.get_transliteration_rules()
-            self.replacements = loader.get_replacement_pairs()
-            self.search_rules = loader.get_search_rules()
-        elif conn is not None:
-            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-            self.replacements = \
-                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
-            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
-        else:
-            assert False, "Parameter loader or conn required."
-
-
-    def save_rules(self, conn):
-        """ Save the rules in the property table of the given database.
-            The rules can be loaded again by handing in a connection into
-            the constructor of the class.
-        """
-        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
-        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
-        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
-                     variants.pickle_variant_set(self.replacements))
-        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
-
-
 class ICUNameProcessor:
     """ Collects the different transformation rules for normalisation of names
-        and provides the functions to aply the transformations.
+        and provides the functions to apply the transformations.
     """
 
-    def __init__(self, rules):
+    def __init__(self, norm_rules, trans_rules, replacements):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
-                                                         rules.norm_rules)
+                                                         norm_rules)
         self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
-                                                       rules.trans_rules +
+                                                       trans_rules +
                                                        ";[:Space:]+ > ' '")
         self.search = Transliterator.createFromRules("icu_search",
-                                                     rules.search_rules)
+                                                     norm_rules + trans_rules)
 
         # Intermediate reorder by source. Also compute required character set.
         immediate = defaultdict(list)
         chars = set()
-        for variant in rules.replacements:
+        for variant in replacements:
             if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                 replstr = variant.replacement[:-1]
             else:
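
With ICUNameProcessorRules gone, ICUNameProcessor receives the three rule sets as plain constructor arguments. Callers are not expected to build it by hand; as the loader hunks below show, the intended path is:

    # config is assumed to be a Nominatim Configuration whose project
    # directory (or TOKENIZER_CONFIG) provides icu_tokenizer.yaml.
    proc = ICURuleLoader(config).make_token_analysis()
    # which internally amounts to:
    #   ICUNameProcessor(loader.normalization_rules,
    #                    loader.transliteration_rules,
    #                    loader.variants)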
@@ -2,17 +2,25 @@
 Helper class to create ICU rules from a configuration file.
 """
 import io
+import json
 import logging
 import itertools
 import re
 
 from icu import Transliterator
 
+from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
 import nominatim.tokenizer.icu_variants as variants
 
 LOG = logging.getLogger()
 
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
+
 
 def _flatten_config_list(content):
     if not content:
         return []
@@ -46,12 +54,43 @@ class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, rules):
+    def __init__(self, config):
+        rules = config.load_sub_configuration('icu_tokenizer.yaml',
+                                              config='TOKENIZER_CONFIG')
+
         self.variants = set()
 
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self._parse_variant_list(self._get_section(rules, 'variants'))
+        self.analysis_rules = self._get_section(rules, 'variants')
+        self._parse_variant_list()
+
+
+    def load_config_from_db(self, conn):
+        """ Get previously saved parts of the configuration from the
+            database.
+        """
+        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        self._parse_variant_list()
+
+
+    def save_config_to_db(self, conn):
+        """ Save the part of the configuration that cannot be changed into
+            the database.
+        """
+        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
+        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
+        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
+
+
+    def make_token_analysis(self):
+        """ Create a token analyser from the previously loaded rules.
+        """
+        return ICUNameProcessor(self.normalization_rules,
+                                self.transliteration_rules,
+                                self.variants)
 
 
     def get_search_rules(self):
@@ -112,7 +151,9 @@ class ICURuleLoader:
         return ';'.join(_flatten_config_list(content)) + ';'
 
 
-    def _parse_variant_list(self, rules):
+    def _parse_variant_list(self):
+        rules = self.analysis_rules
+
         self.variants.clear()
 
         if not rules:
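
The loader now exposes a symmetric persistence API in place of ICUNameProcessorRules.save_rules(). A round-trip sketch using only names from this diff (dsn and config are assumed to come from a normal Nominatim setup; connect() is the helper used elsewhere in this diff):

    # Import time: parse icu_tokenizer.yaml and persist the parts that must
    # stay fixed for the lifetime of the database.
    loader = ICURuleLoader(config)
    with connect(dsn) as conn:
        loader.save_config_to_db(conn)    # rules as strings, variants as JSON

    # Any later run: restore the exact same rules from the property table.
    loader = ICURuleLoader(config)
    with connect(dsn) as conn:
        loader.load_config_from_db(conn)  # also re-runs _parse_variant_list()
    name_proc = loader.make_token_analysis()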
@@ -14,7 +14,6 @@ from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -36,7 +35,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.naming_rules = None
+        self.loader = None
         self.term_normalization = None
 
 
@@ -46,9 +45,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
             This copies all necessary data in the project directory to make
             sure the tokenizer remains stable even over updates.
         """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                                             config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.loader = ICURuleLoader(config)
+
         self.term_normalization = config.TERM_NORMALIZATION
 
         self._install_php(config.lib_dir.php)
@@ -59,11 +57,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
+        self.loader = ICURuleLoader(config)
+
         with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.loader.load_config_from_db(conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
 
 
@@ -81,12 +81,12 @@ class LegacyICUTokenizer(AbstractTokenizer):
         sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
-    def check_database(self):
+    def check_database(self, config):
         """ Check that the tokenizer is set up correctly.
         """
-        self.init_from_project()
+        self.init_from_project(config)
 
-        if self.naming_rules is None:
+        if self.term_normalization is None:
             return "Configuration for tokenizer 'icu' are missing."
 
         return None
@@ -107,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -118,7 +118,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             <?php
             @define('CONST_Max_Word_Frequency', 10000000);
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
@@ -127,8 +127,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
+            self.loader.save_config_to_db(conn)
             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
 
@@ -163,7 +162,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Count the partial terms from the names in the place table.
         """
         words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
+        name_proc = self.loader.make_token_analysis()
 
         with conn.cursor(name="words") as cur:
             cur.execute(""" SELECT v, count(*) FROM
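
Note that name_analyzer() now asks the loader for a fresh ICUNameProcessor on every call. Since analyzers are not thread-safe (see the docstring above), indexing code should keep one per thread, roughly:

    # Hypothetical worker loop; 'tokenizer' is an initialised LegacyICUTokenizer.
    def worker(tokenizer, places):
        analyzer = tokenizer.name_analyzer()   # one analyzer per thread
        for place in places:
            ...  # run the place's names through the analyzer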
@@ -2,7 +2,6 @@
 Data structures for saving variant expansions for ICU tokenizer.
 """
 from collections import namedtuple
-import json
 
 _ICU_VARIANT_PORPERTY_FIELDS = ['lang']
 
@@ -24,34 +23,3 @@ class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS)):
 
 
 ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
-
-
-def pickle_variant_set(variants):
-    """ Serializes an iterable of variant rules to a string.
-    """
-    # Create a list of property sets. So they don't need to be duplicated
-    properties = {}
-    pid = 1
-    for variant in variants:
-        if variant.properties not in properties:
-            properties[variant.properties] = pid
-            pid += 1
-
-    # Convert the variants into a simple list.
-    variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
-
-    # Convert everything to json.
-    return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
-                       'variants': variants})
-
-
-def unpickle_variant_set(variant_string):
-    """ Deserializes a variant string that was previously created with
-        pickle_variant_set() into a set of ICUVariants.
-    """
-    data = json.loads(variant_string)
-
-    properties = {int(k): ICUVariantProperties.from_rules(v)
-                  for k, v in data['properties'].items()}
-
-    return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
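
The pickle_variant_set()/unpickle_variant_set() pair becomes unnecessary because the loader no longer stores expanded ICUVariant tuples: it saves the raw 'variants' section of the configuration with json.dumps() and re-expands it after loading. A sketch of the change in stored format (the example rule list is hypothetical; the "before" shape is inferred from the removed code above):

    import json

    # Before: expanded variants plus a de-duplicated property table, e.g.
    #   {"properties": {"1": {"lang": null}},
    #    "variants": [["saint ", "st ", 1]]}

    # After: the untouched configuration section round-trips through JSON.
    analysis_rules = [{'words': ['saint -> st']}]
    stored = json.dumps(analysis_rules)          # DBCFG_IMPORT_ANALYSIS_RULES
    assert json.loads(stored) == analysis_rules  # lossless and human-readable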
@@ -113,7 +113,7 @@ class LegacyTokenizer(AbstractTokenizer):
         self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, _):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
@@ -142,7 +142,7 @@ class LegacyTokenizer(AbstractTokenizer):
                            modulepath=modulepath)
 
 
-    def check_database(self):
+    def check_database(self, _):
         """ Check that the tokenizer is set up correctly.
         """
         hint = """\
@@ -166,7 +166,7 @@ def check_tokenizer(_, config):
         return CheckState.FAIL, dict(msg="""\
 Cannot load tokenizer. Did the import finish sucessfully?""")
 
-    result = tokenizer.check_database()
+    result = tokenizer.check_database(config)
 
     if result is None:
         return CheckState.OK
@@ -2,6 +2,7 @@
 Tokenizer for testing.
 """
 from nominatim.indexer.place_info import PlaceInfo
+from nominatim.config import Configuration
 
 def create(dsn, data_dir):
     """ Create a new instance of the tokenizer provided by this module.
@@ -22,7 +23,8 @@ class DummyTokenizer:
         self.init_state = "new"
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
+        assert isinstance(config, Configuration)
         assert self.init_state is None
         self.init_state = "loaded"
 
@@ -7,7 +7,6 @@ import yaml
 import pytest
 
 from nominatim.tokenizer import icu_tokenizer
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
 from nominatim.db.sql_preprocessor import SQLPreprocessor
@@ -72,7 +71,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
         cfgstr = {'normalization' : list(norm),
                   'transliteration' : list(trans),
                   'variants' : [ {'words': list(variants)}]}
-        tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+        tok.loader = ICURuleLoader(test_config)
 
         return tok.name_analyzer()
 
@@ -178,9 +178,9 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
     monkeypatch.undo()
 
     tok = tokenizer_factory()
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
-    assert tok.naming_rules is not None
+    assert tok.loader is not None
     assert tok.term_normalization == ':: lower();'
 
 
@@ -4,15 +4,17 @@ Tests for import name normalisation and variant generation.
 from textwrap import dedent
 
 import pytest
+import yaml
 
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
-
 from nominatim.errors import UsageError
 
 @pytest.fixture
-def cfgfile():
+def cfgfile(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
     def _create_config(*variants, **kwargs):
         content = dedent("""\
             normalization:
@@ -30,7 +32,9 @@ def cfgfile():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return def_config
 
     return _create_config
 
@@ -40,10 +44,9 @@ def get_normalized_variants(proc, name):
 
 
 def test_variants_empty(cfgfile):
-    fpath = cfgfile('saint -> 🜵', 'street -> st')
+    config = cfgfile('saint -> 🜵', 'street -> st')
 
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert get_normalized_variants(proc, '🜵') == []
     assert get_normalized_variants(proc, '🜳') == []
@@ -83,8 +86,8 @@ VARIANT_TESTS = [
 
 @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
 def test_variants(cfgfile, rules, name, variants):
-    fpath = cfgfile(*rules)
-    proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
+    config = cfgfile(*rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     result = get_normalized_variants(proc, name)
 
@@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants):
 
 
 def test_search_normalized(cfgfile):
-    fpath = cfgfile('~street => s,st', 'master => mstr')
-
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    config = cfgfile('~street => s,st', 'master => mstr')
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert proc.get_search_normalized('Master Street') == 'master street'
     assert proc.get_search_normalized('Earnes St') == 'earnes st'
@@ -12,7 +12,16 @@ from nominatim.errors import UsageError
 from icu import Transliterator
 
 @pytest.fixture
-def cfgrules():
+def test_config(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
+    return def_config
+
+
+@pytest.fixture
+def cfgrules(test_config):
     def _create_config(*variants, **kwargs):
         content = dedent("""\
             normalization:
@@ -29,19 +38,21 @@ def cfgrules():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return test_config
 
     return _create_config
 
 
-def test_empty_rule_set():
-    rule_cfg = yaml.safe_load(dedent("""\
+def test_empty_rule_set(test_config):
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
        variants:
         """))
 
-    rules = ICURuleLoader(rule_cfg)
+    rules = ICURuleLoader(test_config)
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
@@ -50,11 +61,12 @@ def test_empty_rule_set():
 CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
 
 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
-def test_missing_section(section):
+def test_missing_section(section, test_config):
     rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
 
     with pytest.raises(UsageError):
-        ICURuleLoader(rule_cfg)
+        ICURuleLoader(test_config)
 
 
 def test_get_search_rules(cfgrules):
@@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules):
     assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
 
 
-def test_transliteration_rules_from_file(def_config, tmp_path):
-    def_config.project_dir = tmp_path
-    cfgpath = tmp_path / ('test_config.yaml')
+def test_transliteration_rules_from_file(test_config):
+    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
     cfgpath.write_text(dedent("""\
         normalization:
         transliteration:
@@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path):
             - !include transliteration.yaml
         variants:
         """))
-    transpath = tmp_path / ('transliteration.yaml')
+    transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
 
-    loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
+    loader = ICURuleLoader(test_config)
     rules = loader.get_transliteration_rules()
     trans = Transliterator.createFromRules("test", rules)
 
@@ -132,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config,
     assert not (test_config.project_dir / 'module').exists()
 
 
-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
     tok = tokenizer_factory()
 
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
     assert tok.normalization is not None
 
@@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch,
                          check_result, state):
     class _TestTokenizer:
         @staticmethod
-        def check_database():
+        def check_database(_):
             return check_result
 
     monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',