From 630ad38a6768f8379328795f8aa0e127d7105c44 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 11 Jan 2022 17:51:05 +0100 Subject: [PATCH 1/6] refactor variant production to use generators --- nominatim/tokenizer/token_analysis/generic.py | 37 +++++++++---------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py index f790dad2..05ba885b 100644 --- a/nominatim/tokenizer/token_analysis/generic.py +++ b/nominatim/tokenizer/token_analysis/generic.py @@ -176,14 +176,26 @@ class GenericTokenAnalysis: """ Compute the spelling variants for the given normalized name and transliterate the result. """ + results = set() + for variant in self._generate_word_variants(norm_name): + if not self.variant_only or variant.strip() != norm_name: + trans_name = self.to_ascii.transliterate(variant).strip() + if trans_name: + results.add(trans_name) + + return list(results) + + + def _generate_word_variants(self, norm_name): baseform = '^ ' + norm_name + ' ^' + baselen = len(baseform) partials = [''] startpos = 0 if self.replacements is not None: pos = 0 force_space = False - while pos < len(baseform): + while pos < baselen: full, repl = self.replacements.longest_prefix_item(baseform[pos:], (None, None)) if full is not None: @@ -207,24 +219,9 @@ class GenericTokenAnalysis: # No variants detected? Fast return. if startpos == 0: - if self.variant_only: - return [] + return (norm_name, ) - trans_name = self.to_ascii.transliterate(norm_name).strip() - return [trans_name] if trans_name else [] + if startpos < baselen: + return (part[1:] + baseform[startpos:-1] for part in partials) - return self._compute_result_set(partials, baseform[startpos:], - norm_name if self.variant_only else '') - - - def _compute_result_set(self, partials, prefix, exclude): - results = set() - - for variant in partials: - vname = (variant + prefix)[1:-1].strip() - if vname != exclude: - trans_name = self.to_ascii.transliterate(vname).strip() - if trans_name: - results.add(trans_name) - - return list(results) + return (part[1:-1] for part in partials) From 0192a7af96d32faf5dd319469d376bf4140dfcbb Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 12 Jan 2022 09:53:32 +0100 Subject: [PATCH 2/6] move variant configuration reading in separate file --- .../token_analysis/config_variants.py | 134 ++++++++++++++++++ nominatim/tokenizer/token_analysis/generic.py | 125 +--------------- 2 files changed, 137 insertions(+), 122 deletions(-) create mode 100644 nominatim/tokenizer/token_analysis/config_variants.py diff --git a/nominatim/tokenizer/token_analysis/config_variants.py b/nominatim/tokenizer/token_analysis/config_variants.py new file mode 100644 index 00000000..59ceeb22 --- /dev/null +++ b/nominatim/tokenizer/token_analysis/config_variants.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Parser for configuration for variants. +""" +from collections import defaultdict, namedtuple +import itertools +import re + +from icu import Transliterator + +from nominatim.config import flatten_config_list +from nominatim.errors import UsageError + +ICUVariant = namedtuple('ICUVariant', ['source', 'replacement']) + +def get_variant_config(rules, normalization_rules): + """ Convert the variant definition from the configuration into + replacement sets. 
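+
+        For example, the rule 'street => st' produces a replacement set
+        that maps the source ' street ' to the replacement ' st' (the
+        surrounding blanks mark word boundaries).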
+
+        Returns a tuple containing the replacement set and the list of characters
+        used in the replacements.
+    """
+    immediate = defaultdict(list)
+    chars = set()
+
+    if rules:
+        vset = set()
+        rules = flatten_config_list(rules, 'variants')
+
+        vmaker = _VariantMaker(normalization_rules)
+
+        for section in rules:
+            for rule in (section.get('words') or []):
+                vset.update(vmaker.compute(rule))
+
+        # Intermediate reorder by source. Also compute required character set.
+        for variant in vset:
+            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
+                replstr = variant.replacement[:-1]
+            else:
+                replstr = variant.replacement
+            immediate[variant.source].append(replstr)
+            chars.update(variant.source)
+
+    return list(immediate.items()), ''.join(chars)
+
+
+class _VariantMaker:
+    """ Generator for all necessary ICUVariants from a single variant rule.
+
+        All text in rules is normalized to make sure the variants match later.
+    """
+
+    def __init__(self, norm_rules):
+        self.norm = Transliterator.createFromRules("rule_loader_normalization",
+                                                   norm_rules)
+
+
+    def compute(self, rule):
+        """ Generator for all ICUVariant tuples from a single variant rule.
+        """
+        parts = re.split(r'(\|)?([=-])>', rule)
+        if len(parts) != 4:
+            raise UsageError("Syntax error in variant rule: " + rule)
+
+        decompose = parts[1] is None
+        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
+        repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
+
+        # If the source should be kept, add a 1:1 replacement
+        if parts[2] == '-':
+            for src in src_terms:
+                if src:
+                    for froms, tos in _create_variants(*src, src[0], decompose):
+                        yield ICUVariant(froms, tos)
+
+        for src, repl in itertools.product(src_terms, repl_terms):
+            if src and repl:
+                for froms, tos in _create_variants(*src, repl, decompose):
+                    yield ICUVariant(froms, tos)
+
+
+    def _parse_variant_word(self, name):
+        name = name.strip()
+        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
+        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
+            raise UsageError("Invalid variant word descriptor '{}'".format(name))
+        norm_name = self.norm.transliterate(match.group(2)).strip()
+        if not norm_name:
+            return None
+
+        return norm_name, match.group(1), match.group(3)
+
+
+_FLAG_MATCH = {'^': '^ ',
+               '$': ' ^',
+               '': ' '}
+
+
+def _create_variants(src, preflag, postflag, repl, decompose):
+    if preflag == '~':
+        postfix = _FLAG_MATCH[postflag]
+        # suffix decomposition
+        src = src + postfix
+        repl = repl + postfix
+
+        yield src, repl
+        yield ' ' + src, ' ' + repl
+
+        if decompose:
+            yield src, ' ' + repl
+            yield ' ' + src, repl
+    elif postflag == '~':
+        # prefix decomposition
+        prefix = _FLAG_MATCH[preflag]
+        src = prefix + src
+        repl = prefix + repl
+
+        yield src, repl
+        yield src + ' ', repl + ' '
+
+        if decompose:
+            yield src, repl + ' '
+            yield src + ' ', repl
+    else:
+        prefix = _FLAG_MATCH[preflag]
+        postfix = _FLAG_MATCH[postflag]
+
+        yield prefix + src + postfix, prefix + repl + postfix
diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py
index 05ba885b..b9bd9bdf 100644
--- a/nominatim/tokenizer/token_analysis/generic.py
+++ b/nominatim/tokenizer/token_analysis/generic.py
@@ -7,145 +7,26 @@
 """
 Generic processor for names that creates abbreviation variants.
""" -from collections import defaultdict, namedtuple import itertools -import re -from icu import Transliterator import datrie -from nominatim.config import flatten_config_list -from nominatim.errors import UsageError +from nominatim.tokenizer.token_analysis.config_variants import get_variant_config ### Configuration section -ICUVariant = namedtuple('ICUVariant', ['source', 'replacement']) - def configure(rules, normalization_rules): """ Extract and preprocess the configuration for this module. """ config = {} - config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'), - normalization_rules) + config['replacements'], config['chars'] = get_variant_config(rules.get('variants'), + normalization_rules) config['variant_only'] = rules.get('mode', '') == 'variant-only' return config -def _get_variant_config(rules, normalization_rules): - """ Convert the variant definition from the configuration into - replacement sets. - """ - immediate = defaultdict(list) - chars = set() - - if rules: - vset = set() - rules = flatten_config_list(rules, 'variants') - - vmaker = _VariantMaker(normalization_rules) - - for section in rules: - for rule in (section.get('words') or []): - vset.update(vmaker.compute(rule)) - - # Intermediate reorder by source. Also compute required character set. - for variant in vset: - if variant.source[-1] == ' ' and variant.replacement[-1] == ' ': - replstr = variant.replacement[:-1] - else: - replstr = variant.replacement - immediate[variant.source].append(replstr) - chars.update(variant.source) - - return list(immediate.items()), ''.join(chars) - - -class _VariantMaker: - """ Generater for all necessary ICUVariants from a single variant rule. - - All text in rules is normalized to make sure the variants match later. - """ - - def __init__(self, norm_rules): - self.norm = Transliterator.createFromRules("rule_loader_normalization", - norm_rules) - - - def compute(self, rule): - """ Generator for all ICUVariant tuples from a single variant rule. 
- """ - parts = re.split(r'(\|)?([=-])>', rule) - if len(parts) != 4: - raise UsageError("Syntax error in variant rule: " + rule) - - decompose = parts[1] is None - src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')] - repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(',')) - - # If the source should be kept, add a 1:1 replacement - if parts[2] == '-': - for src in src_terms: - if src: - for froms, tos in _create_variants(*src, src[0], decompose): - yield ICUVariant(froms, tos) - - for src, repl in itertools.product(src_terms, repl_terms): - if src and repl: - for froms, tos in _create_variants(*src, repl, decompose): - yield ICUVariant(froms, tos) - - - def _parse_variant_word(self, name): - name = name.strip() - match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name) - if match is None or (match.group(1) == '~' and match.group(3) == '~'): - raise UsageError("Invalid variant word descriptor '{}'".format(name)) - norm_name = self.norm.transliterate(match.group(2)).strip() - if not norm_name: - return None - - return norm_name, match.group(1), match.group(3) - - -_FLAG_MATCH = {'^': '^ ', - '$': ' ^', - '': ' '} - - -def _create_variants(src, preflag, postflag, repl, decompose): - if preflag == '~': - postfix = _FLAG_MATCH[postflag] - # suffix decomposition - src = src + postfix - repl = repl + postfix - - yield src, repl - yield ' ' + src, ' ' + repl - - if decompose: - yield src, ' ' + repl - yield ' ' + src, repl - elif postflag == '~': - # prefix decomposition - prefix = _FLAG_MATCH[preflag] - src = prefix + src - repl = prefix + repl - - yield src, repl - yield src + ' ', repl + ' ' - - if decompose: - yield src, repl + ' ' - yield src + ' ', repl - else: - prefix = _FLAG_MATCH[preflag] - postfix = _FLAG_MATCH[postflag] - - yield prefix + src + postfix, prefix + repl + postfix - - ### Analysis section def create(transliterator, config): From b453b0ea95e7b1244912b7bc9fc26f58acb8ec80 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 12 Jan 2022 16:25:47 +0100 Subject: [PATCH 3/6] introduce mutation variants to generic token analyser Mutations are regular-expression-based replacements that are applied after variants have been computed. They are meant to be used for variations on character level. Add spelling variations for German umlauts. 
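
As an illustration, one of the mutation rules added to
settings/icu_tokenizer.yaml by this patch reads:

    mutations:
        - pattern: ö
          replacements: ["ö", "oe"]

Each match of the pattern is replaced with every listed replacement in
turn, so the variant 'köln' additionally produces 'koeln'.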
--- nominatim/tokenizer/token_analysis/generic.py | 43 +++++++-- .../token_analysis/generic_mutation.py | 56 ++++++++++++ settings/icu_tokenizer.yaml | 7 ++ test/bdd/db/import/naming.feature | 45 ++++++++++ .../token_analysis/test_generic_mutation.py | 89 +++++++++++++++++++ 5 files changed, 233 insertions(+), 7 deletions(-) create mode 100644 nominatim/tokenizer/token_analysis/generic_mutation.py create mode 100644 test/python/tokenizer/token_analysis/test_generic_mutation.py diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py index b9bd9bdf..1e7b75a1 100644 --- a/nominatim/tokenizer/token_analysis/generic.py +++ b/nominatim/tokenizer/token_analysis/generic.py @@ -11,7 +11,9 @@ import itertools import datrie +from nominatim.errors import UsageError from nominatim.tokenizer.token_analysis.config_variants import get_variant_config +from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator ### Configuration section @@ -23,6 +25,7 @@ def configure(rules, normalization_rules): config['replacements'], config['chars'] = get_variant_config(rules.get('variants'), normalization_rules) config['variant_only'] = rules.get('mode', '') == 'variant-only' + config['mutations'] = rules.get('mutations', []) return config @@ -52,19 +55,45 @@ class GenericTokenAnalysis: else: self.replacements = None + # set up mutation rules + self.mutations = [] + for cfg in config['mutations']: + if 'pattern' not in cfg: + raise UsageError("Missing field 'pattern' in mutation configuration.") + if not isinstance(cfg['pattern'], str): + raise UsageError("Field 'pattern' in mutation configuration " + "must be a simple text field.") + if 'replacements' not in cfg: + raise UsageError("Missing field 'replacements' in mutation configuration.") + if not isinstance(cfg['replacements'], list): + raise UsageError("Field 'replacements' in mutation configuration " + "must be a list of texts.") + + self.mutations.append(MutationVariantGenerator(cfg['pattern'], + cfg['replacements'])) + def get_variants_ascii(self, norm_name): """ Compute the spelling variants for the given normalized name and transliterate the result. """ - results = set() - for variant in self._generate_word_variants(norm_name): - if not self.variant_only or variant.strip() != norm_name: - trans_name = self.to_ascii.transliterate(variant).strip() - if trans_name: - results.add(trans_name) + variants = self._generate_word_variants(norm_name) - return list(results) + for mutation in self.mutations: + variants = mutation.generate(variants) + + return [name for name in self._transliterate_unique_list(norm_name, variants) if name] + + + def _transliterate_unique_list(self, norm_name, iterable): + seen = set() + if self.variant_only: + seen.add(norm_name) + + for variant in map(str.strip, iterable): + if variant not in seen: + seen.add(variant) + yield self.to_ascii.transliterate(variant).strip() def _generate_word_variants(self, norm_name): diff --git a/nominatim/tokenizer/token_analysis/generic_mutation.py b/nominatim/tokenizer/token_analysis/generic_mutation.py new file mode 100644 index 00000000..d23d5cd4 --- /dev/null +++ b/nominatim/tokenizer/token_analysis/generic_mutation.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Creator for mutation variants for the generic token analysis. 
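+
+Example: a mutation with pattern 'ö' and replacements ['ö', 'oe']
+expands the variant 'köln' into the two variants 'köln' and 'koeln'.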
+"""
+import itertools
+import logging
+import re
+
+from nominatim.errors import UsageError
+
+LOG = logging.getLogger()
+
+def _zigzag(outer, inner):
+    return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
+
+
+class MutationVariantGenerator:
+    """ Generates name variants by applying a regular expression to the name
+        and replacing it with one or more variants. When the regular expression
+        matches more than once, each occurrence is replaced with all replacement
+        patterns.
+    """
+
+    def __init__(self, pattern, replacements):
+        self.pattern = re.compile(pattern)
+        self.replacements = replacements
+
+        if self.pattern.groups > 0:
+            LOG.fatal("The mutation pattern %s contains a capturing group. "
+                      "This is not allowed.", pattern)
+            raise UsageError("Bad mutation pattern in configuration.")
+
+
+    def generate(self, names):
+        """ Generator function for the name variants. 'names' is an iterable
+            over a set of names for which the variants are to be generated.
+        """
+        for name in names:
+            parts = self.pattern.split(name)
+            if len(parts) == 1:
+                yield name
+            else:
+                for seps in self._fillers(len(parts)):
+                    yield ''.join(_zigzag(parts, seps))
+
+
+    def _fillers(self, num_parts):
+        """ Returns a generator for strings to join the given number of string
+            parts in all possible combinations.
+        """
+        return itertools.product(self.replacements, repeat=num_parts - 1)
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index a3c62e67..c6601faf 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -59,6 +59,13 @@ token-analysis:
       mode: variant-only
       variants:
           - !include icu-rules/variants-de.yaml
+      mutations:
+          - pattern: ä
+            replacements: ["ä", "ae"]
+          - pattern: ö
+            replacements: ["ö", "oe"]
+          - pattern: ü
+            replacements: ["ü", "ue"]
     - id: el
       analyzer: generic
       mode: variant-only
diff --git a/test/bdd/db/import/naming.feature b/test/bdd/db/import/naming.feature
index bb29d2a3..b739cbae 100644
--- a/test/bdd/db/import/naming.feature
+++ b/test/bdd/db/import/naming.feature
@@ -58,3 +58,48 @@ Feature: Import and search of names
             | រាជធានីភ្នំពេញ |
             | 東京都 |
             | ပုဗ္ဗသီရိ |
+
+
+    Scenario: German umlauts can be found when expanded
+        Given the places
+            | osm | class | type | name+name:de |
+            | N1  | place | city | Münster      |
+            | N2  | place | city | Köln         |
+            | N3  | place | city | Gräfenroda   |
+        When importing
+        When sending search query "münster"
+        Then results contain
+            | osm |
+            | N1  |
+        When sending search query "muenster"
+        Then results contain
+            | osm |
+            | N1  |
+        When sending search query "munster"
+        Then results contain
+            | osm |
+            | N1  |
+        When sending search query "Köln"
+        Then results contain
+            | osm |
+            | N2  |
+        When sending search query "Koeln"
+        Then results contain
+            | osm |
+            | N2  |
+        When sending search query "Koln"
+        Then results contain
+            | osm |
+            | N2  |
+        When sending search query "gräfenroda"
+        Then results contain
+            | osm |
+            | N3  |
+        When sending search query "graefenroda"
+        Then results contain
+            | osm |
+            | N3  |
+        When sending search query "grafenroda"
+        Then results contain
+            | osm |
+            | N3  |
diff --git a/test/python/tokenizer/token_analysis/test_generic_mutation.py b/test/python/tokenizer/token_analysis/test_generic_mutation.py
new file mode 100644
index 00000000..757f0311
--- /dev/null
+++ b/test/python/tokenizer/token_analysis/test_generic_mutation.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim.
+# (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for generic token analysis, mutation part.
+"""
+import pytest
+
+from icu import Transliterator
+
+import nominatim.tokenizer.token_analysis.generic as module
+from nominatim.errors import UsageError
+
+DEFAULT_NORMALIZATION = """ '🜳' > ' ';
+                            [[:Nonspacing Mark:] [:Cf:]] >;
+                            :: lower ();
+                            [[:Punctuation:][:Space:]]+ > ' '
+                        """
+
+DEFAULT_TRANSLITERATION = """ ::  Latin ();
+                              '🜵' > ' ';
+                          """
+
+class TestMutationNoVariants:
+
+    def make_analyser(self, *mutations):
+        rules = { 'analyzer': 'generic',
+                  'mutations': [ {'pattern': m[0], 'replacements': m[1]}
+                                 for m in mutations]
+                }
+        config = module.configure(rules, DEFAULT_NORMALIZATION)
+        trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+
+        self.analysis = module.create(trans, config)
+
+
+    def variants(self, name):
+        norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+        return set(self.analysis.get_variants_ascii(norm.transliterate(name).strip()))
+
+
+    @pytest.mark.parametrize('pattern', ('(capture)', ['a list']))
+    def test_bad_pattern(self, pattern):
+        with pytest.raises(UsageError):
+            self.make_analyser((pattern, ['b']))
+
+
+    @pytest.mark.parametrize('replacements', (None, 'a string'))
+    def test_bad_replacement(self, replacements):
+        with pytest.raises(UsageError):
+            self.make_analyser(('a', replacements))
+
+
+    def test_simple_replacement(self):
+        self.make_analyser(('a', ['b']))
+
+        assert self.variants('none') == {'none'}
+        assert self.variants('abba') == {'bbbb'}
+        assert self.variants('2 aar') == {'2 bbr'}
+
+
+    def test_multichar_replacement(self):
+        self.make_analyser(('1 1', ['1 1 1']))
+
+        assert self.variants('1 1456') == {'1 1 1456'}
+        assert self.variants('1 1 1') == {'1 1 1 1'}
+
+
+    def test_removal_replacement(self):
+        self.make_analyser((' ', [' ', '']))
+
+        assert self.variants('A 345') == {'a 345', 'a345'}
+        assert self.variants('a g b') == {'a g b', 'ag b', 'a gb', 'agb'}
+
+
+    def test_regex_pattern(self):
+        self.make_analyser(('[^a-z]+', ['XXX', ' ']))
+
+        assert self.variants('a-34n12') == {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'}
+
+
+    def test_multiple_mutations(self):
+        self.make_analyser(('ä', ['ä', 'ae']), ('ö', ['ö', 'oe']))
+
+        assert self.variants('Längenöhr') == {'längenöhr', 'laengenöhr',
+                                              'längenoehr', 'laengenoehr'}
From 4a41bff3ab52d64bd76275e6a337a3fab3922065 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 12 Jan 2022 17:37:06 +0100
Subject: [PATCH 4/6] add documentation for new mutation feature

---
 docs/customize/Tokenizers.md | 38 +++++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md
index 4d5fbb15..5c766f50 100644
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -99,6 +99,9 @@ token-analysis:
     - words:
         - road -> rd
         - bridge -> bdge,br,brdg,bri,brg
+      mutations:
+          - pattern: 'ä'
+            replacements: ['ä', 'ae']
 ```
 
 The configuration file contains four sections:
@@ -205,12 +208,11 @@ the `analyzer` parameter must be set. Currently there is only one implementation
 ##### Generic token analyzer
 
 The generic analyzer is able to create variants from a list of given
-abbreviation and decomposition replacements. It takes one optional parameter
-`variants` which lists the replacements to apply.
-If the section is
-omitted, then the generic analyzer becomes a simple analyzer that only
-applies the transliteration.
+abbreviation and decomposition replacements and to introduce spelling variations.
 
-The variants section defines lists of replacements which create alternative
+###### Variants
+
+The optional 'variants' section defines lists of replacements which create alternative
 spellings of a name. To create the variants, a name is scanned from left
 to right and the longest matching replacement is applied until the end of
 the string is reached.
@@ -296,6 +298,32 @@ decomposition has an effect here on the source as well. So a rule
 means that for a word like `hauptstrasse` four variants are created:
 `hauptstrasse`, `haupt strasse`, `hauptstr` and `haupt str`.
 
+###### Mutations
+
+The 'mutations' section in the configuration describes an additional set of
+replacements to be applied after the variants have been computed.
+
+Each mutation is described by two parameters: `pattern` and `replacements`.
+The pattern must contain a single regular expression to search for in the
+variant name. It must follow the syntax for
+[Python regular expressions](https://docs.python.org/3/library/re.html#regular-expression-syntax).
+Capturing groups are not permitted.
+`replacements` must contain a list of strings with which the pattern
+should be replaced. Each occurrence of the pattern is replaced with all
+given replacements, so be mindful of the combinatorial explosion of
+variants this can cause.
+
+###### Modes
+
+The generic analyser supports a special mode `variant-only`. In this mode,
+the analyser does not emit the original name itself but only the computed
+variants (if any exist). Enable the mode by adding:
+
+```
+  mode: variant-only
+```
+
+to the analyser configuration.
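+
+Putting the pieces together, the German profile in
+`settings/icu_tokenizer.yaml` is a complete example of a generic
+analyser that emits only variants and additionally expands umlauts
+(entry reconstructed here in abridged form; see the settings file for
+the full context):
+
+```
+token-analysis:
+    - id: de
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-de.yaml
+      mutations:
+          - pattern: ä
+            replacements: ["ä", "ae"]
+          - pattern: ö
+            replacements: ["ö", "oe"]
+          - pattern: ü
+            replacements: ["ü", "ue"]
+```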
+ ### Reconfiguration Changing the configuration after the import is currently not possible, although From adbaf700cdcec5cac10046fde671b615ab127b5a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 12 Jan 2022 19:41:16 +0100 Subject: [PATCH 5/6] move parsing of mutation config to setup phase --- nominatim/tokenizer/token_analysis/generic.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py index 1e7b75a1..a5c9f493 100644 --- a/nominatim/tokenizer/token_analysis/generic.py +++ b/nominatim/tokenizer/token_analysis/generic.py @@ -25,7 +25,22 @@ def configure(rules, normalization_rules): config['replacements'], config['chars'] = get_variant_config(rules.get('variants'), normalization_rules) config['variant_only'] = rules.get('mode', '') == 'variant-only' - config['mutations'] = rules.get('mutations', []) + + # parse mutation rules + config['mutations'] = [] + for rule in rules.get('mutations', []): + if 'pattern' not in rule: + raise UsageError("Missing field 'pattern' in mutation configuration.") + if not isinstance(rule['pattern'], str): + raise UsageError("Field 'pattern' in mutation configuration " + "must be a simple text field.") + if 'replacements' not in rule: + raise UsageError("Missing field 'replacements' in mutation configuration.") + if not isinstance(rule['replacements'], list): + raise UsageError("Field 'replacements' in mutation configuration " + "must be a list of texts.") + + config['mutations'].append((rule['pattern'], rule['replacements'])) return config @@ -56,21 +71,7 @@ class GenericTokenAnalysis: self.replacements = None # set up mutation rules - self.mutations = [] - for cfg in config['mutations']: - if 'pattern' not in cfg: - raise UsageError("Missing field 'pattern' in mutation configuration.") - if not isinstance(cfg['pattern'], str): - raise UsageError("Field 'pattern' in mutation configuration " - "must be a simple text field.") - if 'replacements' not in cfg: - raise UsageError("Missing field 'replacements' in mutation configuration.") - if not isinstance(cfg['replacements'], list): - raise UsageError("Field 'replacements' in mutation configuration " - "must be a list of texts.") - - self.mutations.append(MutationVariantGenerator(cfg['pattern'], - cfg['replacements'])) + self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']] def get_variants_ascii(self, norm_name): From 3df560ea388ecd2f2dfdb0b6d45a46fc75aed73f Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 13 Jan 2022 09:30:31 +0100 Subject: [PATCH 6/6] fix linting error --- nominatim/tokenizer/token_analysis/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py index a5c9f493..d4eae312 100644 --- a/nominatim/tokenizer/token_analysis/generic.py +++ b/nominatim/tokenizer/token_analysis/generic.py @@ -38,7 +38,7 @@ def configure(rules, normalization_rules): raise UsageError("Missing field 'replacements' in mutation configuration.") if not isinstance(rule['replacements'], list): raise UsageError("Field 'replacements' in mutation configuration " - "must be a list of texts.") + "must be a list of texts.") config['mutations'].append((rule['pattern'], rule['replacements']))