Merge pull request #2585 from lonvia/name-mutations

Introduce character mutations to token analysis
Sarah Hoffmann 2022-01-19 17:09:36 +01:00 committed by GitHub
commit a7e048484b
7 changed files with 413 additions and 146 deletions

View File

@@ -99,6 +99,9 @@ token-analysis:
          - words:
              - road -> rd
              - bridge -> bdge,br,brdg,bri,brg
      mutations:
          - pattern: 'ä'
            replacements: ['ä', 'ae']
```
The configuration file contains four sections:
@@ -205,12 +208,11 @@ the `analyzer` parameter must be set. Currently there is only one implementation
##### Generic token analyzer

The generic analyzer is able to create variants from a list of given
abbreviation and decomposition replacements and introduce spelling variations.

###### Variants

The optional 'variants' section defines lists of replacements which create alternative
spellings of a name. To create the variants, a name is scanned from left to
right and the longest matching replacement is applied until the end of the
string is reached.
@@ -296,6 +298,32 @@ decomposition has an effect here on the source as well. So a rule
means that for a word like `hauptstrasse` four variants are created:
`hauptstrasse`, `haupt strasse`, `hauptstr` and `haupt str`.
###### Mutations

The 'mutations' section in the configuration describes an additional set of
replacements to be applied after the variants have been computed.

Each mutation is described by two parameters: `pattern` and `replacements`.
The pattern must contain a single regular expression to search for in the
variant name. The regular expressions need to follow the syntax for
[Python regular expressions](https://docs.python.org/3/library/re.html#regular-expression-syntax).
Capturing groups are not permitted.
`replacements` must contain a list of strings that the pattern
should be replaced with. Each occurrence of the pattern is replaced with
all given replacements, so be mindful of the combinatorial explosion of
variants this can cause.
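
As an illustration, consider a single mutation with pattern `ä` and
replacements `['ä', 'ae']`. A sketch of the expansion using the
`MutationVariantGenerator` class introduced by this PR (an internal helper,
not a public API; the example name is made up):

```
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator

mutation = MutationVariantGenerator('ä', ['ä', 'ae'])

# Two occurrences of the pattern yield 2 x 2 = 4 variants.
list(mutation.generate(['bärenschänke']))
# ['bärenschänke', 'bärenschaenke', 'baerenschänke', 'baerenschaenke']
```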
###### Modes
The generic analyser supports a special mode `variant-only`. When this mode
is enabled, the analyser consumes the input token and emits only the
variants (if any exist). Enable the mode by adding:
```
mode: variant-only
```
to the analyser configuration.
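
A minimal sketch of the effect, assuming a trivial lower-casing normalization
and a single abbreviation rule (illustrative setup only, modelled on the unit
tests in this PR):

```
import nominatim.tokenizer.token_analysis.generic as module
from icu import Transliterator

rules = {'analyzer': 'generic', 'mode': 'variant-only',
         'variants': [{'words': ['road -> rd']}]}
config = module.configure(rules, ':: lower ();')
trans = Transliterator.createFromRules("test_trans", ":: Latin ();")
analysis = module.create(trans, config)

# The input token itself is consumed; only the variant is emitted.
analysis.get_variants_ascii('road')   # -> ['rd']
```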
### Reconfiguration
Changing the configuration after the import is currently not possible, although

View File

@@ -0,0 +1,134 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Parser for configuration for variants.
"""
from collections import defaultdict, namedtuple
import itertools
import re
from icu import Transliterator
from nominatim.config import flatten_config_list
from nominatim.errors import UsageError
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
def get_variant_config(rules, normalization_rules):
    """ Convert the variant definition from the configuration into
        replacement sets.

        Returns a tuple containing the replacement set and the list of characters
        used in the replacements.
    """
    immediate = defaultdict(list)
    chars = set()

    if rules:
        vset = set()
        rules = flatten_config_list(rules, 'variants')

        vmaker = _VariantMaker(normalization_rules)

        for section in rules:
            for rule in (section.get('words') or []):
                vset.update(vmaker.compute(rule))

        # Intermediate reorder by source. Also compute required character set.
        for variant in vset:
            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                replstr = variant.replacement[:-1]
            else:
                replstr = variant.replacement
            immediate[variant.source].append(replstr)
            chars.update(variant.source)

    return list(immediate.items()), ''.join(chars)
class _VariantMaker:
    """ Generator for all necessary ICUVariants from a single variant rule.

        All text in rules is normalized to make sure the variants match later.
    """

    def __init__(self, norm_rules):
        self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                   norm_rules)

    def compute(self, rule):
        """ Generator for all ICUVariant tuples from a single variant rule.
        """
        parts = re.split(r'(\|)?([=-])>', rule)
        if len(parts) != 4:
            raise UsageError("Syntax error in variant rule: " + rule)

        decompose = parts[1] is None
        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
        repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))

        # If the source should be kept, add a 1:1 replacement
        if parts[2] == '-':
            for src in src_terms:
                if src:
                    for froms, tos in _create_variants(*src, src[0], decompose):
                        yield ICUVariant(froms, tos)

        for src, repl in itertools.product(src_terms, repl_terms):
            if src and repl:
                for froms, tos in _create_variants(*src, repl, decompose):
                    yield ICUVariant(froms, tos)

    def _parse_variant_word(self, name):
        name = name.strip()
        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
            raise UsageError("Invalid variant word descriptor '{}'".format(name))
        norm_name = self.norm.transliterate(match.group(2)).strip()
        if not norm_name:
            return None

        return norm_name, match.group(1), match.group(3)
_FLAG_MATCH = {'^': '^ ',
               '$': ' ^',
               '': ' '}


def _create_variants(src, preflag, postflag, repl, decompose):
    if preflag == '~':
        postfix = _FLAG_MATCH[postflag]
        # suffix decomposition
        src = src + postfix
        repl = repl + postfix

        yield src, repl
        yield ' ' + src, ' ' + repl

        if decompose:
            yield src, ' ' + repl
            yield ' ' + src, repl
    elif postflag == '~':
        # prefix decomposition
        prefix = _FLAG_MATCH[preflag]
        src = prefix + src
        repl = prefix + repl

        yield src, repl
        yield src + ' ', repl + ' '

        if decompose:
            yield src, repl + ' '
            yield src + ' ', repl
    else:
        prefix = _FLAG_MATCH[preflag]
        postfix = _FLAG_MATCH[postflag]

        yield prefix + src + postfix, prefix + repl + postfix
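
# Usage sketch (illustration only, not part of this module): a single
# abbreviation rule under a lower-casing normalization comes back as a
# space-delimited replacement table plus the character set needed for the
# replacement trie:
#
#   replacements, chars = get_variant_config([{'words': ['road -> rd']}],
#                                            ':: lower ();')
#   # replacements maps ' road ' to [' road', ' rd'] (order may vary)
#   # chars contains every character appearing in the sources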

View File

@@ -7,145 +7,44 @@
"""
Generic processor for names that creates abbreviation variants.
"""
import itertools

import datrie

from nominatim.errors import UsageError
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator

### Configuration section
def configure(rules, normalization_rules):
    """ Extract and preprocess the configuration for this module.
    """
    config = {}

    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
                                                                 normalization_rules)
    config['variant_only'] = rules.get('mode', '') == 'variant-only'

    # parse mutation rules
    config['mutations'] = []
    for rule in rules.get('mutations', []):
        if 'pattern' not in rule:
            raise UsageError("Missing field 'pattern' in mutation configuration.")
        if not isinstance(rule['pattern'], str):
            raise UsageError("Field 'pattern' in mutation configuration "
                             "must be a simple text field.")
        if 'replacements' not in rule:
            raise UsageError("Missing field 'replacements' in mutation configuration.")
        if not isinstance(rule['replacements'], list):
            raise UsageError("Field 'replacements' in mutation configuration "
                             "must be a list of texts.")

        config['mutations'].append((rule['pattern'], rule['replacements']))

    return config
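
# Usage sketch (illustration only): mutation rules are reduced to
# (pattern, replacements) tuples, ready to feed MutationVariantGenerator.
#
#   config = configure({'mutations': [{'pattern': 'ä',
#                                      'replacements': ['ä', 'ae']}]},
#                      ':: lower ();')
#   # config['mutations'] == [('ä', ['ä', 'ae'])]
#   # config['variant_only'] is False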
### Analysis section
def create(transliterator, config):
@@ -171,19 +70,43 @@ class GenericTokenAnalysis:
        else:
            self.replacements = None

        # set up mutation rules
        self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
        """
        variants = self._generate_word_variants(norm_name)

        for mutation in self.mutations:
            variants = mutation.generate(variants)

        return [name for name in self._transliterate_unique_list(norm_name, variants) if name]

    def _transliterate_unique_list(self, norm_name, iterable):
        seen = set()
        if self.variant_only:
            seen.add(norm_name)

        for variant in map(str.strip, iterable):
            if variant not in seen:
                seen.add(variant)
                yield self.to_ascii.transliterate(variant).strip()
    def _generate_word_variants(self, norm_name):
        baseform = '^ ' + norm_name + ' ^'
        baselen = len(baseform)
        partials = ['']

        startpos = 0
        if self.replacements is not None:
            pos = 0
            force_space = False
            while pos < baselen:
                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                                   (None, None))
                if full is not None:
@@ -207,24 +130,9 @@ class GenericTokenAnalysis:
        # No variants detected? Fast return.
        if startpos == 0:
            return (norm_name, )

        if startpos < baselen:
            return (part[1:] + baseform[startpos:-1] for part in partials)

        return (part[1:-1] for part in partials)

View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Creator for mutation variants for the generic token analysis.
"""
import itertools
import logging
import re
from nominatim.errors import UsageError
LOG = logging.getLogger()
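# Interleave the text parts of a split name with separator fillers, e.g.
# _zigzag(['b', 'rensch', 'nke'], ('ae', 'ä')) yields 'b', 'ae', 'rensch', 'ä', 'nke'.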
def _zigzag(outer, inner):
    return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
class MutationVariantGenerator:
    """ Generates name variants by applying a regular expression to the name
        and replacing it with one or more variants. When the regular expression
        matches more than once, each occurrence is replaced with all replacement
        patterns.
    """

    def __init__(self, pattern, replacements):
        self.pattern = re.compile(pattern)
        self.replacements = replacements

        if self.pattern.groups > 0:
            LOG.fatal("The mutation pattern %s contains a capturing group. "
                      "This is not allowed.", pattern)
            raise UsageError("Bad mutation pattern in configuration.")

    def generate(self, names):
        """ Generator function for the name variants. 'names' is an iterable
            over a set of names for which the variants are to be generated.
        """
        for name in names:
            parts = self.pattern.split(name)
            if len(parts) == 1:
                yield name
            else:
                for seps in self._fillers(len(parts)):
                    yield ''.join(_zigzag(parts, seps))

    def _fillers(self, num_parts):
        """ Returns a generator for strings to join the given number of string
            parts in all possible combinations.
        """
        return itertools.product(self.replacements, repeat=num_parts - 1)

View File

@@ -59,6 +59,13 @@ token-analysis:
      mode: variant-only
      variants:
          - !include icu-rules/variants-de.yaml
      mutations:
          - pattern: ä
            replacements: ["ä", "ae"]
          - pattern: ö
            replacements: ["ö", "oe"]
          - pattern: ü
            replacements: ["ü", "ue"]
    - id: el
      analyzer: generic
      mode: variant-only

View File

@@ -58,3 +58,48 @@ Feature: Import and search of names
    Scenario: German umlauts can be found when expanded
        Given the places
            | osm | class | type | name+name:de |
            | N1  | place | city | Münster      |
            | N2  | place | city | Köln         |
            | N3  | place | city | Gräfenroda   |
        When importing
        When sending search query "münster"
        Then results contain
            | osm |
            | N1  |
        When sending search query "muenster"
        Then results contain
            | osm |
            | N1  |
        When sending search query "munster"
        Then results contain
            | osm |
            | N1  |
        When sending search query "Köln"
        Then results contain
            | osm |
            | N2  |
        When sending search query "Koeln"
        Then results contain
            | osm |
            | N2  |
        When sending search query "Koln"
        Then results contain
            | osm |
            | N2  |
        When sending search query "gräfenroda"
        Then results contain
            | osm |
            | N3  |
        When sending search query "graefenroda"
        Then results contain
            | osm |
            | N3  |

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for generic token analysis, mutation part.
"""
import pytest
from icu import Transliterator
import nominatim.tokenizer.token_analysis.generic as module
from nominatim.errors import UsageError
DEFAULT_NORMALIZATION = """ '🜳' > ' ';
                            [[:Nonspacing Mark:] [:Cf:]] >;
                            :: lower ();
                            [[:Punctuation:][:Space:]]+ > ' '
                        """

DEFAULT_TRANSLITERATION = """ :: Latin ();
                              '🜵' > ' ';
                          """
class TestMutationNoVariants:

    def make_analyser(self, *mutations):
        rules = { 'analyzer': 'generic',
                  'mutations': [ {'pattern': m[0], 'replacements': m[1]}
                                 for m in mutations]
                }
        config = module.configure(rules, DEFAULT_NORMALIZATION)
        trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

        self.analysis = module.create(trans, config)


    def variants(self, name):
        norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
        return set(self.analysis.get_variants_ascii(norm.transliterate(name).strip()))


    @pytest.mark.parametrize('pattern', ('(capture)', ['a list']))
    def test_bad_pattern(self, pattern):
        with pytest.raises(UsageError):
            self.make_analyser((pattern, ['b']))


    @pytest.mark.parametrize('replacements', (None, 'a string'))
    def test_bad_replacement(self, replacements):
        with pytest.raises(UsageError):
            self.make_analyser(('a', replacements))


    def test_simple_replacement(self):
        self.make_analyser(('a', ['b']))

        assert self.variants('none') == {'none'}
        assert self.variants('abba') == {'bbbb'}
        assert self.variants('2 aar') == {'2 bbr'}


    def test_multichar_replacement(self):
        self.make_analyser(('1 1', ['1 1 1']))

        assert self.variants('1 1456') == {'1 1 1456'}
        assert self.variants('1 1 1') == {'1 1 1 1'}


    def test_removement_replacement(self):
        self.make_analyser((' ', [' ', '']))

        assert self.variants('A 345') == {'a 345', 'a345'}
        assert self.variants('a g b') == {'a g b', 'ag b', 'a gb', 'agb'}


    def test_regex_pattern(self):
        self.make_analyser(('[^a-z]+', ['XXX', ' ']))

        assert self.variants('a-34n12') == {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'}


    def test_multiple_mutations(self):
        self.make_analyser(('ä', ['ä', 'ae']), ('ö', ['ö', 'oe']))

        assert self.variants('Längenöhr') == {'längenöhr', 'laengenöhr',
                                              'längenoehr', 'laengenoehr'}