Mirror of https://github.com/osm-search/Nominatim.git, synced 2024-11-23 05:35:13 +03:00

Merge pull request #2585 from lonvia/name-mutations

Introduce character mutations to token analysis

Commit a7e048484b
@@ -99,6 +99,9 @@ token-analysis:
          - words:
              - road -> rd
              - bridge -> bdge,br,brdg,bri,brg
      mutations:
          - pattern: 'ä'
            replacements: ['ä', 'ae']
```

The configuration file contains four sections:
@@ -205,12 +208,11 @@ the `analyzer` parameter must be set. Currently there is only one implementation

##### Generic token analyzer

The generic analyzer is able to create variants from a list of given
abbreviation and decomposition replacements. It takes one optional parameter
`variants` which lists the replacements to apply. If the section is
omitted, then the generic analyzer becomes a simple analyzer that only
applies the transliteration.
abbreviation and decomposition replacements and introduce spelling variations.

The variants section defines lists of replacements which create alternative

###### Variants

The optional 'variants' section defines lists of replacements which create alternative
spellings of a name. To create the variants, a name is scanned from left to
right and the longest matching replacement is applied until the end of the
string is reached.

@@ -296,6 +298,32 @@ decomposition has an effect here on the source as well. So a rule

means that for a word like `hauptstrasse` four variants are created:
`hauptstrasse`, `haupt strasse`, `hauptstr` and `haupt str`.
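
To make the scanning rule concrete, here is a minimal sketch of the
left-to-right, longest-match substitution described above (illustrative only;
`apply_longest_match` is a made-up helper, and the real implementation works
on a precomputed trie and emits all replacement combinations):

```python
def apply_longest_match(name, replacements):
    """ Scan name left to right and apply the longest matching
        replacement at each position. replacements: (source, target) pairs.
    """
    targets = dict(replacements)
    out = ''
    pos = 0
    while pos < len(name):
        # longest source string matching at the current position, if any
        match = max((src for src in targets if name.startswith(src, pos)),
                    key=len, default=None)
        if match is None:
            out += name[pos]
            pos += 1
        else:
            out += targets[match]
            pos += len(match)
    return out

assert apply_longest_match('hauptstrasse', [('strasse', 'str')]) == 'hauptstr'
```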

###### Mutations

The 'mutations' section in the configuration describes an additional set of
replacements to be applied after the variants have been computed.

Each mutation is described by two parameters: `pattern` and `replacements`.
The pattern must contain a single regular expression to search for in the
variant name. The regular expressions need to follow the syntax for
[Python regular expressions](https://docs.python.org/3/library/re.html#regular-expression-syntax).
Capturing groups are not permitted.
`replacements` must contain a list of strings that the pattern
should be replaced with. Each occurrence of the pattern is replaced with
all given replacements. Be mindful of the combinatorial explosion of variants.
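
As an illustration of this combinatorial behaviour, the following sketch
(a hypothetical helper, mirroring what `MutationVariantGenerator` in
`nominatim/tokenizer/token_analysis/generic_mutation.py` below implements)
expands every occurrence of a pattern with every replacement:

```python
import itertools
import re

def mutate(name, pattern, replacements):
    """ Yield all variants of name with every occurrence of pattern
        substituted by every replacement: n**k variants for k matches.
    """
    parts = re.split(pattern, name)
    for seps in itertools.product(replacements, repeat=len(parts) - 1):
        yield ''.join(p + s
                      for p, s in itertools.zip_longest(parts, seps, fillvalue=''))

# 'ä' occurs twice and there are two replacements, so 2**2 = 4 variants:
assert set(mutate('längenäcker', 'ä', ['ä', 'ae'])) \
    == {'längenäcker', 'längenaecker', 'laengenäcker', 'laengenaecker'}
```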

###### Modes

The generic analyser supports a special mode `variant-only`. When this mode
is enabled, the analyser consumes the input token and emits only the variants
(if any exist). Enable the mode by adding:

```
mode: variant-only
```

to the analyser configuration.

### Reconfiguration

Changing the configuration after the import is currently not possible, although

nominatim/tokenizer/token_analysis/config_variants.py (new file, 134 lines)

@@ -0,0 +1,134 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Parser for configuration for variants.
"""
from collections import defaultdict, namedtuple
import itertools
import re

from icu import Transliterator

from nominatim.config import flatten_config_list
from nominatim.errors import UsageError

ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])


def get_variant_config(rules, normalization_rules):
    """ Convert the variant definition from the configuration into
        replacement sets.

        Returns a tuple containing the replacement set and the list of characters
        used in the replacements.
    """
    immediate = defaultdict(list)
    chars = set()

    if rules:
        vset = set()
        rules = flatten_config_list(rules, 'variants')

        vmaker = _VariantMaker(normalization_rules)

        for section in rules:
            for rule in (section.get('words') or []):
                vset.update(vmaker.compute(rule))

        # Intermediate reorder by source. Also compute required character set.
        for variant in vset:
            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                replstr = variant.replacement[:-1]
            else:
                replstr = variant.replacement
            immediate[variant.source].append(replstr)
            chars.update(variant.source)

    return list(immediate.items()), ''.join(chars)


class _VariantMaker:
    """ Generator for all necessary ICUVariants from a single variant rule.

        All text in rules is normalized to make sure the variants match later.
    """

    def __init__(self, norm_rules):
        self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                   norm_rules)


    def compute(self, rule):
        """ Generator for all ICUVariant tuples from a single variant rule.
        """
        parts = re.split(r'(\|)?([=-])>', rule)
        if len(parts) != 4:
            raise UsageError("Syntax error in variant rule: " + rule)

        decompose = parts[1] is None
        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
        repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))

        # If the source should be kept, add a 1:1 replacement
        if parts[2] == '-':
            for src in src_terms:
                if src:
                    for froms, tos in _create_variants(*src, src[0], decompose):
                        yield ICUVariant(froms, tos)

        for src, repl in itertools.product(src_terms, repl_terms):
            if src and repl:
                for froms, tos in _create_variants(*src, repl, decompose):
                    yield ICUVariant(froms, tos)


    def _parse_variant_word(self, name):
        name = name.strip()
        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
            raise UsageError("Invalid variant word descriptor '{}'".format(name))
        norm_name = self.norm.transliterate(match.group(2)).strip()
        if not norm_name:
            return None

        return norm_name, match.group(1), match.group(3)

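# The padding strings below refer to the '^ <name> ^' baseform built in the
# generic analyzer: '^' anchors a term at the leading marker, '$' at the
# trailing marker, and '' requires only a plain word boundary.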
_FLAG_MATCH = {'^': '^ ',
               '$': ' ^',
               '': ' '}


def _create_variants(src, preflag, postflag, repl, decompose):
    if preflag == '~':
        postfix = _FLAG_MATCH[postflag]
        # suffix decomposition
        src = src + postfix
        repl = repl + postfix

        yield src, repl
        yield ' ' + src, ' ' + repl

        if decompose:
            yield src, ' ' + repl
            yield ' ' + src, repl
    elif postflag == '~':
        # prefix decomposition
        prefix = _FLAG_MATCH[preflag]
        src = prefix + src
        repl = prefix + repl

        yield src, repl
        yield src + ' ', repl + ' '

        if decompose:
            yield src, repl + ' '
            yield src + ' ', repl
    else:
        prefix = _FLAG_MATCH[preflag]
        postfix = _FLAG_MATCH[postflag]

        yield prefix + src + postfix, prefix + repl + postfix
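
As a usage sketch for the parser above (assuming PyICU and the Nominatim
package are installed; the normalization rule string is a stripped-down
illustration, not the production rule set):

```python
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config

NORM_RULES = ":: lower (); [[:Punctuation:][:Space:]]+ > ' ';"

replacements, chars = get_variant_config([{'words': ['road -> rd']}], NORM_RULES)
# The '-' rule keeps the source, so ' road ' maps to itself and the
# abbreviation, e.g. [(' road ', [' road', ' rd'])] (list order may vary);
# chars collects the characters used in the source strings.
```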

@@ -7,145 +7,44 @@
"""
Generic processor for names that creates abbreviation variants.
"""
from collections import defaultdict, namedtuple
import itertools
import re

from icu import Transliterator
import datrie

from nominatim.config import flatten_config_list
from nominatim.errors import UsageError
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator

### Configuration section

ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])


def configure(rules, normalization_rules):
    """ Extract and preprocess the configuration for this module.
    """
    config = {}

    config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'),
    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
                                                                  normalization_rules)
    config['variant_only'] = rules.get('mode', '') == 'variant-only'

    # parse mutation rules
    config['mutations'] = []
    for rule in rules.get('mutations', []):
        if 'pattern' not in rule:
            raise UsageError("Missing field 'pattern' in mutation configuration.")
        if not isinstance(rule['pattern'], str):
            raise UsageError("Field 'pattern' in mutation configuration "
                             "must be a simple text field.")
        if 'replacements' not in rule:
            raise UsageError("Missing field 'replacements' in mutation configuration.")
        if not isinstance(rule['replacements'], list):
            raise UsageError("Field 'replacements' in mutation configuration "
                             "must be a list of texts.")

        config['mutations'].append((rule['pattern'], rule['replacements']))

    return config
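
For reference, a sketch of what `configure()` consumes and produces (values
are illustrative; the normalization rule string is a minimal stand-in):

```python
from nominatim.tokenizer.token_analysis.generic import configure

NORMALIZATION_RULES = ":: lower ();"

rules = {'analyzer': 'generic',
         'mode': 'variant-only',
         'mutations': [{'pattern': 'ä', 'replacements': ['ä', 'ae']}]}

config = configure(rules, NORMALIZATION_RULES)
assert config['variant_only']
assert config['mutations'] == [('ä', ['ä', 'ae'])]
```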

(The remainder of this hunk deletes the old private copies of
`_get_variant_config`, `_VariantMaker`, `_FLAG_MATCH` and `_create_variants`
from generic.py. Apart from the public name and the extended docstring of
`get_variant_config`, the removed lines are identical to the code moved into
config_variants.py above.)

### Analysis section

def create(transliterator, config):

@@ -171,19 +70,43 @@ class GenericTokenAnalysis:
        else:
            self.replacements = None

        # set up mutation rules
        self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]


    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
        """
        variants = self._generate_word_variants(norm_name)

        for mutation in self.mutations:
            variants = mutation.generate(variants)

        return [name for name in self._transliterate_unique_list(norm_name, variants) if name]


    def _transliterate_unique_list(self, norm_name, iterable):
        seen = set()
        if self.variant_only:
            seen.add(norm_name)

        for variant in map(str.strip, iterable):
            if variant not in seen:
                seen.add(variant)
                yield self.to_ascii.transliterate(variant).strip()


    def _generate_word_variants(self, norm_name):
        baseform = '^ ' + norm_name + ' ^'
        baselen = len(baseform)
        partials = ['']

        startpos = 0
        if self.replacements is not None:
            pos = 0
            force_space = False
            while pos < len(baseform):
            while pos < baselen:
                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                                   (None, None))
                if full is not None:
@@ -207,24 +130,9 @@ class GenericTokenAnalysis:

        # No variants detected? Fast return.
        if startpos == 0:
            if self.variant_only:
                return []
            return (norm_name, )

            trans_name = self.to_ascii.transliterate(norm_name).strip()
            return [trans_name] if trans_name else []
        if startpos < baselen:
            return (part[1:] + baseform[startpos:-1] for part in partials)

        return self._compute_result_set(partials, baseform[startpos:],
                                        norm_name if self.variant_only else '')


    def _compute_result_set(self, partials, prefix, exclude):
        results = set()

        for variant in partials:
            vname = (variant + prefix)[1:-1].strip()
            if vname != exclude:
                trans_name = self.to_ascii.transliterate(vname).strip()
                if trans_name:
                    results.add(trans_name)

        return list(results)
        return (part[1:-1] for part in partials)

nominatim/tokenizer/token_analysis/generic_mutation.py (new file, 56 lines)

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Creator for mutation variants for the generic token analysis.
"""
import itertools
import logging
import re

from nominatim.errors import UsageError

LOG = logging.getLogger()

def _zigzag(outer, inner):
    return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
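# For example, _zigzag(['a', 'b', 'c'], ['-', '+']) yields
# 'a', '-', 'b', '+', 'c', '', so ''.join(...) gives 'a-b+c'.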


class MutationVariantGenerator:
    """ Generates name variants by applying a regular expression to the name
        and replacing it with one or more variants. When the regular expression
        matches more than once, each occurrence is replaced with all replacement
        patterns.
    """

    def __init__(self, pattern, replacements):
        self.pattern = re.compile(pattern)
        self.replacements = replacements

        if self.pattern.groups > 0:
            LOG.fatal("The mutation pattern %s contains a capturing group. "
                      "This is not allowed.", pattern)
            raise UsageError("Bad mutation pattern in configuration.")


    def generate(self, names):
        """ Generator function for the name variants. 'names' is an iterable
            over a set of names for which the variants are to be generated.
        """
        for name in names:
            parts = self.pattern.split(name)
            if len(parts) == 1:
                yield name
            else:
                for seps in self._fillers(len(parts)):
                    yield ''.join(_zigzag(parts, seps))


    def _fillers(self, num_parts):
        """ Returns a generator for strings to join the given number of string
            parts in all possible combinations.
        """
        return itertools.product(self.replacements, repeat=num_parts - 1)
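
A small usage sketch of the class above (assuming the module is importable;
the names and values are illustrative):

```python
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator

gen = MutationVariantGenerator('ä', ['ä', 'ae'])
assert set(gen.generate(['bär', 'spätzle'])) == {'bär', 'baer',
                                                 'spätzle', 'spaetzle'}
```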
@@ -59,6 +59,13 @@ token-analysis:
      mode: variant-only
      variants:
          - !include icu-rules/variants-de.yaml
      mutations:
          - pattern: ä
            replacements: ["ä", "ae"]
          - pattern: ö
            replacements: ["ö", "oe"]
          - pattern: ü
            replacements: ["ü", "ue"]
    - id: el
      analyzer: generic
      mode: variant-only
@@ -58,3 +58,48 @@ Feature: Import and search of names
         | រាជធានីភ្នំពេញ |
         | 東京都 |
         | ပုဗ္ဗသီရိ |


    Scenario: German umlauts can be found when expanded
        Given the places
         | osm | class | type | name+name:de |
         | N1  | place | city | Münster      |
         | N2  | place | city | Köln         |
         | N3  | place | city | Gräfenroda   |
        When importing
        When sending search query "münster"
        Then results contain
         | osm |
         | N1 |
        When sending search query "muenster"
        Then results contain
         | osm |
         | N1 |
        When sending search query "munster"
        Then results contain
         | osm |
         | N1 |
        When sending search query "Köln"
        Then results contain
         | osm |
         | N2 |
        When sending search query "Koeln"
        Then results contain
         | osm |
         | N2 |
        When sending search query "Koln"
        Then results contain
         | osm |
         | N2 |
        When sending search query "gräfenroda"
        Then results contain
         | osm |
         | N3 |
        When sending search query "graefenroda"
        Then results contain
         | osm |
         | N3 |
        When sending search query "grafenroda"
        Then results contain
         | osm |
         | N3 |
@@ -0,0 +1,89 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for generic token analysis, mutation part.
"""
import pytest

from icu import Transliterator

import nominatim.tokenizer.token_analysis.generic as module
from nominatim.errors import UsageError

DEFAULT_NORMALIZATION = """ '🜳' > ' ';
                            [[:Nonspacing Mark:] [:Cf:]] >;
                            :: lower ();
                            [[:Punctuation:][:Space:]]+ > ' '
                        """

DEFAULT_TRANSLITERATION = """ :: Latin ();
                              '🜵' > ' ';
                          """

class TestMutationNoVariants:

    def make_analyser(self, *mutations):
        rules = { 'analyzer': 'generic',
                  'mutations': [ {'pattern': m[0], 'replacements': m[1]}
                                 for m in mutations]
                }
        config = module.configure(rules, DEFAULT_NORMALIZATION)
        trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)

        self.analysis = module.create(trans, config)


    def variants(self, name):
        norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
        return set(self.analysis.get_variants_ascii(norm.transliterate(name).strip()))


    @pytest.mark.parametrize('pattern', ('(capture)', ['a list']))
    def test_bad_pattern(self, pattern):
        with pytest.raises(UsageError):
            self.make_analyser((pattern, ['b']))


    @pytest.mark.parametrize('replacements', (None, 'a string'))
    def test_bad_replacement(self, replacements):
        with pytest.raises(UsageError):
            self.make_analyser(('a', replacements))


    def test_simple_replacement(self):
        self.make_analyser(('a', ['b']))

        assert self.variants('none') == {'none'}
        assert self.variants('abba') == {'bbbb'}
        assert self.variants('2 aar') == {'2 bbr'}


    def test_multichar_replacement(self):
        self.make_analyser(('1 1', ['1 1 1']))

        assert self.variants('1 1456') == {'1 1 1456'}
        assert self.variants('1 1 1') == {'1 1 1 1'}


    def test_removement_replacement(self):
        self.make_analyser((' ', [' ', '']))

        assert self.variants('A 345') == {'a 345', 'a345'}
        assert self.variants('a g b') == {'a g b', 'ag b', 'a gb', 'agb'}


    def test_regex_pattern(self):
        self.make_analyser(('[^a-z]+', ['XXX', ' ']))

        assert self.variants('a-34n12') == {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'}


    def test_multiple_mutations(self):
        self.make_analyser(('ä', ['ä', 'ae']), ('ö', ['ö', 'oe']))

        assert self.variants('Längenöhr') == {'längenöhr', 'laengenöhr',
                                              'längenoehr', 'laengenoehr'}