move abbreviation computation into import phase

This adds precomputation of abbreviated terms for names and removes
abbreviation of terms in the query. The basic import works, but it
still needs thorough testing as well as speed improvements during
import.

New dependency on the Python library datrie.
Sarah Hoffmann 2021-05-28 22:06:13 +02:00
parent 6ba00e6aee
commit 8413075249
10 changed files with 665 additions and 206 deletions
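
In short, abbreviation handling moves from the PHP query side into the Python import pipeline: the new ICUNameProcessor expands every normalized name into its spelling variants once, at import time, and getorcreate_full_word() stores them in the word table, so the query tokenizer no longer applies CONST_Abbreviations. A rough sketch of the new import-side flow (the config path, the example name and the printed variants are illustrative assumptions, not values from this commit):

# Sketch only: config path, example name and resulting variants are assumptions.
from pathlib import Path

from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import (ICUNameProcessor,
                                                    ICUNameProcessorRules)

loader = ICURuleLoader(Path('settings/legacy_icu_tokenizer.yaml'))  # assumed path
proc = ICUNameProcessor(ICUNameProcessorRules(loader=loader))

norm_name = proc.get_normalized('Bauwegstraße')
variants = proc.get_variants_ascii(norm_name)
# roughly ['bauweg strasse', 'bauweg str']; these precomputed variants are
# handed to getorcreate_full_word() during import instead of being derived
# from CONST_Abbreviations at query time.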

View File

@ -47,9 +47,7 @@ class Tokenizer
private function makeStandardWord($sTerm)
{
$sNorm = ' '.$this->oTransliterator->transliterate($sTerm).' ';
return trim(str_replace(CONST_Abbreviations[0], CONST_Abbreviations[1], $sNorm));
return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
}
@ -90,6 +88,7 @@ class Tokenizer
foreach ($aPhrases as $iPhrase => $oPhrase) {
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
Debug::printVar('Phrase', $sPhrase);
if (strlen($sPhrase) > 0) {
$aWords = explode(' ', $sPhrase);
Tokenizer::addTokens($aTokens, $aWords);

View File

@ -87,25 +87,48 @@ $$ LANGUAGE SQL IMMUTABLE STRICT;
--------------- private functions ----------------------------------------------
CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
RETURNS INTEGER
CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[],
OUT full_token INT,
OUT partial_tokens INT[])
AS $$
DECLARE
return_id INTEGER;
partial_terms TEXT[] = '{}'::TEXT[];
term TEXT;
term_id INTEGER;
term_count INTEGER;
BEGIN
SELECT min(word_id), max(search_name_count) INTO return_id, term_count
FROM word WHERE word_token = lookup_term and class is null and type is null;
SELECT min(word_id) INTO full_token
FROM word WHERE word = norm_term and class is null and country_code is null;
IF return_id IS NULL THEN
return_id := nextval('seq_word');
INSERT INTO word (word_id, word_token, search_name_count)
VALUES (return_id, lookup_term, 0);
ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN
return_id := 0;
IF full_token IS NULL THEN
full_token := nextval('seq_word');
INSERT INTO word (word_id, word_token, word, search_name_count)
SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
END IF;
RETURN return_id;
FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
term := trim(term);
IF NOT (ARRAY[term] <@ partial_terms) THEN
partial_terms := partial_terms || term;
END IF;
END LOOP;
partial_tokens := '{}'::INT[];
FOR term IN SELECT unnest(partial_terms) LOOP
SELECT min(word_id), max(search_name_count) INTO term_id, term_count
FROM word WHERE word_token = term and class is null and country_code is null;
IF term_id IS NULL THEN
term_id := nextval('seq_word');
term_count := 0;
INSERT INTO word (word_id, word_token, search_name_count)
VALUES (term_id, term, 0);
END IF;
IF term_count < {{ max_word_freq }} THEN
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END IF;
END LOOP;
END;
$$
LANGUAGE plpgsql;
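
For reference, the new function can be exercised from Python the same way the analyzer does it further down in this commit; the database name and the literal terms below are made-up examples:

# Illustrative call of getorcreate_full_word() via psycopg2; dbname and the
# example terms are assumptions, not values from this commit.
import psycopg2

conn = psycopg2.connect(dbname='nominatim')
with conn.cursor() as cur:
    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                ('bauwegstrasse', ['bauweg strasse', 'bauweg str']))
    full_token, partial_tokens = cur.fetchone()
    # full_token: one word_id shared by all variants of the full name
    # partial_tokens: word_ids of the distinct partial terms
    #                 ('bauweg', 'strasse', 'str'), skipping terms whose
    #                 frequency exceeds max_word_freq
conn.close()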

View File

@ -0,0 +1,111 @@
"""
Processor for names that are imported into the database based on the
ICU library.
"""
import json
import itertools
from icu import Transliterator
import datrie
from nominatim.db.properties import set_property, get_property
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
class ICUNameProcessorRules:
""" Data object that saves the rules needed for the name processor.
The rules can either be initialised through an ICURuleLoader or
be loaded from a database when a connection is given.
"""
def __init__(self, loader=None, conn=None):
if loader is not None:
self.norm_rules = loader.get_normalization_rules()
self.trans_rules = loader.get_transliteration_rules()
self.replacements = loader.get_replacement_pairs()
self.search_rules = loader.get_search_rules()
elif conn is not None:
self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
else:
assert False, "Parameter loader or conn required."
# Compute the set of characters used in the replacement list.
# We need this later when building the replacement trie.
chars = set()
for full, repl in self.replacements:
chars.update(full)
for word in repl:
chars.update(word)
self.replacement_charset = ''.join(chars)
def save_rules(self, conn):
""" Save the rules in the property table of the given database.
The rules can be loaded again later by passing a connection to
the constructor of the class.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements))
set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
class ICUNameProcessor:
def __init__(self, rules):
self.normalizer = Transliterator.createFromRules("icu_normalization",
rules.norm_rules)
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
rules.trans_rules)
self.search = Transliterator.createFromRules("icu_search",
rules.search_rules)
self.replacements = datrie.Trie(rules.replacement_charset)
for full, repl in rules.replacements:
self.replacements[full] = repl
def get_normalized(self, name):
""" Normalize the given name, i.e. remove all elements not relevant
for search.
"""
return self.normalizer.transliterate(name)
def get_variants_ascii(self, norm_name):
""" Compute the spelling variants for the given normalized name
and transliterate the result.
"""
baseform = ' ' + norm_name + ' '
variants = ['']
startpos = 0
pos = 0
while pos < len(baseform):
full, repl = self.replacements.longest_prefix_item(baseform[pos:],
(None, None))
if full is not None:
done = baseform[startpos:pos]
variants = [v + done + r for v, r in itertools.product(variants, repl)]
startpos = pos + len(full)
pos = startpos
else:
pos += 1
if startpos == 0:
return [self.to_ascii.transliterate(norm_name)]
return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants]
def get_search_normalized(self, name):
""" Return the normalized version of the name (including transliteration)
to be applied at search time.
"""
return self.search.transliterate(name)
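
The heart of get_variants_ascii() is datrie's longest_prefix_item(), which finds the longest replaceable sequence at the current position. A stripped-down, self-contained sketch of that expansion loop with a hand-built trie (the replacement entries are examples; the final transliteration to ASCII is left out):

# Minimal sketch of the expansion loop in get_variants_ascii(), using a
# hand-built replacement trie; entries are illustrative only.
import itertools
import datrie

replacements = datrie.Trie(' abeginrstuw')          # alphabet must cover all keys
replacements['strasse '] = [' strasse ', ' str ']   # suffix: decompose + abbreviate
replacements[' saint '] = [' saint ', ' st ']       # full-word abbreviation

baseform = ' bauwegstrasse '
variants, startpos, pos = [''], 0, 0
while pos < len(baseform):
    full, repl = replacements.longest_prefix_item(baseform[pos:], (None, None))
    if full is not None:
        done = baseform[startpos:pos]
        variants = [v + done + r for v, r in itertools.product(variants, repl)]
        startpos = pos + len(full)
        pos = startpos
    else:
        pos += 1

print([(v + baseform[startpos:pos]).strip() for v in variants])
# -> ['bauweg strasse', 'bauweg str']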

View File

@ -0,0 +1,161 @@
"""
Helper class to create ICU rules from a configuration file.
"""
import io
import yaml
import logging
from collections import defaultdict
import itertools
from icu import Transliterator
from nominatim.errors import UsageError
LOG = logging.getLogger()
class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, configfile):
self.configfile = configfile
if configfile.suffix == '.yaml':
self._load_from_yaml()
else:
raise UsageError("Unknown format of tokenizer configuration.")
def get_search_rules(self):
""" Returns the ICU rules to be used during search.
The rules combine normalization, compound decomposition (including
abbreviated compounds) and transliteration.
"""
# First apply the normalization rules.
rules = io.StringIO()
rules.write(self.normalization_rules)
# For all compound suffixes: add them in their full and any abbreviated form.
suffixes = set()
for suffix in self.compound_suffixes:
suffixes.add(suffix)
suffixes.update(self.abbreviations.get(suffix, []))
for suffix in sorted(suffixes, key=lambda x:len(x), reverse=True):
rules.write("'{0} ' > ' {0} ';".format(suffix))
# Finally add transliteration.
rules.write(self.transliteration_rules)
return rules.getvalue()
def get_normalization_rules(self):
""" Return rules for normalisation of a term.
"""
return self.normalization_rules
def get_transliteration_rules(self):
""" Return the rules for converting a string into its asciii representation.
"""
return self.transliteration_rules
def get_replacement_pairs(self):
""" Returns the list of possible compound decompositions with
application of abbreviations included.
The result is a list of pairs: the first item is the sequence to
replace, the second is a list of replacements.
"""
synonyms = defaultdict(set)
for full, abbr in self.abbreviations.items():
key = ' ' + full + ' '
# Entries in the abbreviation list always apply to full words:
synonyms[key].update((' ' + a + ' ' for a in abbr))
# Replacements are optional, so add a noop
synonyms[key].add(key)
# Entries in the compound list expand to themselves and to
# abbreviations.
for suffix in self.compound_suffixes:
keyset = synonyms[suffix + ' ']
keyset.add(' ' + suffix + ' ')
keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
# The terms the entries are shortened to need to be decompounded as well.
for abbr in self.abbreviations.get(suffix, []):
synonyms[abbr + ' '].add(' ' + abbr + ' ')
# Sort the resulting list by descending length (longer matches are preferred).
sorted_keys = sorted(synonyms.keys(), key=lambda x: len(x), reverse=True)
return [(k, list(synonyms[k])) for k in sorted_keys]
def _load_from_yaml(self):
rules = yaml.load(self.configfile.read_text())
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
def _get_section(self, rules, section):
""" Get the section named 'section' from the rules. If the section does
not exist, raise a usage error with a meaningful message.
"""
if section not in rules:
LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
section, str(self.configfile))
raise UsageError("Syntax error in tokenizer configuration file.")
return rules[section]
def _cfg_to_icu_rules(self, rules, section):
""" Load an ICU ruleset from the given section. If the section is a
simple string, it is interpreted as a file name and the rules are
loaded verbatim from the given file. The filename is expected to be
relative to the tokenizer rule file. If the section is a list then
each line is assumed to be a rule. All rules are concatenated and returned.
"""
content = self._get_section(rules, section)
if isinstance(content, str):
return (self.configfile.parent / content).read_text().replace('\n', ' ')
return ';'.join(content) + ';'
def _parse_compound_suffix_list(self, rules):
if not rules:
self.compound_suffixes = set()
return
norm = Transliterator.createFromRules("rule_loader_normalization",
self.normalization_rules)
# Make sure all suffixes are in their normalised form.
self.compound_suffixes = set((norm.transliterate(s) for s in rules))
def _parse_abbreviation_list(self, rules):
self.abbreviations = defaultdict(list)
if not rules:
return
norm = Transliterator.createFromRules("rule_loader_normalization",
self.normalization_rules)
for rule in rules:
parts = rule.split('=>')
if len(parts) != 2:
LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
raise UsageError("Syntax error in tokenizer configuration file.")
# Make sure all terms match the normalised version.
fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
for full, abbr in itertools.product(fullterms, abbrterms):
self.abbreviations[full].append(abbr)
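
Each abbreviation line may list several full terms and several abbreviations; _parse_abbreviation_list() takes the cross product of the two sides. A small self-contained illustration of that expansion, using a rule from the shipped configuration and skipping the normalization step:

# How one abbreviation rule expands into the internal mapping
# (normalization via the ICU transliterator is skipped in this sketch).
import itertools
from collections import defaultdict

abbreviations = defaultdict(list)
rule = 'deutsche,deutscher,deutsches => dt'
parts = rule.split('=>')
fullterms = (t.strip() for t in parts[0].split(','))
abbrterms = (t.strip() for t in parts[1].split(','))
for full, abbr in itertools.product(fullterms, abbrterms):
    abbreviations[full].append(abbr)

print(dict(abbreviations))
# {'deutsche': ['dt'], 'deutscher': ['dt'], 'deutsches': ['dt']}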

View File

@ -18,11 +18,11 @@ import psycopg2.extras
from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
LOG = logging.getLogger()
@ -41,9 +41,9 @@ class LegacyICUTokenizer:
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.normalization = None
self.transliteration = None
self.abbreviations = None
self.naming_rules = None
self.term_normalization = None
self.max_word_frequency = None
def init_new_db(self, config, init_db=True):
@ -55,14 +55,14 @@ class LegacyICUTokenizer:
if config.TOKENIZER_CONFIG:
cfgfile = Path(config.TOKENIZER_CONFIG)
else:
cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
rules = json.loads(cfgfile.read_text())
self._load_transliteration(rules['normalization'], cfgfile.parent)
self.abbreviations = rules["abbreviations"]
self.normalization = config.TERM_NORMALIZATION
loader = ICURuleLoader(cfgfile)
self.naming_rules = ICUNameProcessorRules(loader=loader)
self.term_normalization = config.TERM_NORMALIZATION
self.max_word_frequency = config.MAX_WORD_FREQUENCY
self._install_php(config)
self._install_php(config.lib_dir.php)
self._save_config(config)
if init_db:
@ -70,19 +70,13 @@ class LegacyICUTokenizer:
self._init_db_tables(config)
def _load_transliteration(self, rules, cfg_path):
if isinstance(rules, str):
self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
else:
self.transliteration = ';'.join(rules) + ';'
def init_from_project(self):
""" Initialise the tokenizer from the project directory.
"""
with connect(self.dsn) as conn:
self.normalization = get_property(conn, DBCFG_NORMALIZATION)
self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
self.naming_rules = ICUNameProcessorRules(conn=conn)
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
def finalize_import(self, config):
@ -132,26 +126,20 @@ class LegacyICUTokenizer:
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
norm = Transliterator.createFromRules("normalizer", self.normalization)
trans = Transliterator.createFromRules("trans", self.transliteration)
return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
def _install_php(self, config):
def _install_php(self, phpdir):
""" Install the php script for the tokenizer.
"""
abbr_inverse = list(zip(*self.abbreviations))
php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent("""\
<?php
@define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{0.normalization}");
@define('CONST_Transliteration', "{0.transliteration}");
@define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
""".format(self, config,
"','".join(abbr_inverse[0]),
"','".join(abbr_inverse[1]))))
@define('CONST_Max_Word_Frequency', {0.max_word_frequency});
@define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
@define('CONST_Transliteration', "{0.naming_rules.search_rules}");
require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
""".format(self, phpdir)))
def _save_config(self, config):
@ -159,10 +147,10 @@ class LegacyICUTokenizer:
database as database properties.
"""
with connect(self.dsn) as conn:
set_property(conn, DBCFG_NORMALIZATION, self.normalization)
self.naming_rules.save_rules(conn)
set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
def _init_db_tables(self, config):
@ -178,15 +166,14 @@ class LegacyICUTokenizer:
# get partial words and their frequencies
words = Counter()
with self.name_analyzer() as analyzer:
name_proc = ICUNameProcessor(self.naming_rules)
with conn.cursor(name="words") as cur:
cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
for name, cnt in cur:
term = analyzer.make_standard_word(name)
if term:
for word in term.split():
words[word] += cnt
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
for term in word.split():
words[term] += cnt
# copy them back into the word table
copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
@ -208,12 +195,10 @@ class LegacyICUNameAnalyzer:
normalization.
"""
def __init__(self, dsn, normalizer, transliterator, abbreviations):
def __init__(self, dsn, name_proc):
self.conn = connect(dsn).connection
self.conn.autocommit = True
self.normalizer = normalizer
self.transliterator = transliterator
self.abbreviations = abbreviations
self.name_processor = name_proc
self._cache = _TokenCache()
@ -248,9 +233,9 @@ class LegacyICUNameAnalyzer:
tokens = {}
for word in words:
if word.startswith('#'):
tokens[word] = ' ' + self.make_standard_word(word[1:])
tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
else:
tokens[word] = self.make_standard_word(word)
tokens[word] = self.name_processor.get_normalized(word)
with conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
@ -263,12 +248,6 @@ class LegacyICUNameAnalyzer:
return [(k, v, ids[v]) for k, v in tokens.items()]
def normalize(self, phrase):
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return self.normalizer.transliterate(phrase)
@staticmethod
def normalize_postcode(postcode):
""" Convert the postcode to a standardized form.
@ -279,27 +258,12 @@ class LegacyICUNameAnalyzer:
return postcode.strip().upper()
@functools.lru_cache(maxsize=1024)
def make_standard_word(self, name):
""" Create the normalised version of the input.
"""
norm = ' ' + self.transliterator.transliterate(name) + ' '
for full, abbr in self.abbreviations:
if full in norm:
norm = norm.replace(full, abbr)
return norm.strip()
def _make_standard_hnr(self, hnr):
""" Create a normalised version of a housenumber.
This function takes minor shortcuts on transliteration.
"""
if hnr.isdigit():
return hnr
return self.transliterator.transliterate(hnr)
return self.name_processor.get_search_normalized(hnr)
def update_postcodes_from_db(self):
""" Update postcode tokens in the word table from the location_postcode
@ -325,7 +289,7 @@ class LegacyICUNameAnalyzer:
else:
copystr.write(postcode)
copystr.write('\t ')
copystr.write(self.transliterator.transliterate(postcode))
copystr.write(self.name_processor.get_search_normalized(postcode))
copystr.write('\tplace\tpostcode\t0\n')
if to_delete:
@ -344,7 +308,7 @@ class LegacyICUNameAnalyzer:
def update_special_phrases(self, phrases, should_replace):
""" Replace the search index for special phrases with the new phrases.
"""
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
@ -362,7 +326,7 @@ class LegacyICUNameAnalyzer:
if to_add:
copystr = io.StringIO()
for word, cls, typ, oper in to_add:
term = self.make_standard_word(word)
term = self.name_processor.get_search_normalized(word)
if term:
copystr.write(word)
copystr.write('\t ')
@ -395,15 +359,11 @@ class LegacyICUNameAnalyzer:
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
full_names = set((self.make_standard_word(n) for n in names))
full_names.discard('')
self._add_normalized_country_names(country_code, full_names)
word_tokens = set()
for name in self._compute_full_names(names):
if name:
word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
def _add_normalized_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
word_tokens = set((' ' + name for name in names))
with self.conn.cursor() as cur:
# Get existing names
cur.execute("SELECT word_token FROM word WHERE country_code = %s",
@ -429,14 +389,13 @@ class LegacyICUNameAnalyzer:
names = place.get('name')
if names:
full_names = self._compute_full_names(names)
fulls, partials = self._compute_name_tokens(names)
token_info.add_names(self.conn, full_names)
token_info.add_names(fulls, partials)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self._add_normalized_country_names(country_feature.lower(),
full_names)
self.add_country_names(country_feature.lower(), names)
address = place.get('address')
@ -449,38 +408,60 @@ class LegacyICUNameAnalyzer:
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(self.conn, self.make_standard_word(value))
token_info.add_street(*self._compute_name_tokens({'name': value}))
elif key == 'place':
token_info.add_place(self.conn, self.make_standard_word(value))
token_info.add_place(*self._compute_name_tokens({'name': value}))
elif not key.startswith('_') and \
key not in ('country', 'full'):
addr_terms.append((key, self.make_standard_word(value)))
addr_terms.append((key, *self._compute_name_tokens({'name': value})))
if hnrs:
hnrs = self._split_housenumbers(hnrs)
token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
if addr_terms:
token_info.add_address_terms(self.conn, addr_terms)
token_info.add_address_terms(addr_terms)
return token_info.data
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
dictionary of names.
"""
full_names = self._compute_full_names(names)
full_tokens = set()
partial_tokens = set()
for name in full_names:
norm_name = self.name_processor.get_normalized(name)
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
variants = self.name_processor.get_variants_ascii(norm_name)
with self.conn.cursor() as cur:
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
(norm_name, variants))
full, part = cur.fetchone()
self._cache.names[norm_name] = (full, part)
full_tokens.add(full)
partial_tokens.update(part)
return full_tokens, partial_tokens
def _compute_full_names(self, names):
""" Return the set of all full name word ids to be used with the
given dictionary of names.
"""
full_names = set()
for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
word = self.make_standard_word(name)
if word:
full_names.add(word)
full_names.add(name.strip())
brace_split = name.split('(', 2)
if len(brace_split) > 1:
word = self.make_standard_word(brace_split[0])
if word:
full_names.add(word)
brace_idx = name.find('(')
if brace_idx >= 0:
full_names.add(name[:brace_idx].strip())
return full_names
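
For orientation, this is what the name splitting above yields for a typical OSM name dictionary; the input values are invented:

# Illustrative re-run of the _compute_full_names() logic on made-up input.
import re

names = {'name': 'Hauptstraße;Main Street', 'alt_name': 'Dorfplatz (Zentrum)'}

full_names = set()
for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
    full_names.add(name.strip())
    brace_idx = name.find('(')
    if brace_idx >= 0:
        full_names.add(name[:brace_idx].strip())

print(full_names)
# {'Hauptstraße', 'Main Street', 'Dorfplatz (Zentrum)', 'Dorfplatz'}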
@ -492,7 +473,7 @@ class LegacyICUNameAnalyzer:
postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes:
term = self.make_standard_word(postcode)
term = self.name_processor.get_search_normalized(postcode)
if not term:
return
@ -508,6 +489,7 @@ class LegacyICUNameAnalyzer:
""", (' ' + term, postcode))
self._cache.postcodes.add(postcode)
@staticmethod
def _split_housenumbers(hnrs):
if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
@ -530,7 +512,7 @@ class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache):
self.cache = cache
self._cache = cache
self.data = {}
@staticmethod
@ -538,86 +520,44 @@ class _TokenInfo:
return '{%s}' % ','.join((str(s) for s in tokens))
def add_names(self, conn, names):
def add_names(self, fulls, partials):
""" Adds token information for the normalised names.
"""
# Start with all partial names
terms = set((part for ns in names for part in ns.split()))
# Add the full names
terms.update((' ' + n for n in names))
self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
def add_housenumbers(self, conn, hnrs):
""" Extract housenumber information from a list of normalised
housenumbers.
"""
self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
self.data['hnr'] = ';'.join(hnrs)
def add_street(self, conn, street):
def add_street(self, fulls, partials):
""" Add addr:street match terms.
"""
if not street:
return
term = ' ' + street
tid = self.cache.names.get(term)
if tid is None:
with conn.cursor() as cur:
cur.execute("""SELECT word_id FROM word
WHERE word_token = %s
and class is null and type is null""",
(term, ))
if cur.rowcount > 0:
tid = cur.fetchone()[0]
self.cache.names[term] = tid
if tid is not None:
self.data['street'] = '{%d}' % tid
if fulls:
self.data['street'] = self._mk_array(fulls)
def add_place(self, conn, place):
def add_place(self, fulls, partials):
""" Add addr:place search and match terms.
"""
if not place:
return
partial_ids = self.cache.get_term_tokens(conn, place.split())
tid = self.cache.get_term_tokens(conn, [' ' + place])
self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
self.data['place_match'] = '{%s}' % tid[0]
if fulls:
self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
self.data['place_match'] = self._mk_array(fulls)
def add_address_terms(self, conn, terms):
def add_address_terms(self, terms):
""" Add additional address terms.
"""
tokens = {}
for key, value in terms:
if not value:
continue
partial_ids = self.cache.get_term_tokens(conn, value.split())
term = ' ' + value
tid = self.cache.names.get(term)
if tid is None:
with conn.cursor() as cur:
cur.execute("""SELECT word_id FROM word
WHERE word_token = %s
and class is null and type is null""",
(term, ))
if cur.rowcount > 0:
tid = cur.fetchone()[0]
self.cache.names[term] = tid
tokens[key] = [self._mk_array(partial_ids),
'{%s}' % ('' if tid is None else str(tid))]
for key, fulls, partials in terms:
if fulls:
tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
self._mk_array(fulls)]
if tokens:
self.data['addr'] = tokens
@ -635,32 +575,6 @@ class _TokenCache:
self.housenumbers = {}
def get_term_tokens(self, conn, terms):
""" Get token ids for a list of terms, looking them up in the database
if necessary.
"""
tokens = []
askdb = []
for term in terms:
token = self.names.get(term)
if token is None:
askdb.append(term)
elif token != 0:
tokens.append(token)
if askdb:
with conn.cursor() as cur:
cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
(askdb, ))
for term, tid in cur:
self.names[term] = tid
if tid != 0:
tokens.append(tid)
return tokens
def get_hnr_tokens(self, conn, terms):
""" Get token ids for a list of housenumbers, looking them up in the
database if necessary.

View File

@ -404,7 +404,7 @@ class LegacyNameAnalyzer:
FROM unnest(%s)n) y
WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s))
""", (country_code, names, country_code))
""", (country_code, list(names.values()), country_code))
def process_place(self, place):
@ -422,7 +422,7 @@ class LegacyNameAnalyzer:
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self.add_country_names(country_feature.lower(), list(names.values()))
self.add_country_names(country_feature.lower(), names)
address = place.get('address')

View File

@ -272,15 +272,15 @@ def create_country_names(conn, tokenizer, languages=None):
with tokenizer.name_analyzer() as analyzer:
for code, name in cur:
names = [code]
names = {'countrycode' : code}
if code == 'gb':
names.append('UK')
names['short_name'] = 'UK'
if code == 'us':
names.append('United States')
names['short_name'] = 'United States'
# country names (only in languages as provided)
if name:
names.extend((v for k, v in name.items() if _include_key(k)))
names.update(((k, v) for k, v in name.items() if _include_key(k)))
analyzer.add_country_names(code, names)

View File

@ -0,0 +1,116 @@
normalization:
- ":: NFD ()"
- "[[:Nonspacing Mark:] [:Cf:]] >"
- ":: lower ()"
- "ß > 'ss'" # German szet is unimbigiously equal to double ss
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
transliteration: icu_transliteration.rules
compound_suffixes:
# Danish
- hal
- hallen
- hallerne
# German
- berg
- brücke
- fabrik
- gasse
- graben
- haus
- höhle
- hütte
- kapelle
- kogel
- pfad
- platz
- quelle
- spitze
- stiege
- strasse
- teich
- universität
- wald
- weg
- wiese
# Dutch
- gracht
- laan
- markt
- plein
- straat
- vliet
- weg
# Norwegian
- vei
- veien
- veg
- vegen
- gate
- gaten
- gata
- plass
- plassen
- sving
- svingen
# Finnish
- alue
- asema
- aukio
- kaari
- katu
- kuja
- kylä
- penger
- polku
- puistikko
- puisto
- raitti
- ranta
- rinne
- taival
- tie
- tori
- väylä
# Swedish
- väg
- vägen
- gatan
- gata
- gränd
- gränden
- stig
- stigen
- plats
- platsen
abbreviations:
# German
- am => a
- an der => a d
- allgemeines krankenhaus => akh
- altstoffsammelzentrum => asz
- auf der => a d
- bach => b
- bad => b
- bahnhof => bhf,bf
- berg => bg
- bezirk => bez
- brücke => br
- burg => bg
- chaussee => ch
- deutsche,deutscher,deutsches => dt
- dorf => df
- doktor => dr
- fachhochschule => fh
- Freiwillige Feuerwehr => ff
- sankt => st
- strasse => str
- weg => wg
# English
- alley => al
- beach => bch
- street => st
- road => rd
- bridge => brdg

View File

@ -0,0 +1,60 @@
"""
Tests for import name normalisation and variant generation.
"""
from textwrap import dedent
import pytest
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.errors import UsageError
@pytest.fixture
def cfgfile(tmp_path, suffix='.yaml'):
def _create_config(suffixes, abbr):
content = dedent("""\
normalization:
- ":: NFD ()"
- "[[:Nonspacing Mark:] [:Cf:]] >"
- ":: lower ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
transliteration:
- ":: Latin ()"
""")
content += "compound_suffixes:\n"
content += '\n'.join((" - " + s for s in suffixes)) + '\n'
content += "abbreviations:\n"
content += '\n'.join((" - " + s for s in abbr)) + '\n'
fpath = tmp_path / ('test_config' + suffix)
fpath.write_text(dedent(content))
return fpath
return _create_config
def test_simple_variants(cfgfile):
fpath = cfgfile(['strasse', 'straße', 'weg'],
['strasse,straße => str',
'prospekt => pr'])
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
assert set(proc.get_normalized_variants("Bauwegstraße")) \
== {'bauweg straße', 'bauweg str'}
assert proc.get_normalized_variants("Bauwegstr") == ['bauweg str']
assert proc.get_normalized_variants("holzweg") == ['holz weg']
assert proc.get_normalized_variants("hallo") == ['hallo']
def test_multiple_replacements(cfgfile):
fpath = cfgfile([], ['saint => s,st', 'street => st'])
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
assert set(proc.get_normalized_variants("Saint Johns Street")) == \
{'saint johns street', 's johns street', 'st johns street',
'saint johns st', 's johns st', 'st johns st'}

View File

@ -0,0 +1,75 @@
"""
Tests for converting a config file to ICU rules.
"""
import pytest
from textwrap import dedent
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.errors import UsageError
from icu import Transliterator
@pytest.fixture
def cfgfile(tmp_path, suffix='.yaml'):
def _create_config(suffixes, abbr):
content = dedent("""\
normalization:
- ":: NFD ()"
- "[[:Nonspacing Mark:] [:Cf:]] >"
- ":: lower ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
transliteration:
- ":: Latin ()"
""")
content += "compound_suffixes:\n"
content += '\n'.join((" - " + s for s in suffixes)) + '\n'
content += "abbreviations:\n"
content += '\n'.join((" - " + s for s in abbr)) + '\n'
fpath = tmp_path / ('test_config' + suffix)
fpath.write_text(dedent(content))
return fpath
return _create_config
def test_missing_normalization(tmp_path):
fpath = tmp_path / ('test_config.yaml')
fpath.write_text(dedent("""\
normalizatio:
- ":: NFD ()"
"""))
with pytest.raises(UsageError):
ICURuleLoader(fpath)
def test_get_search_rules(cfgfile):
fpath = cfgfile(['strasse', 'straße', 'weg'],
['strasse,straße => str',
'prospekt => pr'])
loader = ICURuleLoader(fpath)
rules = loader.get_search_rules()
trans = Transliterator.createFromRules("test", rules)
assert trans.transliterate(" Baumstraße ") == " baum straße "
assert trans.transliterate(" Baumstrasse ") == " baum strasse "
assert trans.transliterate(" Baumstr ") == " baum str "
assert trans.transliterate(" Baumwegstr ") == " baumweg str "
assert trans.transliterate(" Αθήνα ") == " athēna "
assert trans.transliterate(" проспект ") == " prospekt "
def test_get_synonym_pairs(cfgfile):
fpath = cfgfile(['Weg', 'Strasse'],
['Strasse => str,st'])
loader = ICURuleLoader(fpath)
repl = loader.get_replacement_pairs()
assert repl == [(' strasse ', {' strasse ', ' str ', ' st '}),
('strasse ', {' strasse ', ' str ', ' st '}),
('weg ', {' weg '})]