mirror of https://github.com/osm-search/Nominatim.git
synced 2024-12-25 05:52:32 +03:00

update unit tests for adapted abbreviation code

commit 2f6e4edcdb
parent 1bd9f455fc
@@ -1,7 +1,7 @@
 [MASTER]
 
 extension-pkg-whitelist=osmium
-ignored-modules=icu
+ignored-modules=icu,datrie
 
 [MESSAGES CONTROL]
 
@@ -58,6 +58,9 @@ class ICUNameProcessorRules:
 
 
 class ICUNameProcessor:
+    """ Collects the different transformation rules for normalisation of names
+        and provides the functions to aply the transformations.
+    """
 
     def __init__(self, rules):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
@@ -2,11 +2,11 @@
 Helper class to create ICU rules from a configuration file.
 """
 import io
-import yaml
 import logging
 from collections import defaultdict
 import itertools
 
+import yaml
 from icu import Transliterator
 
 from nominatim.errors import UsageError
@@ -20,6 +20,8 @@ class ICURuleLoader:
 
     def __init__(self, configfile):
         self.configfile = configfile
+        self.compound_suffixes = set()
+        self.abbreviations = defaultdict()
 
         if configfile.suffix == '.yaml':
            self._load_from_yaml()
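For orientation, a minimal standalone sketch of how the two new containers might be filled. The attribute names come from the hunk above; the sample terms and the defaultdict(list) factory are assumptions (the committed code constructs a bare defaultdict()):

    from collections import defaultdict

    # Hypothetical illustration: collect compound suffixes and map each one to
    # its abbreviated variants so later rules can expand both forms.
    compound_suffixes = set()
    abbreviations = defaultdict(list)

    compound_suffixes.update(['strasse', 'weg'])
    abbreviations['strasse'].append('str')
    abbreviations['weg'].append('wg')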
@@ -42,7 +44,7 @@ class ICURuleLoader:
             suffixes.add(suffix)
             suffixes.update(self.abbreviations.get(suffix, []))
 
-        for suffix in sorted(suffixes, key=lambda x:len(x), reverse=True):
+        for suffix in sorted(suffixes, key=len, reverse=True):
             rules.write("'{0} ' > ' {0} ';".format(suffix))
 
         # Finally add transliteration.
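Replacing the lambda with the built-in len keeps the behaviour identical; the descending-length sort matters because the generated ICU rules should match longer suffixes before shorter ones. A runnable sketch with invented suffix values:

    # Invented suffixes; shows the emission order the sort produces.
    suffixes = {'str', 'strasse', 'platz'}
    for suffix in sorted(suffixes, key=len, reverse=True):
        print("'{0} ' > ' {0} ';".format(suffix))
    # 'strasse' is emitted first, then 'platz', then 'str'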
@@ -85,7 +87,7 @@ class ICURuleLoader:
             synonyms[abbr + ' '].add(' ' + abbr + ' ')
 
         # sort the resulting list by descending length (longer matches are prefered).
-        sorted_keys = sorted(synonyms.keys(), key=lambda x: len(x), reverse=True)
+        sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
 
         return [(k, list(synonyms[k])) for k in sorted_keys]
 
@@ -3,16 +3,13 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import functools
 import io
 import itertools
-import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
-from icu import Transliterator
 import psycopg2.extras
 
 from nominatim.db.connection import connect
@@ -103,9 +100,7 @@ class LegacyICUTokenizer:
         """
         self.init_from_project()
 
-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
             return "Configuration for tokenizer 'legacy_icu' are missing."
 
         return None
@@ -320,40 +315,64 @@ class LegacyICUNameAnalyzer:
             for label, cls, typ, oper in cur:
                 existing_phrases.add((label, cls, typ, oper or '-'))
 
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.name_processor.get_search_normalized(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near') else '\\N')
-                        copystr.write('\t0\n')
-
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
-
-            if to_delete and should_replace:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                              and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+                 len(norm_phrases), added, deleted)
+
+
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
+        """
+        to_add = new_phrases - existing_phrases
+
+        copystr = io.StringIO()
+        added = 0
+        for word, cls, typ, oper in to_add:
+            term = self.name_processor.get_search_normalized(word)
+            if term:
+                copystr.write(word)
+                copystr.write('\t ')
+                copystr.write(term)
+                copystr.write('\t')
+                copystr.write(cls)
+                copystr.write('\t')
+                copystr.write(typ)
+                copystr.write('\t')
+                copystr.write(oper if oper in ('in', 'near') else '\\N')
+                copystr.write('\t0\n')
+                added += 1
+
+
+        if copystr.tell() > 0:
+            copystr.seek(0)
+            cursor.copy_from(copystr, 'word',
+                             columns=['word', 'word_token', 'class', 'type',
+                                      'operator', 'search_name_count'])
+
+        return added
+
+
+    def _remove_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the databse that are no longer in the
+            new phrase list.
+        """
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            psycopg2.extras.execute_values(
+                cursor,
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE word = name and class = in_class and type = in_type
+                          and ((op = '-' and operator is null) or op = operator)""",
+                to_delete)
+
+        return len(to_delete)
+
+
     def add_country_names(self, country_code, names):
|
|||||||
return full_tokens, partial_tokens
|
return full_tokens, partial_tokens
|
||||||
|
|
||||||
|
|
||||||
def _compute_full_names(self, names):
|
@staticmethod
|
||||||
|
def _compute_full_names(names):
|
||||||
""" Return the set of all full name word ids to be used with the
|
""" Return the set of all full name word ids to be used with the
|
||||||
given dictionary of names.
|
given dictionary of names.
|
||||||
"""
|
"""
|
||||||
@@ -534,7 +554,7 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, fulls, partials):
+    def add_street(self, fulls, _):
         """ Add addr:street match terms.
         """
         if fulls:
@@ -260,7 +260,9 @@ def test_update_special_phrase_modify(analyzer, word_table, make_standard_name):
 
 
 def test_add_country_names(analyzer, word_table, make_standard_name):
-    analyzer.add_country_names('de', ['Germany', 'Deutschland', 'germany'])
+    analyzer.add_country_names('de', {'name': 'Germany',
+                                      'name:de': 'Deutschland',
+                                      'short_name': 'germany'})
 
     assert word_table.get_country() \
            == {('de', ' #germany#'),
@@ -272,7 +274,7 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name):
     word_table.add_country('it', ' #italy#')
     word_table.add_country('it', ' #itala#')
 
-    analyzer.add_country_names('it', ['Italy', 'IT'])
+    analyzer.add_country_names('it', {'name': 'Italy', 'ref': 'IT'})
 
     assert word_table.get_country() \
            == {('fr', ' #france#'),
@@ -212,14 +212,14 @@ def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_t
 def test_update_special_phrase_empty_table(analyzer, word_table):
     with analyzer() as anl:
         anl.update_special_phrases([
             ("König bei", "amenity", "royal", "near"),
-            ("Könige", "amenity", "royal", "-"),
+            ("Könige ", "amenity", "royal", "-"),
             ("street", "highway", "primary", "in")
         ], True)
 
     assert word_table.get_special() \
-           == {(' KÖNIG BEI', 'könig bei', 'amenity', 'royal', 'near'),
-               (' KÖNIGE', 'könige', 'amenity', 'royal', None),
+           == {(' KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
+               (' KÖNIGE', 'Könige', 'amenity', 'royal', None),
                (' STREET', 'street', 'highway', 'primary', 'in')}
 
 
@@ -180,7 +180,7 @@ def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cur
 
     assert len(tokenizer.analyser_cache['countries']) == 2
 
-    result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
+    result_set = {k: set(v.values()) for k, v in tokenizer.analyser_cache['countries']}
 
     if languages:
         assert result_set == {'us' : set(('us', 'us1', 'United States')),
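The switch to set(v.values()) suggests the analyser cache now keeps the raw name dictionary per country instead of a flat list of names. A sketch of the assumed shape; the dictionary keys are hypothetical, only the values come from the assertion above:

    # Hypothetical cache entry; keys are made up, values match the test expectation.
    analyser_cache = {'countries': [
        ('us', {'name': 'United States', 'ref': 'us', 'short_name': 'us1'}),
    ]}

    result_set = {k: set(v.values()) for k, v in analyser_cache['countries']}
    assert result_set == {'us': {'us', 'us1', 'United States'}}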