mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-12-24 13:31:37 +03:00
adapt unit test for new word table
Requires a second wrapper class for the word table with the new layout. This class is interface-compatible, so that later when the ICU tokenizer becomes the default, all tests that depend on behaviour of the default tokenizer can be switched to the other wrapper.
This commit is contained in:
parent
eb6814d74e
commit
e42878eeda
@ -1,5 +1,5 @@
|
||||
DROP TABLE IF EXISTS word;
|
||||
CREATE TABLE word_icu (
|
||||
CREATE TABLE word (
|
||||
word_id INTEGER,
|
||||
word_token text NOT NULL,
|
||||
type text NOT NULL,
|
||||
|
@ -102,8 +102,8 @@ BEGIN
|
||||
|
||||
IF full_token IS NULL THEN
|
||||
full_token := nextval('seq_word');
|
||||
INSERT INTO word (word_id, word_token, info)
|
||||
SELECT full_token, lookup_term,
|
||||
INSERT INTO word (word_id, word_token, type, info)
|
||||
SELECT full_token, lookup_term, 'W',
|
||||
json_build_object('word', norm_term, 'count', 0)
|
||||
FROM unnest(lookup_terms) as lookup_term;
|
||||
END IF;
|
||||
@ -123,8 +123,8 @@ BEGIN
|
||||
IF term_id IS NULL THEN
|
||||
term_id := nextval('seq_word');
|
||||
term_count := 0;
|
||||
INSERT INTO word (word_id, word_token, info)
|
||||
VALUES (term_id, term, json_build_object('count', term_count));
|
||||
INSERT INTO word (word_id, word_token, type, info)
|
||||
VALUES (term_id, term, 'w', json_build_object('count', term_count));
|
||||
END IF;
|
||||
|
||||
IF term_count < {{ max_word_freq }} THEN
|
||||
|
@ -236,17 +236,17 @@ class LegacyICUNameAnalyzer:
|
||||
partial_tokens[word] = self.name_processor.get_search_normalized(word)
|
||||
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""(SELECT word_token, word_id
|
||||
FROM word WHERE word_token = ANY(%s) and type = 'W')
|
||||
UNION
|
||||
(SELECT word_token, word_id
|
||||
FROM word WHERE word_token = ANY(%s) and type = 'w')""",
|
||||
(list(full_tokens.values()),
|
||||
list(partial_tokens.values())))
|
||||
ids = {r[0]: r[1] for r in cur}
|
||||
cur.execute("""SELECT word_token, word_id
|
||||
FROM word WHERE word_token = ANY(%s) and type = 'W'
|
||||
""", (list(full_tokens.values()),))
|
||||
full_ids = {r[0]: r[1] for r in cur}
|
||||
cur.execute("""SELECT word_token, word_id
|
||||
FROM word WHERE word_token = ANY(%s) and type = 'w'""",
|
||||
(list(partial_tokens.values()),))
|
||||
part_ids = {r[0]: r[1] for r in cur}
|
||||
|
||||
return [(k, v, ids.get(v, None)) for k, v in full_tokens.items()] \
|
||||
+ [(k, v, ids.get(v, None)) for k, v in partial_tokens.items()]
|
||||
return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
|
||||
+ [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
|
||||
|
||||
|
||||
@staticmethod
|
||||
@ -508,7 +508,7 @@ class LegacyICUNameAnalyzer:
|
||||
FROM (VALUES (%s)) as v(pc)
|
||||
WHERE NOT EXISTS
|
||||
(SELECT * FROM word
|
||||
WHERE type = 'P' and info->>postcode = pc))
|
||||
WHERE type = 'P' and info->>'postcode' = pc))
|
||||
""", (term, postcode))
|
||||
self._cache.postcodes.add(postcode)
|
||||
|
||||
|
84
test/python/mock_icu_word_table.py
Normal file
84
test/python/mock_icu_word_table.py
Normal file
@ -0,0 +1,84 @@
|
||||
"""
|
||||
ICU word table for testing with functions to prefill and test contents
|
||||
of the table.
|
||||
"""
|
||||
|
||||
class MockIcuWordTable:
    """ A word table for testing using the new ICU word-table layout
        (word_id, word_token, type, info jsonb).

        Interface-compatible with MockLegacyWordTable, so tests written
        against one tokenizer's table can be switched to the other.
    """
    def __init__(self, conn):
        """ Create the ICU-layout word table on the given DB connection.

            The connection is kept for the lifetime of the object and
            used by all other methods.
        """
        self.conn = conn
        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE word (word_id INTEGER,
                                              word_token text NOT NULL,
                                              type text NOT NULL,
                                              info jsonb)""")

        conn.commit()

    def add_special(self, word_token, word, cls, typ, oper):
        """ Insert a special phrase (type 'S'); word, class, type and
            operator are stored in the info JSON.
        """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, type, info)
                              VALUES (%s, 'S',
                                      json_build_object('word', %s,
                                                        'class', %s,
                                                        'type', %s,
                                                        'op', %s))
                        """, (word_token, word, cls, typ, oper))
        self.conn.commit()


    def add_country(self, country_code, word_token):
        """ Insert a country name token (type 'C'); the country code
            goes into info->'cc'.
        """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, type, info)
                           VALUES(%s, 'C', json_build_object('cc', %s))""",
                        (word_token, country_code))
        self.conn.commit()


    def add_postcode(self, word_token, postcode):
        """ Insert a postcode token (type 'P'); the postcode goes into
            info->'postcode'.
        """
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, type, info)
                              VALUES (%s, 'P', json_build_object('postcode', %s))
                        """, (word_token, postcode))
        self.conn.commit()


    def count(self):
        """ Return the total number of rows in the word table. """
        with self.conn.cursor() as cur:
            # scalar() is provided by the project's cursor wrapper,
            # not by plain psycopg2 — presumably returns the single value.
            return cur.scalar("SELECT count(*) FROM word")


    def count_special(self):
        """ Return the number of special-phrase rows (type 'S'). """
        with self.conn.cursor() as cur:
            return cur.scalar("SELECT count(*) FROM word WHERE type = 'S'")


    def get_special(self):
        """ Return all special phrases as a set of
            (token, word, class, type, op) tuples.

            Asserts that the table contains no duplicate entries.
        """
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token, info FROM word WHERE type = 'S'")
            result = set(((row[0], row[1]['word'], row[1]['class'],
                           row[1]['type'], row[1]['op']) for row in cur))
            assert len(result) == cur.rowcount, "Word table has duplicates."
        return result


    def get_country(self):
        """ Return all country tokens as a set of (cc, token) tuples.

            Asserts that the table contains no duplicate entries.
        """
        with self.conn.cursor() as cur:
            cur.execute("SELECT info->>'cc', word_token FROM word WHERE type = 'C'")
            result = set((tuple(row) for row in cur))
            assert len(result) == cur.rowcount, "Word table has duplicates."
        return result


    def get_postcodes(self):
        """ Return the set of all postcodes in the table. """
        with self.conn.cursor() as cur:
            cur.execute("SELECT info->>'postcode' FROM word WHERE type = 'P'")
            return set((row[0] for row in cur))


    def get_partial_words(self):
        """ Return all partial-word tokens (type 'w') as a set of
            (token, count) tuples, count taken from info->'count'.
        """
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token, info FROM word WHERE type ='w'")
            return set(((row[0], row[1]['count']) for row in cur))
|
||||
|
86
test/python/mock_legacy_word_table.py
Normal file
86
test/python/mock_legacy_word_table.py
Normal file
@ -0,0 +1,86 @@
|
||||
"""
|
||||
Legacy word table for testing with functions to prefill and test contents
|
||||
of the table.
|
||||
"""
|
||||
|
||||
class MockLegacyWordTable:
    """ Test helper wrapping a word table in the legacy layout
        (word/class/type/country_code/search_name_count columns).
    """

    def __init__(self, conn):
        """ Set up the legacy word table on the given connection. """
        self.conn = conn
        with conn.cursor() as cursor:
            cursor.execute("""CREATE TABLE word (word_id INTEGER,
                                                 word_token text,
                                                 word text,
                                                 class text,
                                                 type text,
                                                 country_code varchar(2),
                                                 search_name_count INTEGER,
                                                 operator TEXT)""")

        conn.commit()

    def add_special(self, word_token, word, cls, typ, oper):
        """ Add a row describing a special phrase. """
        with self.conn.cursor() as cursor:
            cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
                              VALUES (%s, %s, %s, %s, %s)
                           """, (word_token, word, cls, typ, oper))
        self.conn.commit()


    def add_country(self, country_code, word_token):
        """ Add a row describing a country name token. """
        with self.conn.cursor() as cursor:
            cursor.execute("INSERT INTO word (word_token, country_code) VALUES(%s, %s)",
                           (word_token, country_code))
        self.conn.commit()


    def add_postcode(self, word_token, postcode):
        """ Add a row describing a postcode (class 'place'/type 'postcode'). """
        with self.conn.cursor() as cursor:
            cursor.execute("""INSERT INTO word (word_token, word, class, type)
                              VALUES (%s, %s, 'place', 'postcode')
                           """, (word_token, postcode))
        self.conn.commit()


    def count(self):
        """ Number of rows currently in the word table. """
        with self.conn.cursor() as cursor:
            # scalar() is a convenience of the project's cursor wrapper.
            return cursor.scalar("SELECT count(*) FROM word")


    def count_special(self):
        """ Number of special-phrase rows (anything not class 'place'). """
        with self.conn.cursor() as cursor:
            return cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")


    def get_special(self):
        """ All special phrases as (token, word, class, type, operator)
            tuples; fails if the table contains duplicates.
        """
        with self.conn.cursor() as cursor:
            cursor.execute("""SELECT word_token, word, class, type, operator
                              FROM word WHERE class != 'place'""")
            phrases = {tuple(row) for row in cursor}
            assert len(phrases) == cursor.rowcount, "Word table has duplicates."
        return phrases


    def get_country(self):
        """ All country tokens as (country_code, token) tuples; fails if
            the table contains duplicates.
        """
        with self.conn.cursor() as cursor:
            cursor.execute("""SELECT country_code, word_token
                              FROM word WHERE country_code is not null""")
            countries = {tuple(row) for row in cursor}
            assert len(countries) == cursor.rowcount, "Word table has duplicates."
        return countries


    def get_postcodes(self):
        """ The set of all postcodes stored in the table. """
        with self.conn.cursor() as cursor:
            cursor.execute("""SELECT word FROM word
                              WHERE class = 'place' and type = 'postcode'""")
            return {row[0] for row in cursor}

    def get_partial_words(self):
        """ All partial-word rows as (token, search_name_count) tuples.

            Partial words are rows without class/country that do not start
            with a blank (full words carry a leading space in this layout).
        """
        with self.conn.cursor() as cursor:
            cursor.execute("""SELECT word_token, search_name_count FROM word
                              WHERE class is null and country_code is null
                                    and not word_token like ' %'""")
            return {tuple(row) for row in cursor}
|
||||
|
@ -7,6 +7,9 @@ import psycopg2.extras
|
||||
|
||||
from nominatim.db import properties
|
||||
|
||||
# This must always point to the mock word table for the default tokenizer.
|
||||
from mock_legacy_word_table import MockLegacyWordTable as MockWordTable
|
||||
|
||||
class MockParamCapture:
|
||||
""" Mock that records the parameters with which a function was called
|
||||
as well as the number of calls.
|
||||
@ -24,88 +27,6 @@ class MockParamCapture:
|
||||
return self.return_value
|
||||
|
||||
|
||||
class MockWordTable:
    """ A word table for testing.

        Creates and fills a word table in the legacy tokenizer layout
        (word/class/type/country_code/search_name_count/operator columns).
    """
    def __init__(self, conn):
        # Keep the connection for use by all helper methods below.
        self.conn = conn
        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE word (word_id INTEGER,
                                              word_token text,
                                              word text,
                                              class text,
                                              type text,
                                              country_code varchar(2),
                                              search_name_count INTEGER,
                                              operator TEXT)""")

        conn.commit()

    def add_special(self, word_token, word, cls, typ, oper):
        # Insert a special-phrase row (class/type/operator columns set).
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, word, class, type, operator)
                              VALUES (%s, %s, %s, %s, %s)
                        """, (word_token, word, cls, typ, oper))
        self.conn.commit()


    def add_country(self, country_code, word_token):
        # Insert a country-name row (identified by a non-null country_code).
        with self.conn.cursor() as cur:
            cur.execute("INSERT INTO word (word_token, country_code) VALUES(%s, %s)",
                        (word_token, country_code))
        self.conn.commit()


    def add_postcode(self, word_token, postcode):
        # Postcodes are stored with fixed class 'place' and type 'postcode'.
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, word, class, type)
                              VALUES (%s, %s, 'place', 'postcode')
                        """, (word_token, postcode))
        self.conn.commit()


    def count(self):
        # Total number of rows in the word table.
        # scalar() comes from the project's cursor wrapper, not psycopg2.
        with self.conn.cursor() as cur:
            return cur.scalar("SELECT count(*) FROM word")


    def count_special(self):
        # Special phrases are all rows whose class is not 'place'.
        with self.conn.cursor() as cur:
            return cur.scalar("SELECT count(*) FROM word WHERE class != 'place'")


    def get_special(self):
        # Return special phrases as (token, word, class, type, operator)
        # tuples and assert the table has no duplicate entries.
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word, class, type, operator
                           FROM word WHERE class != 'place'""")
            result = set((tuple(row) for row in cur))
            assert len(result) == cur.rowcount, "Word table has duplicates."
        return result


    def get_country(self):
        # Return country tokens as (country_code, token) tuples and
        # assert the table has no duplicate entries.
        with self.conn.cursor() as cur:
            cur.execute("""SELECT country_code, word_token
                           FROM word WHERE country_code is not null""")
            result = set((tuple(row) for row in cur))
            assert len(result) == cur.rowcount, "Word table has duplicates."
        return result


    def get_postcodes(self):
        # Return the set of all stored postcodes.
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word FROM word
                           WHERE class = 'place' and type = 'postcode'""")
            return set((row[0] for row in cur))

    def get_partial_words(self):
        # Partial words have no class/country and, in the legacy layout,
        # no leading blank (full words start with ' ').
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, search_name_count FROM word
                           WHERE class is null and country_code is null
                                 and not word_token like ' %'""")
            return set((tuple(row) for row in cur))
|
||||
|
||||
|
||||
class MockPlacexTable:
|
||||
""" A placex table for testing.
|
||||
"""
|
||||
|
@ -11,6 +11,12 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
|
||||
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
||||
from nominatim.db import properties
|
||||
|
||||
from mock_icu_word_table import MockIcuWordTable
|
||||
|
||||
@pytest.fixture
|
||||
def word_table(temp_db_conn):
|
||||
return MockIcuWordTable(temp_db_conn)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_config(def_config, tmp_path):
|
||||
@ -21,8 +27,8 @@ def test_config(def_config, tmp_path):
|
||||
sqldir.mkdir()
|
||||
(sqldir / 'tokenizer').mkdir()
|
||||
(sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'")
|
||||
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
|
||||
str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
|
||||
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
|
||||
str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
|
||||
|
||||
def_config.lib_dir.sql = sqldir
|
||||
|
||||
@ -88,12 +94,14 @@ DECLARE
|
||||
term_count INTEGER;
|
||||
BEGIN
|
||||
SELECT min(word_id) INTO full_token
|
||||
FROM word WHERE word = norm_term and class is null and country_code is null;
|
||||
FROM word WHERE info->>'word' = norm_term and type = 'W';
|
||||
|
||||
IF full_token IS NULL THEN
|
||||
full_token := nextval('seq_word');
|
||||
INSERT INTO word (word_id, word_token, word, search_name_count)
|
||||
SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
|
||||
INSERT INTO word (word_id, word_token, type, info)
|
||||
SELECT full_token, lookup_term, 'W',
|
||||
json_build_object('word', norm_term, 'count', 0)
|
||||
FROM unnest(lookup_terms) as lookup_term;
|
||||
END IF;
|
||||
|
||||
FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
|
||||
@ -105,18 +113,18 @@ BEGIN
|
||||
|
||||
partial_tokens := '{}'::INT[];
|
||||
FOR term IN SELECT unnest(partial_terms) LOOP
|
||||
SELECT min(word_id), max(search_name_count) INTO term_id, term_count
|
||||
FROM word WHERE word_token = term and class is null and country_code is null;
|
||||
SELECT min(word_id), max(info->>'count') INTO term_id, term_count
|
||||
FROM word WHERE word_token = term and type = 'w';
|
||||
|
||||
IF term_id IS NULL THEN
|
||||
term_id := nextval('seq_word');
|
||||
term_count := 0;
|
||||
INSERT INTO word (word_id, word_token, search_name_count)
|
||||
VALUES (term_id, term, 0);
|
||||
INSERT INTO word (word_id, word_token, type, info)
|
||||
VALUES (term_id, term, 'w', json_build_object('count', term_count));
|
||||
END IF;
|
||||
|
||||
IF NOT (ARRAY[term_id] <@ partial_tokens) THEN
|
||||
partial_tokens := partial_tokens || term_id;
|
||||
partial_tokens := partial_tokens || term_id;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END;
|
||||
@ -232,14 +240,14 @@ def test_update_special_phrase_empty_table(analyzer, word_table):
|
||||
], True)
|
||||
|
||||
assert word_table.get_special() \
|
||||
== {(' KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
|
||||
(' KÖNIGE', 'Könige', 'amenity', 'royal', None),
|
||||
(' STREET', 'street', 'highway', 'primary', 'in')}
|
||||
== {('KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
|
||||
('KÖNIGE', 'Könige', 'amenity', 'royal', None),
|
||||
('STREET', 'street', 'highway', 'primary', 'in')}
|
||||
|
||||
|
||||
def test_update_special_phrase_delete_all(analyzer, word_table):
|
||||
word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in')
|
||||
word_table.add_special(' BAR', 'bar', 'highway', 'road', None)
|
||||
word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in')
|
||||
word_table.add_special('BAR', 'bar', 'highway', 'road', None)
|
||||
|
||||
assert word_table.count_special() == 2
|
||||
|
||||
@ -250,8 +258,8 @@ def test_update_special_phrase_delete_all(analyzer, word_table):
|
||||
|
||||
|
||||
def test_update_special_phrases_no_replace(analyzer, word_table):
|
||||
word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in')
|
||||
word_table.add_special(' BAR', 'bar', 'highway', 'road', None)
|
||||
word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in')
|
||||
word_table.add_special('BAR', 'bar', 'highway', 'road', None)
|
||||
|
||||
assert word_table.count_special() == 2
|
||||
|
||||
@ -262,8 +270,8 @@ def test_update_special_phrases_no_replace(analyzer, word_table):
|
||||
|
||||
|
||||
def test_update_special_phrase_modify(analyzer, word_table):
|
||||
word_table.add_special(' FOO', 'foo', 'amenity', 'prison', 'in')
|
||||
word_table.add_special(' BAR', 'bar', 'highway', 'road', None)
|
||||
word_table.add_special('FOO', 'foo', 'amenity', 'prison', 'in')
|
||||
word_table.add_special('BAR', 'bar', 'highway', 'road', None)
|
||||
|
||||
assert word_table.count_special() == 2
|
||||
|
||||
@ -275,25 +283,25 @@ def test_update_special_phrase_modify(analyzer, word_table):
|
||||
], True)
|
||||
|
||||
assert word_table.get_special() \
|
||||
== {(' PRISON', 'prison', 'amenity', 'prison', 'in'),
|
||||
(' BAR', 'bar', 'highway', 'road', None),
|
||||
(' GARDEN', 'garden', 'leisure', 'garden', 'near')}
|
||||
== {('PRISON', 'prison', 'amenity', 'prison', 'in'),
|
||||
('BAR', 'bar', 'highway', 'road', None),
|
||||
('GARDEN', 'garden', 'leisure', 'garden', 'near')}
|
||||
|
||||
|
||||
def test_add_country_names_new(analyzer, word_table):
|
||||
with analyzer() as anl:
|
||||
anl.add_country_names('es', {'name': 'Espagña', 'name:en': 'Spain'})
|
||||
|
||||
assert word_table.get_country() == {('es', ' ESPAGÑA'), ('es', ' SPAIN')}
|
||||
assert word_table.get_country() == {('es', 'ESPAGÑA'), ('es', 'SPAIN')}
|
||||
|
||||
|
||||
def test_add_country_names_extend(analyzer, word_table):
|
||||
word_table.add_country('ch', ' SCHWEIZ')
|
||||
word_table.add_country('ch', 'SCHWEIZ')
|
||||
|
||||
with analyzer() as anl:
|
||||
anl.add_country_names('ch', {'name': 'Schweiz', 'name:fr': 'Suisse'})
|
||||
|
||||
assert word_table.get_country() == {('ch', ' SCHWEIZ'), ('ch', ' SUISSE')}
|
||||
assert word_table.get_country() == {('ch', 'SCHWEIZ'), ('ch', 'SUISSE')}
|
||||
|
||||
|
||||
class TestPlaceNames:
|
||||
@ -307,6 +315,7 @@ class TestPlaceNames:
|
||||
|
||||
def expect_name_terms(self, info, *expected_terms):
|
||||
tokens = self.analyzer.get_word_token_info(expected_terms)
|
||||
print (tokens)
|
||||
for token in tokens:
|
||||
assert token[2] is not None, "No token for {0}".format(token)
|
||||
|
||||
@ -316,7 +325,7 @@ class TestPlaceNames:
|
||||
def test_simple_names(self):
|
||||
info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}})
|
||||
|
||||
self.expect_name_terms(info, '#Soft bAr', '#34','Soft', 'bAr', '34')
|
||||
self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
|
||||
|
||||
|
||||
@pytest.mark.parametrize('sep', [',' , ';'])
|
||||
@ -339,7 +348,7 @@ class TestPlaceNames:
|
||||
'country_feature': 'no'})
|
||||
|
||||
self.expect_name_terms(info, '#norge', 'norge')
|
||||
assert word_table.get_country() == {('no', ' NORGE')}
|
||||
assert word_table.get_country() == {('no', 'NORGE')}
|
||||
|
||||
|
||||
class TestPlaceAddress:
|
||||
|
Loading…
Reference in New Issue
Block a user