From c170d323d93f2ea63ef1c9af8ea11dbc388cbfb2 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 20 Jan 2022 23:47:20 +0100
Subject: [PATCH] add tests for cleaning housenumbers

---
Review note: the "Clean up database (reverse-only import)" CI step now runs
in /home/nominatim/data-env-reverse, the directory whose .env selects the
reverse-only database. It previously pointed at
/home/nominatim/nominatim-project, which only repeated the clean-up of the
main import. The ci-tests.yml blob id in the index line below predates this
one-line change.

 .github/workflows/ci-tests.yml       | 10 ++++-
 nominatim/clicmd/refresh.py          |  1 +
 test/python/cli/conftest.py          |  4 ++
 test/python/cli/test_cmd_refresh.py  |  5 +++
 test/python/mock_icu_word_table.py   | 13 ++++++
 test/python/tokenizer/test_icu.py    | 67 ++++++++++++++++++++++++++++
 test/python/tokenizer/test_legacy.py |  7 +++
 7 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 23d640d7..f326c3ca 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -309,12 +309,20 @@ jobs:
             NOMINATIM_REPLICATION_MAX_DIFF=1 nominatim replication --once
         working-directory: /home/nominatim/nominatim-project
 
+      - name: Clean up database
+        run: nominatim refresh --postcodes --word-tokens
+        working-directory: /home/nominatim/nominatim-project
+
       - name: Run reverse-only import
         run : |
             echo 'NOMINATIM_DATABASE_DSN="pgsql:dbname=reverse"' >> .env
             nominatim import --osm-file ../test.pbf --reverse-only --no-updates
         working-directory: /home/nominatim/data-env-reverse
 
-      - name: Check reverse import
+      - name: Check reverse-only import
         run: nominatim admin --check-database
         working-directory: /home/nominatim/data-env-reverse
+
+      - name: Clean up database (reverse-only import)
+        run: nominatim refresh --postcodes --word-tokens
+        working-directory: /home/nominatim/data-env-reverse
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index c741dcf6..b8a88b6d 100644
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -79,6 +79,7 @@ class UpdateRefresh:
                           "Postcode updates on a frozen database is not possible.")
 
         if args.word_tokens:
+            LOG.warning('Updating word tokens')
             tokenizer = self._get_tokenizer(args.config)
             tokenizer.update_word_tokens()
 
diff --git a/test/python/cli/conftest.py b/test/python/cli/conftest.py
index ea45f2a1..420740cf 100644
--- a/test/python/cli/conftest.py
+++ b/test/python/cli/conftest.py
@@ -30,6 +30,7 @@ class DummyTokenizer:
         self.update_sql_functions_called = False
         self.finalize_import_called = False
         self.update_statistics_called = False
+        self.update_word_tokens_called = False
 
     def update_sql_functions(self, *args):
         self.update_sql_functions_called = True
@@ -40,6 +41,9 @@ class DummyTokenizer:
     def update_statistics(self):
         self.update_statistics_called = True
 
+    def update_word_tokens(self):
+        self.update_word_tokens_called = True
+
 
 @pytest.fixture
 def cli_call(src_dir):
diff --git a/test/python/cli/test_cmd_refresh.py b/test/python/cli/test_cmd_refresh.py
index e6dce8b3..b6281c7a 100644
--- a/test/python/cli/test_cmd_refresh.py
+++ b/test/python/cli/test_cmd_refresh.py
@@ -39,6 +39,11 @@ class TestRefresh:
         assert self.tokenizer_mock.update_statistics_called
 
 
+    def test_refresh_word_tokens(self):
+        assert self.call_nominatim('refresh', '--word-tokens') == 0
+        assert self.tokenizer_mock.update_word_tokens_called
+
+
     def test_refresh_postcodes(self, mock_func_factory, place_table):
         func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes')
         idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes')
diff --git a/test/python/mock_icu_word_table.py b/test/python/mock_icu_word_table.py
index f5d89e4f..a7363958 100644
--- a/test/python/mock_icu_word_table.py
+++ b/test/python/mock_icu_word_table.py
@@ -58,6 +58,14 @@ class MockIcuWordTable:
         self.conn.commit()
 
 
+    def add_housenumber(self, word_id, word_token):
+        with self.conn.cursor() as cur:
+            cur.execute("""INSERT INTO word (word_id, word_token, type)
+                              VALUES (%s, %s, 'H')
+                        """, (word_id, word_token))
+        self.conn.commit()
+
+
     def count(self):
         with self.conn.cursor() as cur:
             return cur.scalar("SELECT count(*) FROM word")
@@ -68,6 +76,11 @@ class MockIcuWordTable:
             return cur.scalar("SELECT count(*) FROM word WHERE type = 'S'")
 
 
+    def count_housenumbers(self):
+        with self.conn.cursor() as cur:
+            return cur.scalar("SELECT count(*) FROM word WHERE type = 'H'")
+
+
     def get_special(self):
         with self.conn.cursor() as cur:
             cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'")
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index a3839365..372df9d2 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -9,6 +9,7 @@ Tests for ICU tokenizer.
 """
 import shutil
 import yaml
+import itertools
 
 import pytest
 
@@ -554,3 +555,69 @@ class TestPlaceAddress:
 
         assert 'addr' not in info
 
+
+class TestUpdateWordTokens:
+
+    @pytest.fixture(autouse=True)
+    def setup(self, tokenizer_factory, table_factory, placex_table, word_table):
+        table_factory('search_name', 'place_id BIGINT, name_vector INT[]')
+        self.tok = tokenizer_factory()
+
+
+    @pytest.fixture
+    def search_entry(self, temp_db_cursor):
+        place_id = itertools.count(1000)
+
+        def _insert(*args):
+            temp_db_cursor.execute("INSERT INTO search_name VALUES (%s, %s)",
+                                   (next(place_id), list(args)))
+
+        return _insert
+
+
+    @pytest.mark.parametrize('hnr', ('1a', '1234567', '34 5'))
+    def test_remove_unused_housenumbers(self, word_table, hnr):
+        word_table.add_housenumber(1000, hnr)
+
+        assert word_table.count_housenumbers() == 1
+        self.tok.update_word_tokens()
+        assert word_table.count_housenumbers() == 0
+
+
+    def test_keep_unused_numeral_housenumbers(self, word_table):
+        word_table.add_housenumber(1000, '5432')
+
+        assert word_table.count_housenumbers() == 1
+        self.tok.update_word_tokens()
+        assert word_table.count_housenumbers() == 1
+
+
+    def test_keep_housenumbers_from_search_name_table(self, word_table, search_entry):
+        word_table.add_housenumber(9999, '5432a')
+        word_table.add_housenumber(9991, '9 a')
+        search_entry(123, 9999, 34)
+
+        assert word_table.count_housenumbers() == 2
+        self.tok.update_word_tokens()
+        assert word_table.count_housenumbers() == 1
+
+
+    def test_keep_housenumbers_from_placex_table(self, word_table, placex_table):
+        word_table.add_housenumber(9999, '5432a')
+        word_table.add_housenumber(9990, '34z')
+        placex_table.add(housenumber='34z')
+        placex_table.add(housenumber='25432a')
+
+        assert word_table.count_housenumbers() == 2
+        self.tok.update_word_tokens()
+        assert word_table.count_housenumbers() == 1
+
+
+    def test_keep_housenumbers_from_placex_table_hnr_list(self, word_table, placex_table):
+        word_table.add_housenumber(9991, '9 b')
+        word_table.add_housenumber(9990, '34z')
+        placex_table.add(housenumber='9 a;9 b;9 c')
+
+        assert word_table.count_housenumbers() == 2
+        self.tok.update_word_tokens()
+        assert word_table.count_housenumbers() == 1
diff --git a/test/python/tokenizer/test_legacy.py b/test/python/tokenizer/test_legacy.py
index 4addb282..0e46f1dc 100644
--- a/test/python/tokenizer/test_legacy.py
+++ b/test/python/tokenizer/test_legacy.py
@@ -257,6 +257,13 @@ def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_
                                          search_name_count > 0""") > 0
 
 
+def test_update_word_tokens(tokenizer_factory):
+    tok = tokenizer_factory()
+
+    # This is a noop and should just pass.
+    tok.update_word_tokens()
+
+
 def test_normalize(analyzer):
     assert analyzer.normalize('TEsT') == 'test'
 