Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default

Make ICU tokenizer the default
2024-12-25 05:52:32 +03:00 · 2022-05-11 08:52:49 +02:00 · 2022-05-11 08:52:49 +02:00 · 5ff35d9984
commit 5ff35d9984
parent b332b1ae23 c6a426a885
21 changed files with 80 additions and 63 deletions
--- a/.github/actions/build-nominatim/action.yml
+++ b/.github/actions/build-nominatim/action.yml
@ -5,6 +5,10 @@ inputs:
        description: 'Version of Ubuntu to install on'
        required: false
        default: '20'
+    cmake-args:
+        description: 'Additional options to hand to cmake'
+        required: false
+        default: ''

 runs:
    using: "composite"
@ -21,18 +25,13 @@ runs:
          shell: bash
          env:
            UBUNTUVER: ${{ inputs.ubuntu }}
-
-        - name: Download dependencies
-          run: |
-              if [ ! -f country_grid.sql.gz ]; then
-                  wget --no-verbose https://www.nominatim.org/data/country_grid.sql.gz
-              fi
-              cp country_grid.sql.gz Nominatim/data/country_osm_grid.sql.gz
-          shell: bash
+            CMAKE_ARGS: ${{ inputs.cmake-args }}

        - name: Configure
-          run: mkdir build && cd build && cmake ../Nominatim
+          run: mkdir build && cd build && cmake $CMAKE_ARGS ../Nominatim
          shell: bash
+          env:
+            CMAKE_ARGS: ${{ inputs.cmake-args }}

        - name: Build
          run: |
--- a/.github/actions/setup-postgresql/action.yml
+++ b/.github/actions/setup-postgresql/action.yml
@ -22,7 +22,7 @@ runs:

        - name: Install PostgreSQL
          run: |
-              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} postgresql-server-dev-${PGVER}
+              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER}
          shell: bash
          env:
              PGVER: ${{ inputs.postgresql-version }}
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@ -113,19 +113,9 @@ jobs:
              working-directory: Nominatim/test/bdd


-    icu-test:
+    legacy-test:
        needs: create-archive
-        strategy:
-            matrix:
-                ubuntu: [20]
-                include:
-                    - ubuntu: 20
-                      postgresql: 13
-                      postgis: 3
-                      pytest: py.test-3
-                      php: 7.4
-
-        runs-on: ubuntu-${{ matrix.ubuntu }}.04
+        runs-on: ubuntu-20.04

        steps:
            - uses: actions/download-artifact@v2
@ -138,35 +128,27 @@ jobs:
            - name: Setup PHP
              uses: shivammathur/setup-php@v2
              with:
-                  php-version: ${{ matrix.php }}
-                  coverage: xdebug
-                  tools: phpunit, phpcs, composer
-
-            - uses: actions/setup-python@v2
-              with:
-                python-version: 3.6
-              if: matrix.ubuntu == 18
+                  php-version: 7.4

            - uses: ./Nominatim/.github/actions/setup-postgresql
              with:
-                  postgresql-version: ${{ matrix.postgresql }}
-                  postgis-version: ${{ matrix.postgis }}
+                  postgresql-version: 13
+                  postgis-version: 3
+
+            - name: Install Postgresql server dev
+              run: sudo apt-get install -y -qq postgresql-server-dev-13  # headers needed for -DBUILD_MODULE=on; -y so CI never waits on a prompt

            - uses: ./Nominatim/.github/actions/build-nominatim
              with:
-                  ubuntu: ${{ matrix.ubuntu }}
+                  ubuntu: 20
+                  cmake-args: -DBUILD_MODULE=on

            - name: Install test prerequsites
              run: sudo apt-get install -y -qq python3-behave
-              if: matrix.ubuntu == 20

-            - name: Install test prerequsites
-              run: pip3 install behave==1.2.6
-              if: matrix.ubuntu == 18
-
-            - name: BDD tests (icu tokenizer)
+            - name: BDD tests (legacy tokenizer)
              run: |
-                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
+                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy --format=progress3
              working-directory: Nominatim/test/bdd


--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -44,7 +44,7 @@ endif()

 set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
 set(BUILD_API on CACHE BOOL "Build everything for the API server")
-set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module")
+set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
 set(BUILD_TESTS on CACHE BOOL "Build test suite")
 set(BUILD_DOCS on CACHE BOOL "Build documentation")
 set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
--- a/docs/admin/Installation.md
+++ b/docs/admin/Installation.md
@ -158,6 +158,17 @@ make
 sudo make install
 ```

+!!! warning
+    The default installation no longer compiles the PostgreSQL module that
+    is needed for the legacy tokenizer from older Nominatim versions. If you
+    are upgrading an older database or want to run the
+    [legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for
+    some other reason, you need to enable the PostgreSQL module via
+    cmake: `cmake -DBUILD_MODULE=on ../Nominatim`. To compile the module
+    you need to have the server development headers for PostgreSQL installed.
+    On Ubuntu/Debian run: `sudo apt install postgresql-server-dev-<postgresql version>`
+
+
 Nominatim installs itself into `/usr/local` per default. To choose a different
 installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the
 cmake command. Make sure that the `bin` directory is available in your path
--- a/docs/admin/Migration.md
+++ b/docs/admin/Migration.md
@ -17,6 +17,14 @@ breaking changes. **Please read them before running the migration.**

 ## 4.0.0 -> master

+### ICU tokenizer is the new default
+
+Nominatim now installs the [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer)
+by default. This only has an effect on newly installed databases. When
+updating an older database, it keeps its installed tokenizer. If you still
+run with the legacy tokenizer, make sure to compile Nominatim with the
+PostgreSQL module, see [Installation](Installation.md#building-nominatim).
+
 ### geocodejson output changed

 The `type` field of the geocodejson output has changed. It now contains
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@ -19,7 +19,22 @@ they can be configured.

 The legacy tokenizer implements the analysis algorithms of older Nominatim
 versions. It uses a special Postgresql module to normalize names and queries.
-This tokenizer is currently the default.
+This tokenizer is automatically installed and used when upgrading an older
+database. It should not be used for new installations anymore.
+
+### Compiling the PostgreSQL module
+
+The tokenizer needs a special C module for PostgreSQL which is not compiled
+by default. If you need the legacy tokenizer, compile Nominatim as follows:
+
+```
+mkdir build
+cd build
+cmake -DBUILD_MODULE=on ../Nominatim
+make
+```
+
+### Enabling the tokenizer

 To enable the tokenizer add the following line to your project configuration:

@ -47,6 +62,7 @@ normalization functions are hard-coded.
 The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
 normalize names and queries. It also offers configurable decomposition and
 abbreviation handling.
+This tokenizer is currently the default.

 To enable the tokenizer add the following line to your project configuration:

--- a/nominatim/config.py
+++ b/nominatim/config.py
@ -187,7 +187,7 @@ class Configuration:
        if configfile.suffix in ('.yaml', '.yml'):
            result = self._load_from_yaml(configfile)
        elif configfile.suffix == '.json':
-            with configfile.open('r') as cfg:
+            with configfile.open('r', encoding='utf-8') as cfg:
                result = json.load(cfg)
        else:
            raise UsageError(f"Config file '{configfile}' has unknown format.")
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@ -187,7 +187,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self):
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@ -255,7 +255,7 @@ class LegacyTokenizer(AbstractTokenizer):
                @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
                @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
                require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                """.format(config)))
+                """.format(config)), encoding='utf-8')


    def _init_db_tables(self, config):
--- a/settings/env.defaults
+++ b/settings/env.defaults
@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH=
 # Tokenizer used for normalizing and parsing queries and names.
 # The tokenizer is set up during import and cannot be changed afterwards
 # without a reimport.
-# Currently available tokenizers: legacy
-NOMINATIM_TOKENIZER="legacy"
+# Currently available tokenizers: icu, legacy
+NOMINATIM_TOKENIZER="icu"

 # Number of occurrences of a word before it is considered frequent.
 # Similar to the concept of stop words. Frequent partial words get ignored
--- a/test/bdd/environment.py
+++ b/test/bdd/environment.py
@ -59,5 +59,5 @@ def after_scenario(context, scenario):

 def before_tag(context, tag):
    if tag == 'fail-legacy':
-        if context.config.userdata['TOKENIZER'] in (None, 'legacy'):
+        if context.config.userdata['TOKENIZER'] == 'legacy':
            context.scenario.skip("Not implemented in legacy tokenizer")
--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@ -207,7 +207,7 @@ class NominatimEnvironment:
                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
                    self.run_nominatim('freeze')

-                    if self.tokenizer != 'icu':
+                    if self.tokenizer == 'legacy':
                        phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
                        run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                    else:
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@ -266,7 +266,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
    plist.sort()

    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        if nctx.tokenizer == 'icu':
+        if nctx.tokenizer != 'legacy':
            cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                        (plist,))
        else:
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@ -211,11 +211,6 @@ def osmline_table(temp_db_with_extensions, table_factory):
                     country_code VARCHAR(2)""")


-@pytest.fixture
-def word_table(temp_db_conn):
-    return mocks.MockWordTable(temp_db_conn)
-
-
@pytest.fixture
 def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions):
    table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
--- a/test/python/mocks.py
+++ b/test/python/mocks.py
@ -14,7 +14,7 @@ import psycopg2.extras
 from nominatim.db import properties

 # This must always point to the mock word table for the default tokenizer.
-from mock_legacy_word_table import MockLegacyWordTable as MockWordTable
+from mock_icu_word_table import MockIcuWordTable as MockWordTable

 class MockPlacexTable:
    """ A placex table for testing.
--- a/test/python/tools/test_database_import.py
+++ b/test/python/tools/test_database_import.py
@ -179,7 +179,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory, w

@pytest.mark.parametrize("threads", (1, 5))
 def test_load_data(dsn, place_row, placex_table, osmline_table,
-                   word_table, temp_db_cursor, threads):
+                   temp_db_cursor, threads):
    for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'):
        temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT)
                                  RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
--- a/test/python/tools/test_migration.py
+++ b/test/python/tools/test_migration.py
@ -14,6 +14,8 @@ from nominatim.tools import migration
 from nominatim.errors import UsageError
 import nominatim.version

+from mock_legacy_word_table import MockLegacyWordTable
+
 class DummyTokenizer:

    def update_sql_functions(self, config):
@ -26,6 +28,10 @@ def postprocess_mock(monkeypatch):
    monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db',
                        lambda *args: DummyTokenizer())

+@pytest.fixture
+def legacy_word_table(temp_db_conn):
+    return MockLegacyWordTable(temp_db_conn)
+

 def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config):
    table_factory('country_name', 'name HSTORE, country_code TEXT')
@ -156,7 +162,7 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor,


 def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor,
-                                            word_table, placex_table):
+                                            legacy_word_table, placex_table):
    placex_table.add(housenumber='3A')

    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
--- a/test/python/tools/test_postcodes.py
+++ b/test/python/tools/test_postcodes.py
@ -65,7 +65,7 @@ def tokenizer():
    return dummy_tokenizer.DummyTokenizer(None, None)

@pytest.fixture
-def postcode_table(temp_db_conn, placex_table, word_table):
+def postcode_table(temp_db_conn, placex_table):
    return MockPostcodeTable(temp_db_conn)


--- a/vagrant/Install-on-Ubuntu-18.sh
+++ b/vagrant/Install-on-Ubuntu-18.sh
@ -25,10 +25,10 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
    sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                        libboost-filesystem-dev libexpat1-dev zlib1g-dev\
                        libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-10 postgresql-10-postgis-2.4 \
+                        postgresql-10-postgis-2.4 \
                        postgresql-contrib-10 postgresql-10-postgis-scripts \
                        php php-pgsql php-intl libicu-dev python3-pip \
-                        python3-psutil python3-jinja2 python3-icu git
+                        python3-psutil python3-jinja2 python3-yaml python3-icu git

 # Some of the Python packages that come with Ubuntu 18.04 are too old, so
 # install the latest version from pip:
--- a/vagrant/Install-on-Ubuntu-20.sh
+++ b/vagrant/Install-on-Ubuntu-20.sh
@ -24,11 +24,11 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
    sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                        libboost-filesystem-dev libexpat1-dev zlib1g-dev \
                        libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-12 postgresql-12-postgis-3 \
+                        postgresql-12-postgis-3 \
                        postgresql-contrib-12 postgresql-12-postgis-3-scripts \
                        php php-pgsql php-intl libicu-dev python3-dotenv \
                        python3-psycopg2 python3-psutil python3-jinja2 \
-                        python3-icu python3-datrie git
+                        python3-icu python3-datrie python3-yaml git

 #
 # System Configuration