Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default

Make ICU tokenizer the default
This commit is contained in:
Sarah Hoffmann 2022-05-11 08:52:49 +02:00 committed by GitHub
commit 5ff35d9984
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 80 additions and 63 deletions

View File

@ -5,6 +5,10 @@ inputs:
description: 'Version of Ubuntu to install on'
required: false
default: '20'
cmake-args:
description: 'Additional options to hand to cmake'
required: false
default: ''
runs:
using: "composite"
@ -21,18 +25,13 @@ runs:
shell: bash
env:
UBUNTUVER: ${{ inputs.ubuntu }}
- name: Download dependencies
run: |
if [ ! -f country_grid.sql.gz ]; then
wget --no-verbose https://www.nominatim.org/data/country_grid.sql.gz
fi
cp country_grid.sql.gz Nominatim/data/country_osm_grid.sql.gz
shell: bash
CMAKE_ARGS: ${{ inputs.cmake-args }}
- name: Configure
run: mkdir build && cd build && cmake ../Nominatim
run: mkdir build && cd build && cmake $CMAKE_ARGS ../Nominatim
shell: bash
env:
CMAKE_ARGS: ${{ inputs.cmake-args }}
- name: Build
run: |

View File

@ -22,7 +22,7 @@ runs:
- name: Install PostgreSQL
run: |
sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} postgresql-server-dev-${PGVER}
sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER}
shell: bash
env:
PGVER: ${{ inputs.postgresql-version }}

View File

@ -113,19 +113,9 @@ jobs:
working-directory: Nominatim/test/bdd
icu-test:
legacy-test:
needs: create-archive
strategy:
matrix:
ubuntu: [20]
include:
- ubuntu: 20
postgresql: 13
postgis: 3
pytest: py.test-3
php: 7.4
runs-on: ubuntu-${{ matrix.ubuntu }}.04
runs-on: ubuntu-20.04
steps:
- uses: actions/download-artifact@v2
@ -138,35 +128,27 @@ jobs:
- name: Setup PHP
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php }}
coverage: xdebug
tools: phpunit, phpcs, composer
- uses: actions/setup-python@v2
with:
python-version: 3.6
if: matrix.ubuntu == 18
php-version: 7.4
- uses: ./Nominatim/.github/actions/setup-postgresql
with:
postgresql-version: ${{ matrix.postgresql }}
postgis-version: ${{ matrix.postgis }}
postgresql-version: 13
postgis-version: 3
- name: Install Postgresql server dev
run: sudo apt-get install postgresql-server-dev-13
- uses: ./Nominatim/.github/actions/build-nominatim
with:
ubuntu: ${{ matrix.ubuntu }}
ubuntu: 20
cmake-args: -DBUILD_MODULE=on
- name: Install test prerequsites
run: sudo apt-get install -y -qq python3-behave
if: matrix.ubuntu == 20
- name: Install test prerequsites
run: pip3 install behave==1.2.6
if: matrix.ubuntu == 18
- name: BDD tests (icu tokenizer)
- name: BDD tests (legacy tokenizer)
run: |
behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy --format=progress3
working-directory: Nominatim/test/bdd

View File

@ -44,7 +44,7 @@ endif()
set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
set(BUILD_API on CACHE BOOL "Build everything for the API server")
set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module")
set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
set(BUILD_TESTS on CACHE BOOL "Build test suite")
set(BUILD_DOCS on CACHE BOOL "Build documentation")
set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")

View File

@ -158,6 +158,17 @@ make
sudo make install
```
!!! warning
The default installation no longer compiles the PostgreSQL module that
is needed for the legacy tokenizer from older Nominatim versions. If you
are upgrading an older database or want to run the
[legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for
some other reason, you need to enable the PostgreSQL module via
cmake: `cmake -DBUILD_MODULE=on ../Nominatim`. To compile the module
you need to have the server development headers for PostgreSQL installed.
On Ubuntu/Debian run: `sudo apt install postgresql-server-dev-<postgresql version>`
Nominatim installs itself into `/usr/local` per default. To choose a different
installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the
cmake command. Make sure that the `bin` directory is available in your path

View File

@ -17,6 +17,14 @@ breaking changes. **Please read them before running the migration.**
## 4.0.0 -> master
### ICU tokenizer is the new default
Nominatim now installs the [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer)
by default. This only has an effect on newly installed databases. When
updating older databases, it keeps its installed tokenizer. If you still
run with the legacy tokenizer, make sure to compile Nominatim with the
PostgreSQL module, see [Installation](Installation.md#building-nominatim).
### geocodejson output changed
The `type` field of the geocodejson output has changed. It now contains

View File

@ -19,7 +19,22 @@ they can be configured.
The legacy tokenizer implements the analysis algorithms of older Nominatim
versions. It uses a special Postgresql module to normalize names and queries.
This tokenizer is currently the default.
This tokenizer is automatically installed and used when upgrading an older
database. It should not be used for new installations anymore.
### Compiling the PostgreSQL module
The tokeinzer needs a special C module for PostgreSQL which is not compiled
by default. If you need the legacy tokenizer, compile Nominatim as follows:
```
mkdir build
cd build
cmake -DBUILD_MODULE=on
make
```
### Enabling the tokenizer
To enable the tokenizer add the following line to your project configuration:
@ -47,6 +62,7 @@ normalization functions are hard-coded.
The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
normalize names and queries. It also offers configurable decomposition and
abbreviation handling.
This tokenizer is currently the default.
To enable the tokenizer add the following line to your project configuration:

View File

@ -187,7 +187,7 @@ class Configuration:
if configfile.suffix in ('.yaml', '.yml'):
result = self._load_from_yaml(configfile)
elif configfile.suffix == '.json':
with configfile.open('r') as cfg:
with configfile.open('r', encoding='utf-8') as cfg:
result = json.load(cfg)
else:
raise UsageError(f"Config file '{configfile}' has unknown format.")

View File

@ -187,7 +187,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
def _save_config(self):

View File

@ -255,7 +255,7 @@ class LegacyTokenizer(AbstractTokenizer):
@define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
""".format(config)))
""".format(config)), encoding='utf-8')
def _init_db_tables(self, config):

View File

@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH=
# Tokenizer used for normalizing and parsing queries and names.
# The tokenizer is set up during import and cannot be changed afterwards
# without a reimport.
# Currently available tokenizers: legacy
NOMINATIM_TOKENIZER="legacy"
# Currently available tokenizers: icu, legacy
NOMINATIM_TOKENIZER="icu"
# Number of occurrences of a word before it is considered frequent.
# Similar to the concept of stop words. Frequent partial words get ignored

View File

@ -59,5 +59,5 @@ def after_scenario(context, scenario):
def before_tag(context, tag):
if tag == 'fail-legacy':
if context.config.userdata['TOKENIZER'] in (None, 'legacy'):
if context.config.userdata['TOKENIZER'] == 'legacy':
context.scenario.skip("Not implemented in legacy tokenizer")

View File

@ -207,7 +207,7 @@ class NominatimEnvironment:
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
self.run_nominatim('freeze')
if self.tokenizer != 'icu':
if self.tokenizer == 'legacy':
phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
else:

View File

@ -266,7 +266,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
plist.sort()
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
if nctx.tokenizer == 'icu':
if nctx.tokenizer != 'legacy':
cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
(plist,))
else:

View File

@ -211,11 +211,6 @@ def osmline_table(temp_db_with_extensions, table_factory):
country_code VARCHAR(2)""")
@pytest.fixture
def word_table(temp_db_conn):
return mocks.MockWordTable(temp_db_conn)
@pytest.fixture
def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions):
table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))

View File

@ -14,7 +14,7 @@ import psycopg2.extras
from nominatim.db import properties
# This must always point to the mock word table for the default tokenizer.
from mock_legacy_word_table import MockLegacyWordTable as MockWordTable
from mock_icu_word_table import MockIcuWordTable as MockWordTable
class MockPlacexTable:
""" A placex table for testing.

View File

@ -179,7 +179,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory, w
@pytest.mark.parametrize("threads", (1, 5))
def test_load_data(dsn, place_row, placex_table, osmline_table,
word_table, temp_db_cursor, threads):
temp_db_cursor, threads):
for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'):
temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT)
RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL

View File

@ -14,6 +14,8 @@ from nominatim.tools import migration
from nominatim.errors import UsageError
import nominatim.version
from mock_legacy_word_table import MockLegacyWordTable
class DummyTokenizer:
def update_sql_functions(self, config):
@ -26,6 +28,10 @@ def postprocess_mock(monkeypatch):
monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db',
lambda *args: DummyTokenizer())
@pytest.fixture
def legacy_word_table(temp_db_conn):
return MockLegacyWordTable(temp_db_conn)
def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config):
table_factory('country_name', 'name HSTORE, country_code TEXT')
@ -156,7 +162,7 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor,
def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor,
word_table, placex_table):
legacy_word_table, placex_table):
placex_table.add(housenumber='3A')
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)

View File

@ -65,7 +65,7 @@ def tokenizer():
return dummy_tokenizer.DummyTokenizer(None, None)
@pytest.fixture
def postcode_table(temp_db_conn, placex_table, word_table):
def postcode_table(temp_db_conn, placex_table):
return MockPostcodeTable(temp_db_conn)

View File

@ -25,10 +25,10 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
libboost-filesystem-dev libexpat1-dev zlib1g-dev\
libbz2-dev libpq-dev libproj-dev \
postgresql-server-dev-10 postgresql-10-postgis-2.4 \
postgresql-10-postgis-2.4 \
postgresql-contrib-10 postgresql-10-postgis-scripts \
php php-pgsql php-intl libicu-dev python3-pip \
python3-psutil python3-jinja2 python3-icu git
python3-psutil python3-jinja2 python3-yaml python3-icu git
# Some of the Python packages that come with Ubuntu 18.04 are too old, so
# install the latest version from pip:

View File

@ -24,11 +24,11 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
libboost-filesystem-dev libexpat1-dev zlib1g-dev \
libbz2-dev libpq-dev libproj-dev \
postgresql-server-dev-12 postgresql-12-postgis-3 \
postgresql-12-postgis-3 \
postgresql-contrib-12 postgresql-12-postgis-3-scripts \
php php-pgsql php-intl libicu-dev python3-dotenv \
python3-psycopg2 python3-psutil python3-jinja2 \
python3-icu python3-datrie git
python3-icu python3-datrie python3-yaml git
#
# System Configuration