mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-12-25 14:02:12 +03:00
Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default
Make ICU tokenizer the default
This commit is contained in:
commit
5ff35d9984
17
.github/actions/build-nominatim/action.yml
vendored
17
.github/actions/build-nominatim/action.yml
vendored
@ -5,6 +5,10 @@ inputs:
|
||||
description: 'Version of Ubuntu to install on'
|
||||
required: false
|
||||
default: '20'
|
||||
cmake-args:
|
||||
description: 'Additional options to hand to cmake'
|
||||
required: false
|
||||
default: ''
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
@ -21,18 +25,13 @@ runs:
|
||||
shell: bash
|
||||
env:
|
||||
UBUNTUVER: ${{ inputs.ubuntu }}
|
||||
|
||||
- name: Download dependencies
|
||||
run: |
|
||||
if [ ! -f country_grid.sql.gz ]; then
|
||||
wget --no-verbose https://www.nominatim.org/data/country_grid.sql.gz
|
||||
fi
|
||||
cp country_grid.sql.gz Nominatim/data/country_osm_grid.sql.gz
|
||||
shell: bash
|
||||
CMAKE_ARGS: ${{ inputs.cmake-args }}
|
||||
|
||||
- name: Configure
|
||||
run: mkdir build && cd build && cmake ../Nominatim
|
||||
run: mkdir build && cd build && cmake $CMAKE_ARGS ../Nominatim
|
||||
shell: bash
|
||||
env:
|
||||
CMAKE_ARGS: ${{ inputs.cmake-args }}
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
|
2
.github/actions/setup-postgresql/action.yml
vendored
2
.github/actions/setup-postgresql/action.yml
vendored
@ -22,7 +22,7 @@ runs:
|
||||
|
||||
- name: Install PostgreSQL
|
||||
run: |
|
||||
sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} postgresql-server-dev-${PGVER}
|
||||
sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER}
|
||||
shell: bash
|
||||
env:
|
||||
PGVER: ${{ inputs.postgresql-version }}
|
||||
|
42
.github/workflows/ci-tests.yml
vendored
42
.github/workflows/ci-tests.yml
vendored
@ -113,19 +113,9 @@ jobs:
|
||||
working-directory: Nominatim/test/bdd
|
||||
|
||||
|
||||
icu-test:
|
||||
legacy-test:
|
||||
needs: create-archive
|
||||
strategy:
|
||||
matrix:
|
||||
ubuntu: [20]
|
||||
include:
|
||||
- ubuntu: 20
|
||||
postgresql: 13
|
||||
postgis: 3
|
||||
pytest: py.test-3
|
||||
php: 7.4
|
||||
|
||||
runs-on: ubuntu-${{ matrix.ubuntu }}.04
|
||||
runs-on: ubuntu-20.04
|
||||
|
||||
steps:
|
||||
- uses: actions/download-artifact@v2
|
||||
@ -138,35 +128,27 @@ jobs:
|
||||
- name: Setup PHP
|
||||
uses: shivammathur/setup-php@v2
|
||||
with:
|
||||
php-version: ${{ matrix.php }}
|
||||
coverage: xdebug
|
||||
tools: phpunit, phpcs, composer
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.6
|
||||
if: matrix.ubuntu == 18
|
||||
php-version: 7.4
|
||||
|
||||
- uses: ./Nominatim/.github/actions/setup-postgresql
|
||||
with:
|
||||
postgresql-version: ${{ matrix.postgresql }}
|
||||
postgis-version: ${{ matrix.postgis }}
|
||||
postgresql-version: 13
|
||||
postgis-version: 3
|
||||
|
||||
- name: Install Postgresql server dev
|
||||
run: sudo apt-get install postgresql-server-dev-13
|
||||
|
||||
- uses: ./Nominatim/.github/actions/build-nominatim
|
||||
with:
|
||||
ubuntu: ${{ matrix.ubuntu }}
|
||||
ubuntu: 20
|
||||
cmake-args: -DBUILD_MODULE=on
|
||||
|
||||
- name: Install test prerequsites
|
||||
run: sudo apt-get install -y -qq python3-behave
|
||||
if: matrix.ubuntu == 20
|
||||
|
||||
- name: Install test prerequsites
|
||||
run: pip3 install behave==1.2.6
|
||||
if: matrix.ubuntu == 18
|
||||
|
||||
- name: BDD tests (icu tokenizer)
|
||||
- name: BDD tests (legacy tokenizer)
|
||||
run: |
|
||||
behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
|
||||
behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy --format=progress3
|
||||
working-directory: Nominatim/test/bdd
|
||||
|
||||
|
||||
|
@ -44,7 +44,7 @@ endif()
|
||||
|
||||
set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
|
||||
set(BUILD_API on CACHE BOOL "Build everything for the API server")
|
||||
set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module")
|
||||
set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
|
||||
set(BUILD_TESTS on CACHE BOOL "Build test suite")
|
||||
set(BUILD_DOCS on CACHE BOOL "Build documentation")
|
||||
set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
|
||||
|
@ -158,6 +158,17 @@ make
|
||||
sudo make install
|
||||
```
|
||||
|
||||
!!! warning
|
||||
The default installation no longer compiles the PostgreSQL module that
|
||||
is needed for the legacy tokenizer from older Nominatim versions. If you
|
||||
are upgrading an older database or want to run the
|
||||
[legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for
|
||||
some other reason, you need to enable the PostgreSQL module via
|
||||
cmake: `cmake -DBUILD_MODULE=on ../Nominatim`. To compile the module
|
||||
you need to have the server development headers for PostgreSQL installed.
|
||||
On Ubuntu/Debian run: `sudo apt install postgresql-server-dev-<postgresql version>`
|
||||
|
||||
|
||||
Nominatim installs itself into `/usr/local` by default. To choose a different
|
||||
installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the
|
||||
cmake command. Make sure that the `bin` directory is available in your path
|
||||
|
@ -17,6 +17,14 @@ breaking changes. **Please read them before running the migration.**
|
||||
|
||||
## 4.0.0 -> master
|
||||
|
||||
### ICU tokenizer is the new default
|
||||
|
||||
Nominatim now installs the [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer)
|
||||
by default. This only has an effect on newly installed databases. When
|
||||
updating an older database, the database keeps its installed tokenizer. If you still
|
||||
run with the legacy tokenizer, make sure to compile Nominatim with the
|
||||
PostgreSQL module, see [Installation](Installation.md#building-nominatim).
|
||||
|
||||
### geocodejson output changed
|
||||
|
||||
The `type` field of the geocodejson output has changed. It now contains
|
||||
|
@ -19,7 +19,22 @@ they can be configured.
|
||||
|
||||
The legacy tokenizer implements the analysis algorithms of older Nominatim
|
||||
versions. It uses a special PostgreSQL module to normalize names and queries.
|
||||
This tokenizer is currently the default.
|
||||
This tokenizer is automatically installed and used when upgrading an older
|
||||
database. It should not be used for new installations anymore.
|
||||
|
||||
### Compiling the PostgreSQL module
|
||||
|
||||
The tokenizer needs a special C module for PostgreSQL which is not compiled
|
||||
by default. If you need the legacy tokenizer, compile Nominatim as follows:
|
||||
|
||||
```
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DBUILD_MODULE=on ../Nominatim
|
||||
make
|
||||
```
|
||||
|
||||
### Enabling the tokenizer
|
||||
|
||||
To enable the tokenizer add the following line to your project configuration:
|
||||
|
||||
@ -47,6 +62,7 @@ normalization functions are hard-coded.
|
||||
The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
|
||||
normalize names and queries. It also offers configurable decomposition and
|
||||
abbreviation handling.
|
||||
This tokenizer is currently the default.
|
||||
|
||||
To enable the tokenizer add the following line to your project configuration:
|
||||
|
||||
|
@ -187,7 +187,7 @@ class Configuration:
|
||||
if configfile.suffix in ('.yaml', '.yml'):
|
||||
result = self._load_from_yaml(configfile)
|
||||
elif configfile.suffix == '.json':
|
||||
with configfile.open('r') as cfg:
|
||||
with configfile.open('r', encoding='utf-8') as cfg:
|
||||
result = json.load(cfg)
|
||||
else:
|
||||
raise UsageError(f"Config file '{configfile}' has unknown format.")
|
||||
|
@ -187,7 +187,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
||||
@define('CONST_Max_Word_Frequency', 10000000);
|
||||
@define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
|
||||
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
|
||||
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
|
||||
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
|
||||
|
||||
|
||||
def _save_config(self):
|
||||
|
@ -255,7 +255,7 @@ class LegacyTokenizer(AbstractTokenizer):
|
||||
@define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
|
||||
@define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
|
||||
require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
|
||||
""".format(config)))
|
||||
""".format(config)), encoding='utf-8')
|
||||
|
||||
|
||||
def _init_db_tables(self, config):
|
||||
|
@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH=
|
||||
# Tokenizer used for normalizing and parsing queries and names.
|
||||
# The tokenizer is set up during import and cannot be changed afterwards
|
||||
# without a reimport.
|
||||
# Currently available tokenizers: legacy
|
||||
NOMINATIM_TOKENIZER="legacy"
|
||||
# Currently available tokenizers: icu, legacy
|
||||
NOMINATIM_TOKENIZER="icu"
|
||||
|
||||
# Number of occurrences of a word before it is considered frequent.
|
||||
# Similar to the concept of stop words. Frequent partial words get ignored
|
||||
|
@ -59,5 +59,5 @@ def after_scenario(context, scenario):
|
||||
|
||||
def before_tag(context, tag):
|
||||
if tag == 'fail-legacy':
|
||||
if context.config.userdata['TOKENIZER'] in (None, 'legacy'):
|
||||
if context.config.userdata['TOKENIZER'] == 'legacy':
|
||||
context.scenario.skip("Not implemented in legacy tokenizer")
|
||||
|
@ -207,7 +207,7 @@ class NominatimEnvironment:
|
||||
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
|
||||
self.run_nominatim('freeze')
|
||||
|
||||
if self.tokenizer != 'icu':
|
||||
if self.tokenizer == 'legacy':
|
||||
phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
|
||||
run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
|
||||
else:
|
||||
|
@ -266,7 +266,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
|
||||
plist.sort()
|
||||
|
||||
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
if nctx.tokenizer == 'icu':
|
||||
if nctx.tokenizer != 'legacy':
|
||||
cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
|
||||
(plist,))
|
||||
else:
|
||||
|
@ -211,11 +211,6 @@ def osmline_table(temp_db_with_extensions, table_factory):
|
||||
country_code VARCHAR(2)""")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def word_table(temp_db_conn):
|
||||
return mocks.MockWordTable(temp_db_conn)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions):
|
||||
table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
|
||||
|
@ -14,7 +14,7 @@ import psycopg2.extras
|
||||
from nominatim.db import properties
|
||||
|
||||
# This must always point to the mock word table for the default tokenizer.
|
||||
from mock_legacy_word_table import MockLegacyWordTable as MockWordTable
|
||||
from mock_icu_word_table import MockIcuWordTable as MockWordTable
|
||||
|
||||
class MockPlacexTable:
|
||||
""" A placex table for testing.
|
||||
|
@ -179,7 +179,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory, w
|
||||
|
||||
@pytest.mark.parametrize("threads", (1, 5))
|
||||
def test_load_data(dsn, place_row, placex_table, osmline_table,
|
||||
word_table, temp_db_cursor, threads):
|
||||
temp_db_cursor, threads):
|
||||
for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'):
|
||||
temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT)
|
||||
RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
|
||||
|
@ -14,6 +14,8 @@ from nominatim.tools import migration
|
||||
from nominatim.errors import UsageError
|
||||
import nominatim.version
|
||||
|
||||
from mock_legacy_word_table import MockLegacyWordTable
|
||||
|
||||
class DummyTokenizer:
|
||||
|
||||
def update_sql_functions(self, config):
|
||||
@ -26,6 +28,10 @@ def postprocess_mock(monkeypatch):
|
||||
monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db',
|
||||
lambda *args: DummyTokenizer())
|
||||
|
||||
@pytest.fixture
|
||||
def legacy_word_table(temp_db_conn):
|
||||
return MockLegacyWordTable(temp_db_conn)
|
||||
|
||||
|
||||
def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config):
|
||||
table_factory('country_name', 'name HSTORE, country_code TEXT')
|
||||
@ -156,7 +162,7 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor,
|
||||
|
||||
|
||||
def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor,
|
||||
word_table, placex_table):
|
||||
legacy_word_table, placex_table):
|
||||
placex_table.add(housenumber='3A')
|
||||
|
||||
temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
|
||||
|
@ -65,7 +65,7 @@ def tokenizer():
|
||||
return dummy_tokenizer.DummyTokenizer(None, None)
|
||||
|
||||
@pytest.fixture
|
||||
def postcode_table(temp_db_conn, placex_table, word_table):
|
||||
def postcode_table(temp_db_conn, placex_table):
|
||||
return MockPostcodeTable(temp_db_conn)
|
||||
|
||||
|
||||
|
@ -25,10 +25,10 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
|
||||
sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
|
||||
libboost-filesystem-dev libexpat1-dev zlib1g-dev\
|
||||
libbz2-dev libpq-dev libproj-dev \
|
||||
postgresql-server-dev-10 postgresql-10-postgis-2.4 \
|
||||
postgresql-10-postgis-2.4 \
|
||||
postgresql-contrib-10 postgresql-10-postgis-scripts \
|
||||
php php-pgsql php-intl libicu-dev python3-pip \
|
||||
python3-psutil python3-jinja2 python3-icu git
|
||||
python3-psutil python3-jinja2 python3-yaml python3-icu git
|
||||
|
||||
# Some of the Python packages that come with Ubuntu 18.04 are too old, so
|
||||
# install the latest version from pip:
|
||||
|
@ -24,11 +24,11 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
|
||||
sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
|
||||
libboost-filesystem-dev libexpat1-dev zlib1g-dev \
|
||||
libbz2-dev libpq-dev libproj-dev \
|
||||
postgresql-server-dev-12 postgresql-12-postgis-3 \
|
||||
postgresql-12-postgis-3 \
|
||||
postgresql-contrib-12 postgresql-12-postgis-3-scripts \
|
||||
php php-pgsql php-intl libicu-dev python3-dotenv \
|
||||
python3-psycopg2 python3-psutil python3-jinja2 \
|
||||
python3-icu python3-datrie git
|
||||
python3-icu python3-datrie python3-yaml git
|
||||
|
||||
#
|
||||
# System Configuration
|
||||
|
Loading…
Reference in New Issue
Block a user