Merge pull request #2428 from lonvia/rename-icu-tokenizer

Rename legacy_icu tokenizer to icu tokenizer
2024-12-26 14:36:23 +03:00 · 2021-08-18 15:02:19 +02:00 · 2021-08-18 15:02:19 +02:00 · 925195725d
commit 925195725d
parent 656c1291b1 f6d22df76e
11 changed files with 24 additions and 18 deletions
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -103,9 +103,9 @@ jobs:
              working-directory: Nominatim/test/bdd
              if: matrix.ubuntu == 18

-            - name: BDD tests (legacy_icu tokenizer)
+            - name: BDD tests (icu tokenizer)
              run: |
-                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy_icu --format=progress3
+                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
              working-directory: Nominatim/test/bdd

            - name: Upload coverage to Codecov
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -258,6 +258,6 @@ install(FILES settings/env.defaults
              settings/import-address.style
              settings/import-full.style
              settings/import-extratags.style
-              settings/legacy_icu_tokenizer.yaml
+              settings/icu_tokenizer.yaml
              settings/icu-rules/extended-unicode-to-asccii.yaml
        DESTINATION ${NOMINATIM_CONFIGDIR})
--- a/docs/admin/Tokenizers.md
+++ b/docs/admin/Tokenizers.md
@@ -52,6 +52,12 @@ The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
 normalize names and queries. It also offers configurable decomposition and
 abbreviation handling.

+To enable the tokenizer add the following line to your project configuration:
+
+```
+NOMINATIM_TOKENIZER=icu
+```
+
 ### How it works

 On import the tokenizer processes names in the following four stages:
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
--- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -52,7 +52,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
+            cfgfile = config.config_dir / 'icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
@@ -88,7 +88,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


@@ -98,7 +98,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
        self.init_from_project()

        if self.naming_rules is None:
-            return "Configuration for tokenizer 'legacy_icu' are missing."
+            return "Configuration for tokenizer 'icu' are missing."

        return None

@@ -130,7 +130,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
-            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
+            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self, config):
--- a/settings/legacy_icu_tokenizer.yaml
+++ b/settings/legacy_icu_tokenizer.yaml
--- a/test/Makefile
+++ b/test/Makefile
@@ -5,7 +5,7 @@ bdd:
 	cd bdd && behave -DREMOVE_TEMPLATE=1

 icu:
-	cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=legacy_icu
+	cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=icu

 php:
 	cd php && phpunit ./
--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@@ -201,7 +201,7 @@ class NominatimEnvironment:
                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
                    self.run_nominatim('freeze')

-                    if self.tokenizer != 'legacy_icu':
+                    if self.tokenizer != 'icu':
                        phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
                        run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                    else:
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -280,7 +280,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
    plist.sort()

    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        if nctx.tokenizer == 'legacy_icu':
+        if nctx.tokenizer == 'icu':
            cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                        (plist,))
        else:
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -6,7 +6,7 @@ import yaml

 import pytest

-from nominatim.tokenizer import legacy_icu_tokenizer
+from nominatim.tokenizer import icu_tokenizer
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
@@ -26,7 +26,7 @@ def test_config(def_config, tmp_path):
    sqldir = tmp_path / 'sql'
    sqldir.mkdir()
    (sqldir / 'tokenizer').mkdir()
-    (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
    shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
                str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))

@@ -41,7 +41,7 @@ def tokenizer_factory(dsn, tmp_path, property_table,
    (tmp_path / 'tokenizer').mkdir()

    def _maker():
-        return legacy_icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+        return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')

    return _maker

@@ -57,7 +57,7 @@ def db_prop(temp_db_conn):
 @pytest.fixture
 def analyzer(tokenizer_factory, test_config, monkeypatch,
             temp_db_with_extensions, tmp_path):
-    sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    sql = tmp_path / 'sql' / 'tokenizer' / 'icu_tokenizer.sql'
    sql.write_text("SELECT 'a';")

    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
@@ -146,8 +146,8 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
    tok = tokenizer_factory()
    tok.init_new_db(test_config)

-    assert db_prop(legacy_icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
+    assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
+    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None


 def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
@@ -187,11 +187,11 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
    tok.init_new_db(test_config)
    monkeypatch.undo()

-    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
+    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'

    table_factory('test', 'txt TEXT')

-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
    func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")

    tok.update_sql_functions(test_config)