Merge pull request #2428 from lonvia/rename-icu-tokenizer

Rename legacy_icu tokenizer to icu tokenizer
Sarah Hoffmann 2021-08-18 15:02:19 +02:00 committed by GitHub
commit 925195725d
11 changed files with 24 additions and 18 deletions
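
For users, the visible effect of this rename is in the project configuration: the tokenizer is now selected as `icu` rather than `legacy_icu`. A minimal sketch of the updated setting (assuming a standard project directory with a `.env` file; the line itself comes from the documentation change below):

```
# .env in the Nominatim project directory
# was: NOMINATIM_TOKENIZER=legacy_icu
NOMINATIM_TOKENIZER=icu
```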


@@ -103,9 +103,9 @@ jobs:
         working-directory: Nominatim/test/bdd
         if: matrix.ubuntu == 18

-      - name: BDD tests (legacy_icu tokenizer)
+      - name: BDD tests (icu tokenizer)
         run: |
-            behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy_icu --format=progress3
+            behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
         working-directory: Nominatim/test/bdd

       - name: Upload coverage to Codecov


@@ -258,6 +258,6 @@ install(FILES settings/env.defaults
               settings/import-address.style
               settings/import-full.style
               settings/import-extratags.style
-              settings/legacy_icu_tokenizer.yaml
+              settings/icu_tokenizer.yaml
               settings/icu-rules/extended-unicode-to-asccii.yaml
        DESTINATION ${NOMINATIM_CONFIGDIR})


@@ -52,6 +52,12 @@ The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
 normalize names and queries. It also offers configurable decomposition and
 abbreviation handling.

+To enable the tokenizer add the following line to your project configuration:
+
+```
+NOMINATIM_TOKENIZER=icu
+```
+
 ### How it works

 On import the tokenizer processes names in the following four stages:
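
Note that Nominatim chooses the tokenizer during the initial import and records that choice in the database, so the setting above takes effect for fresh imports; an existing database keeps the tokenizer it was imported with.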


@@ -52,7 +52,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
+            cfgfile = config.config_dir / 'icu_tokenizer.yaml'

         loader = ICURuleLoader(cfgfile)
         self.naming_rules = ICUNameProcessorRules(loader=loader)
@@ -88,7 +88,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
                               max_word_freq=max_word_freq)
@@ -98,7 +98,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             self.init_from_project()

         if self.naming_rules is None:
-            return "Configuration for tokenizer 'legacy_icu' are missing."
+            return "Configuration for tokenizer 'icu' are missing."

         return None
@@ -130,7 +130,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
                 @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
                 @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
                 @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
-                require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

     def _save_config(self, config):
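
The first hunk in this file only swaps the default file name, but the lookup order it touches is worth spelling out: an explicit `TOKENIZER_CONFIG` setting wins, otherwise the tokenizer falls back to the `icu_tokenizer.yaml` shipped in the configuration directory. A standalone sketch of that fallback, mirroring the diff (the helper name and the bare `config` object are illustrative, not Nominatim API):

```python
from pathlib import Path

def resolve_tokenizer_config(config):
    # Sketch of the fallback shown in the hunk at line 52: an explicit
    # user-supplied TOKENIZER_CONFIG path takes precedence over the
    # default rules file, which this commit renames to icu_tokenizer.yaml.
    if config.TOKENIZER_CONFIG:
        return Path(config.TOKENIZER_CONFIG)
    return config.config_dir / 'icu_tokenizer.yaml'
```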


@@ -5,7 +5,7 @@ bdd:
 	cd bdd && behave -DREMOVE_TEMPLATE=1

 icu:
-	cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=legacy_icu
+	cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=icu

 php:
 	cd php && phpunit ./
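
After the rename, the `icu` target's name finally matches the tokenizer it selects: `make icu` (run from wherever this Makefile lives, presumably the test directory given the `cd bdd` recipes) exercises the BDD suite with `-DTOKENIZER=icu`.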


@@ -201,7 +201,7 @@ class NominatimEnvironment:
             self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
             self.run_nominatim('freeze')

-            if self.tokenizer != 'legacy_icu':
+            if self.tokenizer != 'icu':
                 phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
                 run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
             else:


@@ -280,7 +280,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
     plist.sort()

     with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        if nctx.tokenizer == 'legacy_icu':
+        if nctx.tokenizer == 'icu':
             cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                         (plist,))
         else:
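
The branch exists because the two tokenizers lay out the `word` table differently: the ICU tokenizer marks postcode entries with `type = 'P'`, while the legacy tokenizer (handled in the `else` branch, truncated here) needs its own query for the same check.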


@@ -6,7 +6,7 @@ import yaml
 import pytest

-from nominatim.tokenizer import legacy_icu_tokenizer
+from nominatim.tokenizer import icu_tokenizer
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
@@ -26,7 +26,7 @@ def test_config(def_config, tmp_path):
     sqldir = tmp_path / 'sql'
     sqldir.mkdir()
     (sqldir / 'tokenizer').mkdir()
-    (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
     shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
                 str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
@@ -41,7 +41,7 @@ def tokenizer_factory(dsn, tmp_path, property_table,
     (tmp_path / 'tokenizer').mkdir()

     def _maker():
-        return legacy_icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+        return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')

     return _maker
@@ -57,7 +57,7 @@ def db_prop(temp_db_conn):
 @pytest.fixture
 def analyzer(tokenizer_factory, test_config, monkeypatch,
              temp_db_with_extensions, tmp_path):
-    sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    sql = tmp_path / 'sql' / 'tokenizer' / 'icu_tokenizer.sql'
     sql.write_text("SELECT 'a';")

     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
@@ -146,8 +146,8 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)

-    assert db_prop(legacy_icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
+    assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
+    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None


 def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
@@ -187,11 +187,11 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
     tok.init_new_db(test_config)
     monkeypatch.undo()

-    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
+    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'

     table_factory('test', 'txt TEXT')
-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
     func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")

     tok.update_sql_functions(test_config)