Mirror of https://github.com/osm-search/Nominatim.git, synced 2024-12-27 06:51:42 +03:00
Merge pull request #2428 from lonvia/rename-icu-tokenizer
Rename legacy_icu tokenizer to icu tokenizer
Commit 925195725d

.github/workflows/ci-tests.yml
@@ -103,9 +103,9 @@ jobs:
       working-directory: Nominatim/test/bdd
       if: matrix.ubuntu == 18
 
-    - name: BDD tests (legacy_icu tokenizer)
+    - name: BDD tests (icu tokenizer)
       run: |
-        behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy_icu --format=progress3
+        behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
       working-directory: Nominatim/test/bdd
 
     - name: Upload coverage to Codecov
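The `-D` flags in the CI step above are behave "userdata" definitions, which step code reads via `context.config.userdata`. A minimal sketch of how a BDD environment hook might pick up the TOKENIZER switch (a hypothetical hook body for illustration, not Nominatim's actual environment.py):

```python
# Hypothetical environment.py sketch -- shows how behave exposes
# -Dkey=value flags, not Nominatim's actual test setup.

def before_all(context):
    # behave collects -D definitions into a dict-like userdata object.
    userdata = context.config.userdata
    context.tokenizer = userdata.get('TOKENIZER', 'legacy')
    context.remove_template = userdata.get('REMOVE_TEMPLATE', '0') == '1'
```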
@@ -258,6 +258,6 @@ install(FILES settings/env.defaults
         settings/import-address.style
         settings/import-full.style
         settings/import-extratags.style
-        settings/legacy_icu_tokenizer.yaml
+        settings/icu_tokenizer.yaml
         settings/icu-rules/extended-unicode-to-asccii.yaml
         DESTINATION ${NOMINATIM_CONFIGDIR})
@@ -52,6 +52,12 @@ The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
 normalize names and queries. It also offers configurable decomposition and
 abbreviation handling.
 
+To enable the tokenizer add the following line to your project configuration:
+
+```
+NOMINATIM_TOKENIZER=icu
+```
+
 ### How it works
 
 On import the tokenizer processes names in the following four stages:
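The documentation hunk above only names the normalization step in passing. As an illustration of what ICU rule-based transliteration looks like, here is a minimal PyICU sketch; the rule string is a toy example of mine, not the rules shipped in icu_tokenizer.yaml:

```python
# Toy example only -- not Nominatim's shipped ICU configuration.
from icu import Transliterator

# Lowercase, transliterate to Latin, then fold to plain ASCII.
RULES = ":: lower(); :: Latin; :: Latin-ASCII;"

normalizer = Transliterator.createFromRules("toy-normalizer", RULES)

print(normalizer.transliterate("Straße"))  # strasse
print(normalizer.transliterate("Zürich"))  # zurich
```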
@@ -52,7 +52,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
+            cfgfile = config.config_dir / 'icu_tokenizer.yaml'
 
         loader = ICURuleLoader(cfgfile)
         self.naming_rules = ICUNameProcessorRules(loader=loader)
@@ -88,7 +88,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
                               max_word_freq=max_word_freq)
 
 
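For context on `run_sql_file` above: SQLPreprocessor renders the SQL files as Jinja2 templates (my reading of the surrounding code, not something shown in this diff), so keyword arguments such as `max_word_freq` fill `{{...}}` placeholders like the one exercised by the test at the end of this commit. A self-contained sketch of that substitution step, with `render_sql` as a hypothetical helper:

```python
# Minimal sketch of template-style SQL preprocessing; render_sql is a
# hypothetical helper, not Nominatim's SQLPreprocessor class.
import jinja2

def render_sql(template_text: str, **params) -> str:
    # Fill {{...}} placeholders; the caller would then execute the result.
    return jinja2.Template(template_text).render(**params)

sql = render_sql("INSERT INTO test VALUES ('{{max_word_freq}}')",
                 max_word_freq='1133')
print(sql)  # INSERT INTO test VALUES ('1133')
```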
@@ -98,7 +98,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self.init_from_project()
 
         if self.naming_rules is None:
-            return "Configuration for tokenizer 'legacy_icu' are missing."
+            return "Configuration for tokenizer 'icu' are missing."
 
         return None
 
@@ -130,7 +130,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
             @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
-            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
+            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
     def _save_config(self, config):
@@ -5,7 +5,7 @@ bdd:
 	cd bdd && behave -DREMOVE_TEMPLATE=1
 
 icu:
-	cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=legacy_icu
+	cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=icu
 
 php:
 	cd php && phpunit ./
@@ -201,7 +201,7 @@ class NominatimEnvironment:
             self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
             self.run_nominatim('freeze')
 
-            if self.tokenizer != 'legacy_icu':
+            if self.tokenizer != 'icu':
                 phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
                 run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
             else:
@@ -280,7 +280,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
     plist.sort()
 
     with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        if nctx.tokenizer == 'legacy_icu':
+        if nctx.tokenizer == 'icu':
            cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                        (plist,))
         else:
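A side note on the `cur.execute` call above: psycopg2 adapts a Python list to a SQL array, which is why the postcodes can be passed as `(plist,)` and matched with `= any(%s)`. A standalone sketch (the DSN and postcode values are placeholders):

```python
# Sketch: psycopg2 adapts Python lists to SQL arrays, so `= any(%s)`
# matches against every element of plist. DSN below is a placeholder.
import psycopg2

conn = psycopg2.connect("dbname=test_api_nominatim")
plist = ['01982', '10117']  # placeholder postcodes
with conn.cursor() as cur:
    cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                (plist,))
    found = {row[0] for row in cur}
print(found)
```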
@@ -6,7 +6,7 @@ import yaml
 
 import pytest
 
-from nominatim.tokenizer import legacy_icu_tokenizer
+from nominatim.tokenizer import icu_tokenizer
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
@@ -26,7 +26,7 @@ def test_config(def_config, tmp_path):
     sqldir = tmp_path / 'sql'
     sqldir.mkdir()
     (sqldir / 'tokenizer').mkdir()
-    (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
     shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
                 str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
 
@@ -41,7 +41,7 @@ def tokenizer_factory(dsn, tmp_path, property_table,
     (tmp_path / 'tokenizer').mkdir()
 
     def _maker():
-        return legacy_icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+        return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
 
     return _maker
 
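The `_maker` closure above is the usual pytest "factory fixture" pattern: the fixture returns a callable so each test controls when the tokenizer is created. A generic, self-contained sketch of the pattern (names are illustrative only):

```python
import pytest

@pytest.fixture
def widget_factory(tmp_path):
    # Return a callable instead of an object, so the test can finish
    # its own setup before constructing the thing under test.
    def _maker(name="default"):
        path = tmp_path / name
        path.mkdir()
        return path
    return _maker

def test_widget(widget_factory):
    assert widget_factory("tok").name == "tok"
```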
@@ -57,7 +57,7 @@ def db_prop(temp_db_conn):
 @pytest.fixture
 def analyzer(tokenizer_factory, test_config, monkeypatch,
              temp_db_with_extensions, tmp_path):
-    sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    sql = tmp_path / 'sql' / 'tokenizer' / 'icu_tokenizer.sql'
     sql.write_text("SELECT 'a';")
 
     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
@@ -146,8 +146,8 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
-    assert db_prop(legacy_icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
+    assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
+    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
 
 
 def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
@@ -187,11 +187,11 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
     tok.init_new_db(test_config)
     monkeypatch.undo()
 
-    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
+    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
 
     table_factory('test', 'txt TEXT')
 
-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
     func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")
 
     tok.update_sql_functions(test_config)