mirror of
https://github.com/osm-search/Nominatim.git
synced 2024-11-27 00:49:55 +03:00
boilerplate for PHP code of tokenizer
This adds an installation step for the PHP code of the tokenizer. The PHP code is split into two parts. The updatable code is found in lib-php. The tokenizer installs an additional script in the project directory, which then includes the code from lib-php and defines all settings that are static to the database. The website code then always includes the PHP from the project directory.
This commit is contained in:
parent
23fd1d032a
commit
3eb4d88057
@ -8,6 +8,7 @@ require_once(CONST_LibDir.'/ReverseGeocode.php');
|
|||||||
require_once(CONST_LibDir.'/SearchDescription.php');
|
require_once(CONST_LibDir.'/SearchDescription.php');
|
||||||
require_once(CONST_LibDir.'/SearchContext.php');
|
require_once(CONST_LibDir.'/SearchContext.php');
|
||||||
require_once(CONST_LibDir.'/TokenList.php');
|
require_once(CONST_LibDir.'/TokenList.php');
|
||||||
|
require_once(CONST_TokenizerDir.'/tokenizer.php');
|
||||||
|
|
||||||
class Geocode
|
class Geocode
|
||||||
{
|
{
|
||||||
|
@ -2,7 +2,6 @@
|
|||||||
@define('CONST_LibDir', dirname(dirname(__FILE__)));
|
@define('CONST_LibDir', dirname(dirname(__FILE__)));
|
||||||
|
|
||||||
require_once(CONST_LibDir.'/init-cmd.php');
|
require_once(CONST_LibDir.'/init-cmd.php');
|
||||||
require_once(CONST_LibDir.'/Geocode.php');
|
|
||||||
require_once(CONST_LibDir.'/ParameterParser.php');
|
require_once(CONST_LibDir.'/ParameterParser.php');
|
||||||
ini_set('memory_limit', '800M');
|
ini_set('memory_limit', '800M');
|
||||||
|
|
||||||
@ -41,16 +40,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
|
|||||||
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
|
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
|
||||||
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
|
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
|
||||||
@define('CONST_Log_File', getSetting('LOG_FILE', false));
|
@define('CONST_Log_File', getSetting('LOG_FILE', false));
|
||||||
@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
|
|
||||||
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
|
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
|
||||||
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
|
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
|
||||||
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
|
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
|
||||||
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
|
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
|
||||||
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
|
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
|
||||||
@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
|
|
||||||
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
|
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
|
||||||
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
|
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
|
||||||
|
@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
|
||||||
|
|
||||||
|
require_once(CONST_LibDir.'/Geocode.php');
|
||||||
|
|
||||||
$oDB = new Nominatim\DB;
|
$oDB = new Nominatim\DB;
|
||||||
$oDB->connect();
|
$oDB->connect();
|
||||||
|
@ -3,7 +3,6 @@
|
|||||||
|
|
||||||
require_once(CONST_LibDir.'/init-cmd.php');
|
require_once(CONST_LibDir.'/init-cmd.php');
|
||||||
require_once(CONST_LibDir.'/log.php');
|
require_once(CONST_LibDir.'/log.php');
|
||||||
require_once(CONST_LibDir.'/Geocode.php');
|
|
||||||
require_once(CONST_LibDir.'/PlaceLookup.php');
|
require_once(CONST_LibDir.'/PlaceLookup.php');
|
||||||
require_once(CONST_LibDir.'/ReverseGeocode.php');
|
require_once(CONST_LibDir.'/ReverseGeocode.php');
|
||||||
|
|
||||||
@ -26,16 +25,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
|
|||||||
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
|
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
|
||||||
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
|
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
|
||||||
@define('CONST_Log_File', getSetting('LOG_FILE', false));
|
@define('CONST_Log_File', getSetting('LOG_FILE', false));
|
||||||
@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
|
|
||||||
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
|
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
|
||||||
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
|
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
|
||||||
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
|
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
|
||||||
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
|
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
|
||||||
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
|
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
|
||||||
@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
|
|
||||||
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
|
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
|
||||||
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
|
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
|
||||||
|
@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
|
||||||
|
|
||||||
|
require_once(CONST_LibDir.'/Geocode.php');
|
||||||
|
|
||||||
$oDB = new Nominatim\DB();
|
$oDB = new Nominatim\DB();
|
||||||
$oDB->connect();
|
$oDB->connect();
|
||||||
|
1
lib-php/tokenizer/legacy_tokenizer.php
Normal file
1
lib-php/tokenizer/legacy_tokenizer.php
Normal file
@ -0,0 +1 @@
|
|||||||
|
<?php
|
@ -54,8 +54,7 @@ def create_tokenizer(config, init_db=True, module_name=None):
|
|||||||
tokenizer_module = _import_tokenizer(module_name)
|
tokenizer_module = _import_tokenizer(module_name)
|
||||||
|
|
||||||
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
|
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
|
||||||
if init_db:
|
tokenizer.init_new_db(config, init_db=init_db)
|
||||||
tokenizer.init_new_db(config)
|
|
||||||
|
|
||||||
with connect(config.get_libpq_dsn()) as conn:
|
with connect(config.get_libpq_dsn()) as conn:
|
||||||
properties.set_property(conn, 'tokenizer', module_name)
|
properties.set_property(conn, 'tokenizer', module_name)
|
||||||
|
@ -5,6 +5,7 @@ from collections import OrderedDict
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
|
from textwrap import dedent
|
||||||
|
|
||||||
from icu import Transliterator
|
from icu import Transliterator
|
||||||
import psycopg2
|
import psycopg2
|
||||||
@ -87,7 +88,7 @@ class LegacyTokenizer:
|
|||||||
self.normalization = None
|
self.normalization = None
|
||||||
|
|
||||||
|
|
||||||
def init_new_db(self, config):
|
def init_new_db(self, config, init_db=True):
|
||||||
""" Set up a new tokenizer for the database.
|
""" Set up a new tokenizer for the database.
|
||||||
|
|
||||||
This copies all necessary data in the project directory to make
|
This copies all necessary data in the project directory to make
|
||||||
@ -99,13 +100,16 @@ class LegacyTokenizer:
|
|||||||
|
|
||||||
self.normalization = config.TERM_NORMALIZATION
|
self.normalization = config.TERM_NORMALIZATION
|
||||||
|
|
||||||
|
self._install_php(config)
|
||||||
|
|
||||||
with connect(self.dsn) as conn:
|
with connect(self.dsn) as conn:
|
||||||
_check_module(module_dir, conn)
|
_check_module(module_dir, conn)
|
||||||
self._save_config(conn, config)
|
self._save_config(conn, config)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
self.update_sql_functions(config)
|
if init_db:
|
||||||
self._init_db_tables(config)
|
self.update_sql_functions(config)
|
||||||
|
self._init_db_tables(config)
|
||||||
|
|
||||||
|
|
||||||
def init_from_project(self):
|
def init_from_project(self):
|
||||||
@ -165,6 +169,18 @@ class LegacyTokenizer:
|
|||||||
return LegacyNameAnalyzer(self.dsn, normalizer)
|
return LegacyNameAnalyzer(self.dsn, normalizer)
|
||||||
|
|
||||||
|
|
||||||
|
def _install_php(self, config):
|
||||||
|
""" Install the php script for the tokenizer.
|
||||||
|
"""
|
||||||
|
php_file = self.data_dir / "tokenizer.php"
|
||||||
|
php_file.write_text(dedent("""\
|
||||||
|
<?php
|
||||||
|
@define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
|
||||||
|
@define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
|
||||||
|
require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
|
||||||
|
""".format(config)))
|
||||||
|
|
||||||
|
|
||||||
def _init_db_tables(self, config):
|
def _init_db_tables(self, config):
|
||||||
""" Set up the word table and fill it with pre-computed word
|
""" Set up the word table and fill it with pre-computed word
|
||||||
frequencies.
|
frequencies.
|
||||||
|
@ -104,13 +104,11 @@ PHP_CONST_DEFS = (
|
|||||||
('Default_Language', 'DEFAULT_LANGUAGE', str),
|
('Default_Language', 'DEFAULT_LANGUAGE', str),
|
||||||
('Log_DB', 'LOG_DB', bool),
|
('Log_DB', 'LOG_DB', bool),
|
||||||
('Log_File', 'LOG_FILE', str),
|
('Log_File', 'LOG_FILE', str),
|
||||||
('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
|
|
||||||
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
|
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
|
||||||
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
|
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
|
||||||
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
|
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
|
||||||
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
|
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
|
||||||
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
|
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
|
||||||
('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
|
|
||||||
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
|
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
|
||||||
('MapIcon_URL', 'MAPICON_URL', str),
|
('MapIcon_URL', 'MAPICON_URL', str),
|
||||||
)
|
)
|
||||||
@ -175,9 +173,11 @@ def setup_website(basedir, config):
|
|||||||
|
|
||||||
@define('CONST_Debug', $_GET['debug'] ?? false);
|
@define('CONST_Debug', $_GET['debug'] ?? false);
|
||||||
@define('CONST_LibDir', '{0}');
|
@define('CONST_LibDir', '{0}');
|
||||||
|
@define('CONST_TokenizerDir', '{2}');
|
||||||
@define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
|
@define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
|
||||||
|
|
||||||
""".format(config.lib_dir.php, NOMINATIM_VERSION))
|
""".format(config.lib_dir.php, NOMINATIM_VERSION,
|
||||||
|
config.project_dir / 'tokenizer'))
|
||||||
|
|
||||||
for php_name, conf_name, var_type in PHP_CONST_DEFS:
|
for php_name, conf_name, var_type in PHP_CONST_DEFS:
|
||||||
if var_type == bool:
|
if var_type == bool:
|
||||||
|
@ -10,6 +10,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve()))
|
|||||||
from nominatim import cli
|
from nominatim import cli
|
||||||
from nominatim.config import Configuration
|
from nominatim.config import Configuration
|
||||||
from nominatim.tools import refresh
|
from nominatim.tools import refresh
|
||||||
|
from nominatim.tokenizer import factory as tokenizer_factory
|
||||||
from steps.utils import run_script
|
from steps.utils import run_script
|
||||||
|
|
||||||
class NominatimEnvironment:
|
class NominatimEnvironment:
|
||||||
@ -179,27 +180,25 @@ class NominatimEnvironment:
|
|||||||
"""
|
"""
|
||||||
self.write_nominatim_config(self.api_test_db)
|
self.write_nominatim_config(self.api_test_db)
|
||||||
|
|
||||||
if self.api_db_done:
|
if not self.api_db_done:
|
||||||
return
|
self.api_db_done = True
|
||||||
|
|
||||||
self.api_db_done = True
|
if not self._reuse_or_drop_db(self.api_test_db):
|
||||||
|
testdata = Path('__file__') / '..' / '..' / 'testdb'
|
||||||
|
self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
|
||||||
|
|
||||||
if self._reuse_or_drop_db(self.api_test_db):
|
try:
|
||||||
return
|
self.run_nominatim('import', '--osm-file', str(self.api_test_file))
|
||||||
|
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
|
||||||
|
self.run_nominatim('freeze')
|
||||||
|
|
||||||
testdata = Path('__file__') / '..' / '..' / 'testdb'
|
phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
|
||||||
self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
|
run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
|
||||||
|
except:
|
||||||
|
self.db_drop_database(self.api_test_db)
|
||||||
|
raise
|
||||||
|
|
||||||
try:
|
tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
|
||||||
self.run_nominatim('import', '--osm-file', str(self.api_test_file))
|
|
||||||
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
|
|
||||||
self.run_nominatim('freeze')
|
|
||||||
|
|
||||||
phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
|
|
||||||
run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
|
|
||||||
except:
|
|
||||||
self.db_drop_database(self.api_test_db)
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
def setup_unknown_db(self):
|
def setup_unknown_db(self):
|
||||||
|
@ -16,7 +16,7 @@ class DummyTokenizer:
|
|||||||
self.analyser_cache = {}
|
self.analyser_cache = {}
|
||||||
|
|
||||||
|
|
||||||
def init_new_db(self, config):
|
def init_new_db(self, *args, **kwargs):
|
||||||
assert self.init_state == None
|
assert self.init_state == None
|
||||||
self.init_state = "new"
|
self.init_state = "new"
|
||||||
|
|
||||||
|
@ -36,6 +36,7 @@ def test_config(def_config, tmp_path):
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
|
def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
|
||||||
|
(tmp_path / 'tokenizer').mkdir()
|
||||||
|
|
||||||
def _maker():
|
def _maker():
|
||||||
return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
|
return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
|
||||||
|
@ -26,6 +26,7 @@ def test_script(envdir):
|
|||||||
|
|
||||||
def run_website_script(envdir, config):
|
def run_website_script(envdir, config):
|
||||||
config.lib_dir.php = envdir / 'php'
|
config.lib_dir.php = envdir / 'php'
|
||||||
|
config.project_dir = envdir
|
||||||
refresh.setup_website(envdir, config)
|
refresh.setup_website(envdir, config)
|
||||||
|
|
||||||
proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',
|
proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',
|
||||||
|
Loading…
Reference in New Issue
Block a user