boilerplate for PHP code of tokenizer

This adds an installation step for the PHP code of the tokenizer. The
PHP code is split into two parts: the updatable code lives in lib-php,
while the tokenizer installs an additional script in the project
directory that includes the code from lib-php and defines all settings
that are static to the database. The website code then always includes
the PHP from the project directory.
Sarah Hoffmann 2021-04-28 10:59:07 +02:00
parent 23fd1d032a
commit 3eb4d88057
11 changed files with 48 additions and 32 deletions
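
As a rough illustration of the mechanism described above, the script installed into the project directory would look something like the sketch below; the frequency value, normalization rules and lib-php path are made-up placeholders, the real contents are written by LegacyTokenizer._install_php() in the diff further down and then pulled in by the website code via CONST_TokenizerDir.

    <?php
    // Hypothetical <project-dir>/tokenizer/tokenizer.php -- the values here are
    // placeholders standing in for whatever is static to this database.
    @define('CONST_Max_Word_Frequency', 50000);
    @define('CONST_Term_Normalization_Rules', ":: lower ();");
    require_once('/usr/local/share/nominatim/lib-php/tokenizer/legacy_tokenizer.php');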

View File

@@ -8,6 +8,7 @@ require_once(CONST_LibDir.'/ReverseGeocode.php');
 require_once(CONST_LibDir.'/SearchDescription.php');
 require_once(CONST_LibDir.'/SearchContext.php');
 require_once(CONST_LibDir.'/TokenList.php');
+require_once(CONST_TokenizerDir.'/tokenizer.php');

 class Geocode
 {

View File

@@ -2,7 +2,6 @@
 @define('CONST_LibDir', dirname(dirname(__FILE__)));

 require_once(CONST_LibDir.'/init-cmd.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/ParameterParser.php');

 ini_set('memory_limit', '800M');
@@ -41,16 +40,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+
+require_once(CONST_LibDir.'/Geocode.php');

 $oDB = new Nominatim\DB;
 $oDB->connect();

View File

@@ -3,7 +3,6 @@

 require_once(CONST_LibDir.'/init-cmd.php');
 require_once(CONST_LibDir.'/log.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/PlaceLookup.php');
 require_once(CONST_LibDir.'/ReverseGeocode.php');
@@ -26,16 +25,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+
+require_once(CONST_LibDir.'/Geocode.php');

 $oDB = new Nominatim\DB();
 $oDB->connect();

View File

@@ -0,0 +1 @@
+<?php

View File

@@ -54,8 +54,7 @@ def create_tokenizer(config, init_db=True, module_name=None):
     tokenizer_module = _import_tokenizer(module_name)

     tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
-    if init_db:
-        tokenizer.init_new_db(config)
+    tokenizer.init_new_db(config, init_db=init_db)

     with connect(config.get_libpq_dsn()) as conn:
         properties.set_property(conn, 'tokenizer', module_name)

View File

@@ -5,6 +5,7 @@ from collections import OrderedDict
 import logging
 import re
 import shutil
+from textwrap import dedent

 from icu import Transliterator
 import psycopg2
@@ -87,7 +88,7 @@ class LegacyTokenizer:
         self.normalization = None


-    def init_new_db(self, config):
+    def init_new_db(self, config, init_db=True):
         """ Set up a new tokenizer for the database.

             This copies all necessary data in the project directory to make
@@ -99,13 +100,16 @@ class LegacyTokenizer:
         self.normalization = config.TERM_NORMALIZATION

+        self._install_php(config)
+
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
             self._save_config(conn, config)
             conn.commit()

-        self.update_sql_functions(config)
-        self._init_db_tables(config)
+        if init_db:
+            self.update_sql_functions(config)
+            self._init_db_tables(config)


     def init_from_project(self):
@@ -165,6 +169,18 @@ class LegacyTokenizer:
         return LegacyNameAnalyzer(self.dsn, normalizer)


+    def _install_php(self, config):
+        """ Install the php script for the tokenizer.
+        """
+        php_file = self.data_dir / "tokenizer.php"
+
+        php_file.write_text(dedent("""\
+            <?php
+            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+            """.format(config)))
+
+
     def _init_db_tables(self, config):
         """ Set up the word table and fill it with pre-computed word
             frequencies.

View File

@@ -104,13 +104,11 @@ PHP_CONST_DEFS = (
     ('Default_Language', 'DEFAULT_LANGUAGE', str),
     ('Log_DB', 'LOG_DB', bool),
     ('Log_File', 'LOG_FILE', str),
-    ('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
     ('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
     ('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
     ('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
     ('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
     ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
-    ('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
     ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
     ('MapIcon_URL', 'MAPICON_URL', str),
 )
@@ -175,9 +173,11 @@ def setup_website(basedir, config):
     @define('CONST_Debug', $_GET['debug'] ?? false);
     @define('CONST_LibDir', '{0}');
+    @define('CONST_TokenizerDir', '{2}');
     @define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');

-    """.format(config.lib_dir.php, NOMINATIM_VERSION))
+    """.format(config.lib_dir.php, NOMINATIM_VERSION,
+               config.project_dir / 'tokenizer'))

     for php_name, conf_name, var_type in PHP_CONST_DEFS:
         if var_type == bool:

View File

@@ -10,6 +10,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve()))
 from nominatim import cli
 from nominatim.config import Configuration
 from nominatim.tools import refresh
+from nominatim.tokenizer import factory as tokenizer_factory
 from steps.utils import run_script

 class NominatimEnvironment:
@@ -179,27 +180,25 @@ class NominatimEnvironment:
         """
         self.write_nominatim_config(self.api_test_db)

-        if self.api_db_done:
-            return
-
-        self.api_db_done = True
-
-        if self._reuse_or_drop_db(self.api_test_db):
-            return
-
-        testdata = Path('__file__') / '..' / '..' / 'testdb'
-        self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
-
-        try:
-            self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-            self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
-            self.run_nominatim('freeze')
-
-            phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
-            run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
-        except:
-            self.db_drop_database(self.api_test_db)
-            raise
+        if not self.api_db_done:
+            self.api_db_done = True
+
+            if not self._reuse_or_drop_db(self.api_test_db):
+                testdata = Path('__file__') / '..' / '..' / 'testdb'
+                self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
+
+                try:
+                    self.run_nominatim('import', '--osm-file', str(self.api_test_file))
+                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+                    self.run_nominatim('freeze')
+
+                    phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                    run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+                except:
+                    self.db_drop_database(self.api_test_db)
+                    raise
+
+        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)


     def setup_unknown_db(self):

View File

@@ -16,7 +16,7 @@ class DummyTokenizer:
         self.analyser_cache = {}

-    def init_new_db(self, config):
+    def init_new_db(self, *args, **kwargs):
         assert self.init_state == None
         self.init_state = "new"

View File

@@ -36,6 +36,7 @@ def test_config(def_config, tmp_path):
 @pytest.fixture
 def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
+    (tmp_path / 'tokenizer').mkdir()

     def _maker():
         return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')

View File

@@ -26,6 +26,7 @@ def test_script(envdir):
 def run_website_script(envdir, config):
     config.lib_dir.php = envdir / 'php'
+    config.project_dir = envdir

     refresh.setup_website(envdir, config)

     proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',