move setup function to python

There are still back-calls to PHP for some of the sub-steps.
These need some larger refactoring before they can be moved to Python.
This commit is contained in:
Sarah Hoffmann 2021-02-26 15:02:39 +01:00
parent 3ee8d9fa75
commit 15b5906790
10 changed files with 342 additions and 102 deletions

View File

@ -111,72 +111,6 @@ class CommandlineParser:
# pylint: disable=E0012,C0415
class SetupAll:
    """\
    Create a new Nominatim database from an OSM file.
    """
    # Legacy implementation: assembles a parameter list and delegates the
    # entire import to the PHP setup script via run_legacy_script().

    @staticmethod
    def add_args(parser):
        """ Set up the command-line arguments of the 'import' subcommand. """
        group_name = parser.add_argument_group('Required arguments')
        # Exactly one of --osm-file / --continue must be given.
        group = group_name.add_mutually_exclusive_group(required=True)
        group.add_argument('--osm-file',
                           help='OSM file to be imported.')
        group.add_argument('--continue', dest='continue_at',
                           choices=['load-data', 'indexing', 'db-postprocess'],
                           help='Continue an import that was interrupted')
        group = parser.add_argument_group('Optional arguments')
        group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                           help='Size of cache to be used by osm2pgsql (in MB)')
        group.add_argument('--reverse-only', action='store_true',
                           help='Do not create tables and indexes for searching')
        group.add_argument('--enable-debug-statements', action='store_true',
                           help='Include debug warning statements in SQL code')
        group.add_argument('--no-partitions', action='store_true',
                           help="""Do not partition search indices
                                   (speeds up import of single country extracts)""")
        group.add_argument('--no-updates', action='store_true',
                           help="""Do not keep tables that are only needed for
                                   updating the database later""")
        group = parser.add_argument_group('Expert options')
        group.add_argument('--ignore-errors', action='store_true',
                           help='Continue import even when errors in SQL are present')
        group.add_argument('--index-noanalyse', action='store_true',
                           help='Do not perform analyse operations during index')

    @staticmethod
    def run(args):
        """ Translate the parsed arguments into setup.php parameters and
            hand off to the legacy script. Returns its exit status.
        """
        params = ['setup.php']
        if args.osm_file:
            # Fresh import: run all steps against the given OSM file.
            params.extend(('--all', '--osm-file', args.osm_file))
        else:
            # Resuming: replay only the steps from the interruption point on.
            if args.continue_at == 'load-data':
                params.append('--load-data')
            if args.continue_at in ('load-data', 'indexing'):
                params.append('--index')
            params.extend(('--create-search-indices', '--create-country-names',
                           '--setup-website'))
        if args.osm2pgsql_cache:
            # NOTE(review): value is an int; presumably run_legacy_script
            # stringifies its arguments — confirm.
            params.extend(('--osm2pgsql-cache', args.osm2pgsql_cache))
        if args.reverse_only:
            params.append('--reverse-only')
        if args.enable_debug_statements:
            params.append('--enable-debug-statements')
        if args.no_partitions:
            params.append('--no-partitions')
        if args.no_updates:
            # --no-updates maps to the PHP script's --drop flag.
            params.append('--drop')
        if args.ignore_errors:
            params.append('--ignore-errors')
        if args.index_noanalyse:
            params.append('--index-noanalyse')
        if args.threads:
            params.extend(('--threads', args.threads))

        return run_legacy_script(*params, nominatim_env=args)
class SetupSpecialPhrases:
"""\
Maintain special phrases.
@ -334,7 +268,7 @@ def nominatim(**kwargs):
"""
parser = CommandlineParser('nominatim', nominatim.__doc__)
parser.add_subcommand('import', SetupAll)
parser.add_subcommand('import', clicmd.SetupAll)
parser.add_subcommand('freeze', clicmd.SetupFreeze)
parser.add_subcommand('replication', clicmd.UpdateReplication)

View File

@ -2,6 +2,7 @@
Subcommand definitions for the command-line tool.
"""
from .setup import SetupAll
from .replication import UpdateReplication
from .api import APISearch, APIReverse, APILookup, APIDetails, APIStatus
from .index import UpdateIndex

140
nominatim/clicmd/setup.py Normal file
View File

@ -0,0 +1,140 @@
"""
Implementation of the 'import' subcommand.
"""
import logging
from pathlib import Path
import psutil
from ..tools.exec_utils import run_legacy_script
from ..db.connection import connect
from ..db import status
from ..errors import UsageError
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
class SetupAll:
    """\
    Create a new Nominatim database from an OSM file.
    """
    # Python port of the former PHP-driven import. Some sub-steps still
    # shell out to setup.php via run_legacy_script().

    @staticmethod
    def add_args(parser):
        """ Set up the command-line arguments of the 'import' subcommand. """
        group_name = parser.add_argument_group('Required arguments')
        # Exactly one of --osm-file / --continue must be given.
        group = group_name.add_mutually_exclusive_group(required=True)
        group.add_argument('--osm-file', metavar='FILE',
                           help='OSM file to be imported.')
        group.add_argument('--continue', dest='continue_at',
                           choices=['load-data', 'indexing', 'db-postprocess'],
                           help='Continue an import that was interrupted')
        group = parser.add_argument_group('Optional arguments')
        group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                           help='Size of cache to be used by osm2pgsql (in MB)')
        group.add_argument('--reverse-only', action='store_true',
                           help='Do not create tables and indexes for searching')
        group.add_argument('--no-partitions', action='store_true',
                           help="""Do not partition search indices
                                   (speeds up import of single country extracts)""")
        group.add_argument('--no-updates', action='store_true',
                           help="""Do not keep tables that are only needed for
                                   updating the database later""")
        group = parser.add_argument_group('Expert options')
        group.add_argument('--ignore-errors', action='store_true',
                           help='Continue import even when errors in SQL are present')
        group.add_argument('--index-noanalyse', action='store_true',
                           help='Do not perform analyse operations during index')

    @staticmethod
    def run(args): # pylint: disable=too-many-statements
        """ Run the full import pipeline, optionally resuming at the
            point given by --continue. Returns 0 on success.
        """
        # Imported lazily to keep CLI start-up cheap (see pylint C0415 note
        # at module level).
        from ..tools import database_import
        from ..tools import refresh
        from ..indexer.indexer import Indexer

        if args.osm_file and not Path(args.osm_file).is_file():
            LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
            raise UsageError('Cannot access file.')

        # Full setup phase: only executed when starting from scratch.
        if args.continue_at is None:
            database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
                                                    args.data_dir,
                                                    args.no_partitions,
                                                    rouser=args.config.DATABASE_WEBUSER)

            LOG.warning('Installing database module')
            with connect(args.config.get_libpq_dsn()) as conn:
                database_import.install_module(args.module_dir, args.project_dir,
                                               args.config.DATABASE_MODULE_PATH,
                                               conn=conn)

            LOG.warning('Importing OSM data file')
            database_import.import_osm_data(Path(args.osm_file),
                                            args.osm2pgsql_options(0, 1),
                                            drop=args.no_updates)

            LOG.warning('Create functions (1st pass)')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.create_functions(conn, args.config, args.sqllib_dir,
                                         False, False)

            # Table creation is still delegated to the PHP script.
            LOG.warning('Create tables')
            params = ['setup.php', '--create-tables', '--create-partition-tables']
            if args.reverse_only:
                params.append('--reverse-only')
            run_legacy_script(*params, nominatim_env=args)

            # Second pass picks up functions that depend on the tables.
            LOG.warning('Create functions (2nd pass)')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.create_functions(conn, args.config, args.sqllib_dir,
                                         False, False)

            LOG.warning('Importing wikipedia importance data')
            data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
            # A non-zero return indicates the dump was missing; the import
            # continues with built-in default importances.
            if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                 data_path) > 0:
                LOG.error('Wikipedia importance dump file not found. '
                          'Will be using default importances.')

            LOG.warning('Initialise tables')
            with connect(args.config.get_libpq_dsn()) as conn:
                database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)

        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Load data into placex table')
            database_import.load_data(args.config.get_libpq_dsn(),
                                      args.data_dir,
                                      args.threads or psutil.cpu_count() or 1)

            LOG.warning('Calculate postcodes')
            run_legacy_script('setup.php', '--calculate-postcodes', nominatim_env=args)

        if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
            LOG.warning('Indexing places')
            indexer = Indexer(args.config.get_libpq_dsn(),
                              args.threads or psutil.cpu_count() or 1)
            indexer.index_full(analyse=not args.index_noanalyse)

        LOG.warning('Post-process tables')
        params = ['setup.php', '--create-search-indices', '--create-country-names']
        if args.no_updates:
            params.append('--drop')
        run_legacy_script(*params, nominatim_env=args)

        webdir = args.project_dir / 'website'
        LOG.warning('Setup website at %s', webdir)
        refresh.setup_website(webdir, args.phplib_dir, args.config)

        # Record the data timestamp; failure here is logged but does not
        # abort the import.
        with connect(args.config.get_libpq_dsn()) as conn:
            try:
                dbdate = status.compute_database_date(conn)
                status.set_status(conn, dbdate)
                LOG.info('Database is at %s.', dbdate)
            except Exception as exc: # pylint: disable=broad-except
                LOG.error('Cannot determine date of database: %s', exc)

        return 0

View File

@ -59,12 +59,12 @@ class AdminTransition:
if args.setup_db:
LOG.warning('Setup DB')
mpath = database_import.install_module(args.module_dir, args.project_dir,
args.config.DATABASE_MODULE_PATH)
with connect(args.config.get_libpq_dsn()) as conn:
database_import.setup_extensions(conn)
database_import.check_module_dir_path(conn, mpath)
database_import.install_module(args.module_dir, args.project_dir,
args.config.DATABASE_MODULE_PATH,
conn=conn)
database_import.import_base_data(args.config.get_libpq_dsn(),
args.data_dir, args.no_partitions)
@ -88,7 +88,7 @@ class AdminTransition:
with connect(args.config.get_libpq_dsn()) as conn:
try:
status.set_status(conn, status.compute_database_date(conn))
except Exception as exc: # pylint: disable=bare-except
except Exception as exc: # pylint: disable=broad-except
LOG.error('Cannot determine date of database: %s', exc)
if args.index:

View File

@ -119,6 +119,13 @@ class PostcodeRunner:
WHERE place_id IN ({})
""".format(','.join((str(i) for i in ids)))
def _analyse_db_if(conn, condition):
if condition:
with conn.cursor() as cur:
cur.execute('ANALYSE')
class Indexer:
""" Main indexing routine.
"""
@ -142,7 +149,7 @@ class Indexer:
for thread in self.threads:
thread.close()
threads = []
self.threads = []
def index_full(self, analyse=True):
@ -155,26 +162,22 @@ class Indexer:
try:
self.index_by_rank(0, 4)
self._analyse_db_if(conn, analyse)
_analyse_db_if(conn, analyse)
self.index_boundaries(0, 30)
self._analyse_db_if(conn, analyse)
_analyse_db_if(conn, analyse)
self.index_by_rank(5, 25)
self._analyse_db_if(conn, analyse)
_analyse_db_if(conn, analyse)
self.index_by_rank(26, 30)
self._analyse_db_if(conn, analyse)
_analyse_db_if(conn, analyse)
self.index_postcodes()
self._analyse_db_if(conn, analyse)
_analyse_db_if(conn, analyse)
finally:
conn.close()
def _analyse_db_if(self, conn, condition):
if condition:
with conn.cursor() as cur:
cur.execute('ANALYSE')
def index_boundaries(self, minrank, maxrank):
""" Index only administrative boundaries within the given rank range.

View File

@ -9,6 +9,7 @@ import shutil
from pathlib import Path
import psutil
import psycopg2
from ..db.connection import connect, get_pg_env
from ..db import utils as db_utils
@ -19,6 +20,21 @@ from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION
LOG = logging.getLogger()
def setup_database_skeleton(dsn, data_dir, no_partitions, rouser=None):
    """ Create a new database for Nominatim and populate it with the
        essential extensions and data.

        dsn           -- libpq connection string for the database to create
        data_dir      -- directory with the base data (forwarded to
                         import_base_data())
        no_partitions -- when True, import without search-index partitions
        rouser        -- optional database user name, forwarded to
                         create_db() (presumably granted read-only access
                         there — confirm in create_db)
    """
    LOG.warning('Creating database')
    create_db(dsn, rouser)

    LOG.warning('Setting up database')
    with connect(dsn) as conn:
        setup_extensions(conn)

    LOG.warning('Loading basic data')
    import_base_data(dsn, data_dir, no_partitions)
def create_db(dsn, rouser=None):
""" Create a new database for the given DSN. Fails when the database
already exists or the PostgreSQL version is too old.
@ -72,7 +88,7 @@ def setup_extensions(conn):
raise UsageError('PostGIS version is too old.')
def install_module(src_dir, project_dir, module_dir):
def install_module(src_dir, project_dir, module_dir, conn=None):
""" Copy the normalization module from src_dir into the project
directory under the '/module' directory. If 'module_dir' is set, then
use the module from there instead and check that it is accessible
@ -80,6 +96,9 @@ def install_module(src_dir, project_dir, module_dir):
The function detects when the installation is run from the
build directory. It doesn't touch the module in that case.
If 'conn' is given, then the function also tests if the module
can be access via the given database.
"""
if not module_dir:
module_dir = project_dir / 'module'
@ -99,19 +118,17 @@ def install_module(src_dir, project_dir, module_dir):
else:
LOG.info("Using custom path for database module at '%s'", module_dir)
return module_dir
def check_module_dir_path(conn, path):
""" Check that the normalisation module can be found and executed
from the given path.
"""
with conn.cursor() as cur:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS '{}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""".format(path))
if conn is not None:
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS '{}/nominatim.so', 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""".format(module_dir))
except psycopg2.DatabaseError as err:
LOG.fatal("Error accessing database module: %s", err)
raise UsageError("Database module cannot be accessed.") from err
def import_base_data(dsn, sql_dir, ignore_partitions=False):
@ -174,7 +191,7 @@ def truncate_data_tables(conn, max_word_frequency=None):
cur.execute('TRUNCATE location_property_osmline')
cur.execute('TRUNCATE location_postcode')
cur.execute('TRUNCATE search_name')
cur.execute('DROP SEQUENCE seq_place')
cur.execute('DROP SEQUENCE IF EXISTS seq_place')
cur.execute('CREATE SEQUENCE seq_place start 100000')
cur.execute("""SELECT tablename FROM pg_tables

View File

@ -43,6 +43,11 @@ class _TestingCursor(psycopg2.extras.DictCursor):
WHERE tablename = %s""", (table, ))
return num == 1
    def table_rows(self, table):
        """ Return the number of rows in the given table.

            NOTE(review): `table` is concatenated straight into the SQL;
            acceptable for fixed table names in tests, not for untrusted
            input.
        """
        return self.scalar('SELECT count(*) FROM ' + table)
@pytest.fixture
def temp_db(monkeypatch):
@ -109,8 +114,12 @@ def temp_db_cursor(temp_db):
@pytest.fixture
def table_factory(temp_db_cursor):
def mk_table(name, definition='id INT'):
def mk_table(name, definition='id INT', content=None):
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
if content is not None:
if not isinstance(content, str):
content = '),('.join([str(x) for x in content])
temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
return mk_table
@ -174,7 +183,7 @@ def place_row(place_table, temp_db_cursor):
temp_db_cursor.execute("INSERT INTO place VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
(osm_id or next(idseq), osm_type, cls, typ, names,
admin_level, address, extratags,
geom or 'SRID=4326;POINT(0 0 )'))
geom or 'SRID=4326;POINT(0 0)'))
return _insert
@ -184,7 +193,7 @@ def placex_table(temp_db_with_extensions, temp_db_conn):
"""
with temp_db_conn.cursor() as cur:
cur.execute("""CREATE TABLE placex (
place_id BIGINT NOT NULL,
place_id BIGINT,
parent_place_id BIGINT,
linked_place_id BIGINT,
importance FLOAT,
@ -207,8 +216,43 @@ def placex_table(temp_db_with_extensions, temp_db_conn):
country_code varchar(2),
housenumber TEXT,
postcode TEXT,
centroid GEOMETRY(Geometry, 4326))
""")
centroid GEOMETRY(Geometry, 4326))""")
temp_db_conn.commit()
@pytest.fixture
def osmline_table(temp_db_with_extensions, temp_db_conn):
    """ Create an empty stand-in for the location_property_osmline table
        in the temporary test database.
    """
    with temp_db_conn.cursor() as cur:
        cur.execute("""CREATE TABLE location_property_osmline (
                           place_id BIGINT,
                           osm_id BIGINT,
                           parent_place_id BIGINT,
                           geometry_sector INTEGER,
                           indexed_date TIMESTAMP,
                           startnumber INTEGER,
                           endnumber INTEGER,
                           partition SMALLINT,
                           indexed_status SMALLINT,
                           linegeo GEOMETRY,
                           interpolationtype TEXT,
                           address HSTORE,
                           postcode TEXT,
                           country_code VARCHAR(2))""")
    temp_db_conn.commit()
@pytest.fixture
def word_table(temp_db, temp_db_conn):
    """ Create an empty stand-in for the word (search term) table in the
        temporary test database.
    """
    with temp_db_conn.cursor() as cur:
        cur.execute("""CREATE TABLE word (
                           word_id INTEGER,
                           word_token text,
                           word text,
                           class text,
                           type text,
                           country_code varchar(2),
                           search_name_count INTEGER,
                           operator TEXT)""")
    temp_db_conn.commit()

View File

@ -13,9 +13,11 @@ import nominatim.cli
import nominatim.clicmd.api
import nominatim.clicmd.refresh
import nominatim.clicmd.admin
import nominatim.clicmd.setup
import nominatim.indexer.indexer
import nominatim.tools.admin
import nominatim.tools.check_database
import nominatim.tools.database_import
import nominatim.tools.freeze
import nominatim.tools.refresh
@ -61,7 +63,6 @@ def test_cli_help(capsys):
@pytest.mark.parametrize("command,script", [
(('import', '--continue', 'load-data'), 'setup'),
(('special-phrases',), 'specialphrases'),
(('add-data', '--tiger-data', 'tiger'), 'setup'),
(('add-data', '--file', 'foo.osm'), 'update'),
@ -74,6 +75,36 @@ def test_legacy_commands_simple(mock_run_legacy, command, script):
assert mock_run_legacy.last_args[0] == script + '.php'
def test_import_missing_file(temp_db):
    """ The import must fail when the given OSM file does not exist. """
    retval = call_nominatim('import', '--osm-file', 'sfsafegweweggdgw.reh.erh')
    assert retval == 1
def test_import_bad_file(temp_db):
    """ The import must fail when the given path is not a regular file. """
    retval = call_nominatim('import', '--osm-file', '.')
    assert retval == 1
def test_import_full(temp_db, mock_func_factory):
    """ A full import must call each major import step exactly once and
        create the SQL functions more than once.
    """
    single_call_mocks = [
        mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'),
        mock_func_factory(nominatim.tools.database_import, 'install_module'),
        mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
        mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
        mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
        mock_func_factory(nominatim.tools.database_import, 'load_data'),
        mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
        mock_func_factory(nominatim.tools.refresh, 'setup_website'),
    ]
    create_funcs_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
    # The remaining PHP back-calls are stubbed out wholesale.
    mock_func_factory(nominatim.clicmd.setup, 'run_legacy_script')

    assert call_nominatim('import', '--osm-file', __file__) == 0

    assert create_funcs_mock.called > 1
    for single_mock in single_call_mocks:
        assert single_mock.called == 1
def test_freeze_command(mock_func_factory, temp_db):
mock_drop = mock_func_factory(nominatim.tools.freeze, 'drop_update_tables')
mock_flatnode = mock_func_factory(nominatim.tools.freeze, 'drop_flatnode_file')

View File

@ -63,6 +63,10 @@ def test_check_database_indexes_bad(temp_db_conn, def_config):
assert chkdb.check_database_indexes(temp_db_conn, def_config) == chkdb.CheckState.FAIL
def test_check_database_indexes_valid(temp_db_conn, def_config):
    """ The index validity check must pass on a healthy database. """
    state = chkdb.check_database_index_valid(temp_db_conn, def_config)
    assert state == chkdb.CheckState.OK
def test_check_tiger_table_disabled(temp_db_conn, def_config, monkeypatch):
    """ The Tiger table check is skipped when Tiger data is disabled. """
    monkeypatch.setenv('NOMINATIM_USE_US_TIGER_DATA', 'no')
    state = chkdb.check_tiger_table(temp_db_conn, def_config)
    assert state == chkdb.CheckState.NOT_APPLICABLE

View File

@ -24,6 +24,24 @@ def nonexistant_db():
with conn.cursor() as cur:
cur.execute('DROP DATABASE IF EXISTS {}'.format(dbname))
@pytest.mark.parametrize("no_partitions", (True, False))
def test_setup_skeleton(src_dir, nonexistant_db, no_partitions):
    """ A freshly set-up skeleton database must contain the country_name
        table with the expected partition layout.
    """
    database_import.setup_database_skeleton('dbname=' + nonexistant_db,
                                            src_dir / 'data', no_partitions)

    conn = psycopg2.connect(database=nonexistant_db)
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT distinct partition FROM country_name")
            partitions = {row[0] for row in cur}

        if no_partitions:
            assert partitions == {0}
        else:
            assert len(partitions) > 10
    finally:
        conn.close()
def test_create_db_success(nonexistant_db):
    """ Database creation with a read-only user must succeed. """
    dsn = 'dbname=' + nonexistant_db
    database_import.create_db(dsn, rouser='www-data')
@ -79,6 +97,22 @@ def test_install_module(tmp_path):
assert outfile.stat().st_mode == 33261
def test_install_module_custom(tmp_path):
    """ When a custom module path is given, no 'module' directory must be
        created in the project directory.
    """
    (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')

    custom_path = str(tmp_path.resolve())
    database_import.install_module(tmp_path, tmp_path, custom_path)

    assert not (tmp_path / 'module').exists()
def test_install_module_fail_access(temp_db_conn, tmp_path):
    """ install_module must raise a UsageError when the database cannot
        load the module.
    """
    (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')

    with pytest.raises(UsageError, match='.*module cannot be accessed.*'):
        database_import.install_module(tmp_path, tmp_path, '',
                                       conn=temp_db_conn)
def test_import_base_data(src_dir, temp_db, temp_db_cursor):
temp_db_cursor.execute('CREATE EXTENSION hstore')
temp_db_cursor.execute('CREATE EXTENSION postgis')
@ -134,3 +168,35 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
osm2pgsql_options['osm2pgsql_cache'] = 0
database_import.import_osm_data(Path(__file__), osm2pgsql_options)
def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
    """ After truncation every data table must be empty. """
    tables = ('word', 'placex', 'place_addressline', 'location_area',
              'location_area_country', 'location_property',
              'location_property_tiger', 'location_property_osmline',
              'location_postcode', 'search_name', 'location_road_23')
    for name in tables:
        table_factory(name, content=(1, 2, 3))

    database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)

    for name in tables:
        assert temp_db_cursor.table_rows(name) == 0
@pytest.mark.parametrize("threads", (1, 5))
def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_table,
temp_db_cursor, threads):
for func in ('make_keywords', 'getorcreate_housenumber_id', 'make_standard_name'):
temp_db_cursor.execute("""CREATE FUNCTION {} (src TEXT)
RETURNS TEXT AS $$ SELECT 'a' $$ LANGUAGE SQL
""".format(func))
for oid in range(100, 130):
place_row(osm_id=oid)
place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
geom='SRID=4326;LINESTRING(0 0, 10 10)')
database_import.load_data(dsn, src_dir / 'data', threads)
assert temp_db_cursor.table_rows('placex') == 30
assert temp_db_cursor.table_rows('location_property_osmline') == 1