Nominatim/nominatim/clicmd/setup.py

"""
Implementation of the 'import' subcommand.
"""
import logging
from pathlib import Path

import psutil

from nominatim.db.connection import connect
from nominatim.db import status, properties
from nominatim.version import NOMINATIM_VERSION
from nominatim.errors import UsageError

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415

LOG = logging.getLogger()

class SetupAll:
    """\
    Create a new Nominatim database from an OSM file.
    """

    @staticmethod
    def add_args(parser):
        group_name = parser.add_argument_group('Required arguments')
        group = group_name.add_mutually_exclusive_group(required=True)
        group.add_argument('--osm-file', metavar='FILE',
                           help='OSM file to be imported.')
        group.add_argument('--continue', dest='continue_at',
                           choices=['load-data', 'indexing', 'db-postprocess'],
                           help='Continue an import that was interrupted')
        group = parser.add_argument_group('Optional arguments')
        group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                           help='Size of cache to be used by osm2pgsql (in MB)')
        group.add_argument('--reverse-only', action='store_true',
                           help='Do not create tables and indexes for searching')
        group.add_argument('--no-partitions', action='store_true',
                           help=("Do not partition search indices "
                                 "(speeds up import of single country extracts)"))
        group.add_argument('--no-updates', action='store_true',
                           help="Do not keep tables that are only needed for "
                                "updating the database later")
        group = parser.add_argument_group('Expert options')
        group.add_argument('--ignore-errors', action='store_true',
                           help='Continue import even when errors in SQL are present')
        group.add_argument('--index-noanalyse', action='store_true',
                           help='Do not perform analyse operations during index')


    @staticmethod
    def run(args): # pylint: disable=too-many-statements
        from ..tools import database_import
        from ..tools import refresh
        from ..indexer.indexer import Indexer
        from ..tools import postcodes
        from ..tokenizer import factory as tokenizer_factory

        if args.osm_file and not Path(args.osm_file).is_file():
            LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
            raise UsageError('Cannot access file.')

        if args.continue_at is None:
            database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
                                                    args.data_dir,
                                                    args.no_partitions,
                                                    rouser=args.config.DATABASE_WEBUSER)

            LOG.warning('Importing OSM data file')
            database_import.import_osm_data(Path(args.osm_file),
                                            args.osm2pgsql_options(0, 1),
                                            drop=args.no_updates,
                                            ignore_errors=args.ignore_errors)

            with connect(args.config.get_libpq_dsn()) as conn:
                LOG.warning('Create functions (1st pass)')
                refresh.create_functions(conn, args.config, False, False)
                LOG.warning('Create tables')
                database_import.create_tables(conn, args.config,
                                              reverse_only=args.reverse_only)
                refresh.load_address_levels_from_file(conn, Path(args.config.ADDRESS_LEVEL_CONFIG))
                LOG.warning('Create functions (2nd pass)')
                refresh.create_functions(conn, args.config, False, False)
                LOG.warning('Create table triggers')
                database_import.create_table_triggers(conn, args.config)
                LOG.warning('Create partition tables')
                database_import.create_partition_tables(conn, args.config)
                LOG.warning('Create functions (3rd pass)')
                refresh.create_functions(conn, args.config, False, False)

            LOG.warning('Importing wikipedia importance data')
            data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
            if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                 data_path) > 0:
                LOG.error('Wikipedia importance dump file not found. '
                          'Will be using default importances.')

        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Initialise tables')
            with connect(args.config.get_libpq_dsn()) as conn:
                database_import.truncate_data_tables(conn)

            LOG.warning('Load data into placex table')
            database_import.load_data(args.config.get_libpq_dsn(),
                                      args.threads or psutil.cpu_count() or 1)

        LOG.warning("Setting up tokenizer")
        if args.continue_at is None or args.continue_at == 'load-data':
            # (re)initialise the tokenizer data
            tokenizer = tokenizer_factory.create_tokenizer(args.config)
        else:
            # just load the tokenizer
            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)

        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Calculate postcodes')
            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)

        if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
            if args.continue_at is not None and args.continue_at != 'load-data':
                with connect(args.config.get_libpq_dsn()) as conn:
                    SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
            LOG.warning('Indexing places')
            indexer = Indexer(args.config.get_libpq_dsn(),
                              args.threads or psutil.cpu_count() or 1)
            indexer.index_full(analyse=not args.index_noanalyse)

        LOG.warning('Post-process tables')
        with connect(args.config.get_libpq_dsn()) as conn:
            database_import.create_search_indices(conn, args.config,
                                                  drop=args.no_updates)
            LOG.warning('Create search index for default country names.')
            database_import.create_country_names(conn, args.config)

        webdir = args.project_dir / 'website'
        LOG.warning('Setup website at %s', webdir)
        refresh.setup_website(webdir, args.config)

        with connect(args.config.get_libpq_dsn()) as conn:
            try:
                dbdate = status.compute_database_date(conn)
                status.set_status(conn, dbdate)
                LOG.info('Database is at %s.', dbdate)
            except Exception as exc: # pylint: disable=broad-except
                LOG.error('Cannot determine date of database: %s', exc)

            properties.set_property(conn, 'database_version',
                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))

        return 0


    @staticmethod
    def _create_pending_index(conn, tablespace):
        """ Add a supporting index for finding places still to be indexed.

            This index is normally created at the end of the import process
            for later updates. When indexing was partially done, then this
            index can greatly improve speed going through already indexed data.
        """
        if conn.index_exists('idx_placex_pendingsector'):
            return

        with conn.cursor() as cur:
            LOG.warning('Creating support index')
            if tablespace:
                tablespace = 'TABLESPACE ' + tablespace
            cur.execute("""CREATE INDEX idx_placex_pendingsector
                           ON placex USING BTREE (rank_address,geometry_sector)
                           {} WHERE indexed_status > 0
                        """.format(tablespace))
        conn.commit()
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`"""`
			`Implementation of the 'import' subcommand.`
			`"""`
			`import logging`
			`from pathlib import Path`

			`import psutil`

use absolute imports in Python code Relative imports are no longer officially recommended. 2021-04-16 15:20:09 +03:00			`from nominatim.db.connection import connect`
			`from nominatim.db import status, properties`
			`from nominatim.version import NOMINATIM_VERSION`
			`from nominatim.errors import UsageError`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`# Do not repeat documentation of subcommand classes.`
			`# pylint: disable=C0111`
			`# Using non-top-level imports to avoid eventually unused imports.`
			`# pylint: disable=E0012,C0415`

			`LOG = logging.getLogger()`

			`class SetupAll:`
			`"""\`
			`Create a new Nominatim database from an OSM file.`
			`"""`

			`@staticmethod`
			`def add_args(parser):`
			`group_name = parser.add_argument_group('Required arguments')`
			`group = group_name.add_mutually_exclusive_group(required=True)`
			`group.add_argument('--osm-file', metavar='FILE',`
			`help='OSM file to be imported.')`
			`group.add_argument('--continue', dest='continue_at',`
			`choices=['load-data', 'indexing', 'db-postprocess'],`
			`help='Continue an import that was interrupted')`
			`group = parser.add_argument_group('Optional arguments')`
			`group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,`
			`help='Size of cache to be used by osm2pgsql (in MB)')`
			`group.add_argument('--reverse-only', action='store_true',`
			`help='Do not create tables and indexes for searching')`
			`group.add_argument('--no-partitions', action='store_true',`
Rebase with master 2021-03-29 11:30:45 +03:00			`help=("Do not partition search indices "`
			`"(speeds up import of single country extracts)"))`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`group.add_argument('--no-updates', action='store_true',`
Rebase with master 2021-03-29 11:30:45 +03:00			`help="Do not keep tables that are only needed for "`
			`"updating the database later")`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`group = parser.add_argument_group('Expert options')`
			`group.add_argument('--ignore-errors', action='store_true',`
			`help='Continue import even when errors in SQL are present')`
			`group.add_argument('--index-noanalyse', action='store_true',`
			`help='Do not perform analyse operations during index')`


			`@staticmethod`
			`def run(args): # pylint: disable=too-many-statements`
			`from ..tools import database_import`
			`from ..tools import refresh`
			`from ..indexer.indexer import Indexer`
port function to compute initial postcodes to Python 2021-04-16 16:05:40 +03:00			`from ..tools import postcodes`
introduce tokenizer modules This adds the boilerplate for selecting configurable tokenizers. A tokenizer can be chosen at import time and will then install itself such that it is fixed for the given database import even when the software itself is updated. The legacy tokenizer implements Nominatim's traditional algorithms. 2021-04-21 10:57:17 +03:00			`from ..tokenizer import factory as tokenizer_factory`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`if args.osm_file and not Path(args.osm_file).is_file():`
			`LOG.fatal("OSM file '%s' does not exist.", args.osm_file)`
			`raise UsageError('Cannot access file.')`

			`if args.continue_at is None:`
			`database_import.setup_database_skeleton(args.config.get_libpq_dsn(),`
			`args.data_dir,`
			`args.no_partitions,`
			`rouser=args.config.DATABASE_WEBUSER)`

			`LOG.warning('Importing OSM data file')`
			`database_import.import_osm_data(Path(args.osm_file),`
			`args.osm2pgsql_options(0, 1),`
bdd: use python library where possible Replace calls to PHP scripts with direct calls into the nominatim Python library where possible. This speed up tests quite a bit. 2021-02-26 18:14:29 +03:00			`drop=args.no_updates,`
			`ignore_errors=args.ignore_errors)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`with connect(args.config.get_libpq_dsn()) as conn:`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create functions (1st pass)')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`refresh.create_functions(conn, args.config, False, False)`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create tables')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`database_import.create_tables(conn, args.config,`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`reverse_only=args.reverse_only)`
			`refresh.load_address_levels_from_file(conn, Path(args.config.ADDRESS_LEVEL_CONFIG))`
			`LOG.warning('Create functions (2nd pass)')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`refresh.create_functions(conn, args.config, False, False)`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create table triggers')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`database_import.create_table_triggers(conn, args.config)`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create partition tables')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`database_import.create_partition_tables(conn, args.config)`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create functions (3rd pass)')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`refresh.create_functions(conn, args.config, False, False)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`LOG.warning('Importing wikipedia importance data')`
			`data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)`
			`if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),`
			`data_path) > 0:`
			`LOG.error('Wikipedia importance dump file not found. '`
			`'Will be using default importances.')`

add support index when continuing import at index phase Indexing scans the placex table sequentially during indexing on the initial import. That is okay because we know that all rows need to be processed anywhere. When continuing the import, however, a large part might already be indexed, so that the process spends a lot of time going through rows that are no longer of interest. Create a supporting index for all unindexed rows to speed up the scan. This is the same index as used later for updates. 2021-04-17 12:07:04 +03:00			`if args.continue_at is None or args.continue_at == 'load-data':`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`LOG.warning('Initialise tables')`
			`with connect(args.config.get_libpq_dsn()) as conn:`
move word table and normalisation SQL into tokenizer Creating and populating the word table is now the responsibility of the tokenizer. The get_maxwordfreq() function has been replaced with a simple template parameter to the SQL during function installation. The number is taken from the parameter list in the database to ensure that it is not changed after installation. 2021-04-22 23:47:34 +03:00			`database_import.truncate_data_tables(conn)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`LOG.warning('Load data into placex table')`
			`database_import.load_data(args.config.get_libpq_dsn(),`
			`args.threads or psutil.cpu_count() or 1)`

introduce tokenizer modules This adds the boilerplate for selecting configurable tokenizers. A tokenizer can be chosen at import time and will then install itself such that it is fixed for the given database import even when the software itself is updated. The legacy tokenizer implements Nominatim's traditional algorithms. 2021-04-21 10:57:17 +03:00			`LOG.warning("Setting up tokenizer")`
move word table and normalisation SQL into tokenizer Creating and populating the word table is now the responsibility of the tokenizer. The get_maxwordfreq() function has been replaced with a simple template parameter to the SQL during function installation. The number is taken from the parameter list in the database to ensure that it is not changed after installation. 2021-04-22 23:47:34 +03:00			`if args.continue_at is None or args.continue_at == 'load-data':`
			`# (re)initialise the tokenizer data`
			`tokenizer = tokenizer_factory.create_tokenizer(args.config)`
			`else:`
			`# just load the tokenizer`
			`tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)`
introduce tokenizer modules This adds the boilerplate for selecting configurable tokenizers. A tokenizer can be chosen at import time and will then install itself such that it is fixed for the given database import even when the software itself is updated. The legacy tokenizer implements Nominatim's traditional algorithms. 2021-04-21 10:57:17 +03:00
			`if args.continue_at is None or args.continue_at == 'load-data':`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`LOG.warning('Calculate postcodes')`
port function to compute initial postcodes to Python 2021-04-16 16:05:40 +03:00			`postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):`
add support index when continuing import at index phase Indexing scans the placex table sequentially during indexing on the initial import. That is okay because we know that all rows need to be processed anywhere. When continuing the import, however, a large part might already be indexed, so that the process spends a lot of time going through rows that are no longer of interest. Create a supporting index for all unindexed rows to speed up the scan. This is the same index as used later for updates. 2021-04-17 12:07:04 +03:00			`if args.continue_at is not None and args.continue_at != 'load-data':`
			`with connect(args.config.get_libpq_dsn()) as conn:`
			`SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`LOG.warning('Indexing places')`
			`indexer = Indexer(args.config.get_libpq_dsn(),`
			`args.threads or psutil.cpu_count() or 1)`
			`indexer.index_full(analyse=not args.index_noanalyse)`

			`LOG.warning('Post-process tables')`
port index creation to python Also switches to jinja-based preprocessing, which allows to simplify the SQL files. Use 'if not exists' where possible so that the step can be rerun to fix missing indexes. 2021-03-04 12:55:24 +03:00			`with connect(args.config.get_libpq_dsn()) as conn:`
			`database_import.create_search_indices(conn, args.config,`
			`drop=args.no_updates)`
Ported createCountryNames() to python and added tests 2021-03-12 07:58:41 +03:00			`LOG.warning('Create search index for default country names.')`
			`database_import.create_country_names(conn, args.config)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`webdir = args.project_dir / 'website'`
			`LOG.warning('Setup website at %s', webdir)`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`refresh.setup_website(webdir, args.config)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`with connect(args.config.get_libpq_dsn()) as conn:`
			`try:`
			`dbdate = status.compute_database_date(conn)`
			`status.set_status(conn, dbdate)`
			`LOG.info('Database is at %s.', dbdate)`
			`except Exception as exc: # pylint: disable=broad-except`
			`LOG.error('Cannot determine date of database: %s', exc)`

save software version in the database The version represents the software version that was used to import the data. 2021-03-01 22:35:15 +03:00			`properties.set_property(conn, 'database_version',`
			`'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))`

move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`return 0`
add support index when continuing import at index phase Indexing scans the placex table sequentially during indexing on the initial import. That is okay because we know that all rows need to be processed anywhere. When continuing the import, however, a large part might already be indexed, so that the process spends a lot of time going through rows that are no longer of interest. Create a supporting index for all unindexed rows to speed up the scan. This is the same index as used later for updates. 2021-04-17 12:07:04 +03:00

			`@staticmethod`
			`def _create_pending_index(conn, tablespace):`
			`""" Add a supporting index for finding places still to be indexed.`

			`This index is normally created at the end of the import process`
			`for later updates. When indexing was partially done, then this`
			`index can greatly improve speed going through already indexed data.`
			`"""`
			`if conn.index_exists('idx_placex_pendingsector'):`
			`return`

			`with conn.cursor() as cur:`
			`LOG.warning('Creating support index')`
			`if tablespace:`
			`tablespace = 'TABLESPACE ' + tablespace`
			`cur.execute("""CREATE INDEX idx_placex_pendingsector`
			`ON placex USING BTREE (rank_address,geometry_sector)`
			`{} WHERE indexed_status > 0`
			`""".format(tablespace))`
			`conn.commit()`