Nominatim/nominatim/clicmd/setup.py

"""
Implementation of the 'import' subcommand.
"""
import logging
from pathlib import Path

import psutil

from nominatim.db.connection import connect
from nominatim.db import status, properties
from nominatim.version import NOMINATIM_VERSION
from nominatim.errors import UsageError

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=E0012,C0415

LOG = logging.getLogger()

class SetupAll:
    """\
    Create a new Nominatim database from an OSM file.
    """

    @staticmethod
    def add_args(parser):
        group_name = parser.add_argument_group('Required arguments')
        group = group_name.add_mutually_exclusive_group(required=True)
        group.add_argument('--osm-file', metavar='FILE',
                           help='OSM file to be imported.')
        group.add_argument('--continue', dest='continue_at',
                           choices=['load-data', 'indexing', 'db-postprocess'],
                           help='Continue an import that was interrupted')
        group = parser.add_argument_group('Optional arguments')
        group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                           help='Size of cache to be used by osm2pgsql (in MB)')
        group.add_argument('--reverse-only', action='store_true',
                           help='Do not create tables and indexes for searching')
        group.add_argument('--no-partitions', action='store_true',
                           help=("Do not partition search indices "
                                 "(speeds up import of single country extracts)"))
        group.add_argument('--no-updates', action='store_true',
                           help="Do not keep tables that are only needed for "
                                "updating the database later")
        group = parser.add_argument_group('Expert options')
        group.add_argument('--ignore-errors', action='store_true',
                           help='Continue import even when errors in SQL are present')
        group.add_argument('--index-noanalyse', action='store_true',
                           help='Do not perform analyse operations during index')


    @staticmethod
    def run(args): # pylint: disable=too-many-statements
        from ..tools import database_import, refresh, postcodes, freeze
        from ..indexer.indexer import Indexer
        from ..tokenizer import factory as tokenizer_factory

        if args.osm_file and not Path(args.osm_file).is_file():
            LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
            raise UsageError('Cannot access file.')

        if args.continue_at is None:
            database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
                                                    args.data_dir,
                                                    args.no_partitions,
                                                    rouser=args.config.DATABASE_WEBUSER)

            LOG.warning('Importing OSM data file')
            database_import.import_osm_data(Path(args.osm_file),
                                            args.osm2pgsql_options(0, 1),
                                            drop=args.no_updates,
                                            ignore_errors=args.ignore_errors)

            with connect(args.config.get_libpq_dsn()) as conn:
                LOG.warning('Create functions (1st pass)')
                refresh.create_functions(conn, args.config, False, False)
                LOG.warning('Create tables')
                database_import.create_tables(conn, args.config,
                                              reverse_only=args.reverse_only)
                refresh.load_address_levels_from_file(conn, Path(args.config.ADDRESS_LEVEL_CONFIG))
                LOG.warning('Create functions (2nd pass)')
                refresh.create_functions(conn, args.config, False, False)
                LOG.warning('Create table triggers')
                database_import.create_table_triggers(conn, args.config)
                LOG.warning('Create partition tables')
                database_import.create_partition_tables(conn, args.config)
                LOG.warning('Create functions (3rd pass)')
                refresh.create_functions(conn, args.config, False, False)

            LOG.warning('Importing wikipedia importance data')
            data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
            if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                 data_path) > 0:
                LOG.error('Wikipedia importance dump file not found. '
                          'Will be using default importances.')

        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Initialise tables')
            with connect(args.config.get_libpq_dsn()) as conn:
                database_import.truncate_data_tables(conn)

            LOG.warning('Load data into placex table')
            database_import.load_data(args.config.get_libpq_dsn(),
                                      args.threads or psutil.cpu_count() or 1)

        LOG.warning("Setting up tokenizer")
        if args.continue_at is None or args.continue_at == 'load-data':
            # (re)initialise the tokenizer data
            tokenizer = tokenizer_factory.create_tokenizer(args.config)
        else:
            # just load the tokenizer
            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)

        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Calculate postcodes')
            postcodes.update_postcodes(args.config.get_libpq_dsn(),
                                       args.project_dir, tokenizer)

        if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
            if args.continue_at is not None and args.continue_at != 'load-data':
                with connect(args.config.get_libpq_dsn()) as conn:
                    SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
            LOG.warning('Indexing places')
            indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                              args.threads or psutil.cpu_count() or 1)
            indexer.index_full(analyse=not args.index_noanalyse)

        LOG.warning('Post-process tables')
        with connect(args.config.get_libpq_dsn()) as conn:
            database_import.create_search_indices(conn, args.config,
                                                  drop=args.no_updates)
            LOG.warning('Create search index for default country names.')
            database_import.create_country_names(conn, tokenizer,
                                                 args.config.LANGUAGES)
            conn.commit()
            if args.no_updates:
                freeze.drop_update_tables(conn)
        tokenizer.finalize_import(args.config)


        webdir = args.project_dir / 'website'
        LOG.warning('Setup website at %s', webdir)
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.setup_website(webdir, args.config, conn)

        with connect(args.config.get_libpq_dsn()) as conn:
            try:
                dbdate = status.compute_database_date(conn)
                status.set_status(conn, dbdate)
                LOG.info('Database is at %s.', dbdate)
            except Exception as exc: # pylint: disable=broad-except
                LOG.error('Cannot determine date of database: %s', exc)

            properties.set_property(conn, 'database_version',
                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))

        return 0


    @staticmethod
    def _create_pending_index(conn, tablespace):
        """ Add a supporting index for finding places still to be indexed.

            This index is normally created at the end of the import process
            for later updates. When indexing was partially done, then this
            index can greatly improve speed going through already indexed data.
        """
        if conn.index_exists('idx_placex_pendingsector'):
            return

        with conn.cursor() as cur:
            LOG.warning('Creating support index')
            if tablespace:
                tablespace = 'TABLESPACE ' + tablespace
            cur.execute("""CREATE INDEX idx_placex_pendingsector
                           ON placex USING BTREE (rank_address,geometry_sector)
                           {} WHERE indexed_status > 0
                        """.format(tablespace))
        conn.commit()
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`"""`
			`Implementation of the 'import' subcommand.`
			`"""`
			`import logging`
			`from pathlib import Path`

			`import psutil`

use absolute imports in Python code Relative imports are no longer officially recommended. 2021-04-16 15:20:09 +03:00			`from nominatim.db.connection import connect`
			`from nominatim.db import status, properties`
			`from nominatim.version import NOMINATIM_VERSION`
			`from nominatim.errors import UsageError`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`# Do not repeat documentation of subcommand classes.`
			`# pylint: disable=C0111`
			`# Using non-top-level imports to avoid eventually unused imports.`
			`# pylint: disable=E0012,C0415`

			`LOG = logging.getLogger()`

			`class SetupAll:`
			`"""\`
			`Create a new Nominatim database from an OSM file.`
			`"""`

			`@staticmethod`
			`def add_args(parser):`
			`group_name = parser.add_argument_group('Required arguments')`
			`group = group_name.add_mutually_exclusive_group(required=True)`
			`group.add_argument('--osm-file', metavar='FILE',`
			`help='OSM file to be imported.')`
			`group.add_argument('--continue', dest='continue_at',`
			`choices=['load-data', 'indexing', 'db-postprocess'],`
			`help='Continue an import that was interrupted')`
			`group = parser.add_argument_group('Optional arguments')`
			`group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,`
			`help='Size of cache to be used by osm2pgsql (in MB)')`
			`group.add_argument('--reverse-only', action='store_true',`
			`help='Do not create tables and indexes for searching')`
			`group.add_argument('--no-partitions', action='store_true',`
Rebase with master 2021-03-29 11:30:45 +03:00			`help=("Do not partition search indices "`
			`"(speeds up import of single country extracts)"))`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`group.add_argument('--no-updates', action='store_true',`
Rebase with master 2021-03-29 11:30:45 +03:00			`help="Do not keep tables that are only needed for "`
			`"updating the database later")`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`group = parser.add_argument_group('Expert options')`
			`group.add_argument('--ignore-errors', action='store_true',`
			`help='Continue import even when errors in SQL are present')`
			`group.add_argument('--index-noanalyse', action='store_true',`
			`help='Do not perform analyse operations during index')`


			`@staticmethod`
			`def run(args): # pylint: disable=too-many-statements`
call freeze after running and non-updateable import Some of the tables will have already been removed but the tables for indexing are still there and should be dropped. 2021-06-02 12:08:48 +03:00			`from ..tools import database_import, refresh, postcodes, freeze`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`from ..indexer.indexer import Indexer`
introduce tokenizer modules This adds the boilerplate for selecting configurable tokenizers. A tokenizer can be chosen at import time and will then install itself such that it is fixed for the given database import even when the software itself is updated. The legacy tokenizer implements Nominatim's traditional algorithms. 2021-04-21 10:57:17 +03:00			`from ..tokenizer import factory as tokenizer_factory`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`if args.osm_file and not Path(args.osm_file).is_file():`
			`LOG.fatal("OSM file '%s' does not exist.", args.osm_file)`
			`raise UsageError('Cannot access file.')`

			`if args.continue_at is None:`
			`database_import.setup_database_skeleton(args.config.get_libpq_dsn(),`
			`args.data_dir,`
			`args.no_partitions,`
			`rouser=args.config.DATABASE_WEBUSER)`

			`LOG.warning('Importing OSM data file')`
			`database_import.import_osm_data(Path(args.osm_file),`
			`args.osm2pgsql_options(0, 1),`
bdd: use python library where possible Replace calls to PHP scripts with direct calls into the nominatim Python library where possible. This speed up tests quite a bit. 2021-02-26 18:14:29 +03:00			`drop=args.no_updates,`
			`ignore_errors=args.ignore_errors)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`with connect(args.config.get_libpq_dsn()) as conn:`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create functions (1st pass)')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`refresh.create_functions(conn, args.config, False, False)`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create tables')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`database_import.create_tables(conn, args.config,`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`reverse_only=args.reverse_only)`
			`refresh.load_address_levels_from_file(conn, Path(args.config.ADDRESS_LEVEL_CONFIG))`
			`LOG.warning('Create functions (2nd pass)')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`refresh.create_functions(conn, args.config, False, False)`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create table triggers')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`database_import.create_table_triggers(conn, args.config)`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create partition tables')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`database_import.create_partition_tables(conn, args.config)`
move table creation to jinja-based preprocessing 2021-03-04 00:07:51 +03:00			`LOG.warning('Create functions (3rd pass)')`
simplify sql and website creation functions 2021-04-19 11:01:17 +03:00			`refresh.create_functions(conn, args.config, False, False)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`LOG.warning('Importing wikipedia importance data')`
			`data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)`
			`if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),`
			`data_path) > 0:`
			`LOG.error('Wikipedia importance dump file not found. '`
			`'Will be using default importances.')`

add support index when continuing import at index phase Indexing scans the placex table sequentially during indexing on the initial import. That is okay because we know that all rows need to be processed anywhere. When continuing the import, however, a large part might already be indexed, so that the process spends a lot of time going through rows that are no longer of interest. Create a supporting index for all unindexed rows to speed up the scan. This is the same index as used later for updates. 2021-04-17 12:07:04 +03:00			`if args.continue_at is None or args.continue_at == 'load-data':`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`LOG.warning('Initialise tables')`
			`with connect(args.config.get_libpq_dsn()) as conn:`
move word table and normalisation SQL into tokenizer Creating and populating the word table is now the responsibility of the tokenizer. The get_maxwordfreq() function has been replaced with a simple template parameter to the SQL during function installation. The number is taken from the parameter list in the database to ensure that it is not changed after installation. 2021-04-22 23:47:34 +03:00			`database_import.truncate_data_tables(conn)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`LOG.warning('Load data into placex table')`
			`database_import.load_data(args.config.get_libpq_dsn(),`
			`args.threads or psutil.cpu_count() or 1)`

introduce tokenizer modules This adds the boilerplate for selecting configurable tokenizers. A tokenizer can be chosen at import time and will then install itself such that it is fixed for the given database import even when the software itself is updated. The legacy tokenizer implements Nominatim's traditional algorithms. 2021-04-21 10:57:17 +03:00			`LOG.warning("Setting up tokenizer")`
move word table and normalisation SQL into tokenizer Creating and populating the word table is now the responsibility of the tokenizer. The get_maxwordfreq() function has been replaced with a simple template parameter to the SQL during function installation. The number is taken from the parameter list in the database to ensure that it is not changed after installation. 2021-04-22 23:47:34 +03:00			`if args.continue_at is None or args.continue_at == 'load-data':`
			`# (re)initialise the tokenizer data`
			`tokenizer = tokenizer_factory.create_tokenizer(args.config)`
			`else:`
			`# just load the tokenizer`
			`tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)`
introduce tokenizer modules This adds the boilerplate for selecting configurable tokenizers. A tokenizer can be chosen at import time and will then install itself such that it is fixed for the given database import even when the software itself is updated. The legacy tokenizer implements Nominatim's traditional algorithms. 2021-04-21 10:57:17 +03:00
			`if args.continue_at is None or args.continue_at == 'load-data':`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`LOG.warning('Calculate postcodes')`
move filling of postcode table to python The Python code now takes care of reading postcodes from placex, enhancing them with potentially existing external postcodes and updating location_postcodes accordingly. The initial setup and updates use exactly the same function. External postcode handling has been generalized. External postcodes for any country are now accepted. The format of the external postcode file has changed. We now expect CSV, potentially gzipped. The postcodes are no longer saved in the database. 2021-05-12 20:57:48 +03:00			`postcodes.update_postcodes(args.config.get_libpq_dsn(),`
			`args.project_dir, tokenizer)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):`
add support index when continuing import at index phase Indexing scans the placex table sequentially during indexing on the initial import. That is okay because we know that all rows need to be processed anywhere. When continuing the import, however, a large part might already be indexed, so that the process spends a lot of time going through rows that are no longer of interest. Create a supporting index for all unindexed rows to speed up the scan. This is the same index as used later for updates. 2021-04-17 12:07:04 +03:00			`if args.continue_at is not None and args.continue_at != 'load-data':`
			`with connect(args.config.get_libpq_dsn()) as conn:`
			`SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`LOG.warning('Indexing places')`
require tokeinzer for indexer 2021-04-24 12:25:47 +03:00			`indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`args.threads or psutil.cpu_count() or 1)`
			`indexer.index_full(analyse=not args.index_noanalyse)`

			`LOG.warning('Post-process tables')`
port index creation to python Also switches to jinja-based preprocessing, which allows to simplify the SQL files. Use 'if not exists' where possible so that the step can be rerun to fix missing indexes. 2021-03-04 12:55:24 +03:00			`with connect(args.config.get_libpq_dsn()) as conn:`
			`database_import.create_search_indices(conn, args.config,`
			`drop=args.no_updates)`
Ported createCountryNames() to python and added tests 2021-03-12 07:58:41 +03:00			`LOG.warning('Create search index for default country names.')`
move default country name creation to tokenizer The new function is also used, when a country us updated. All SQL function related to country names have been removed. 2021-04-27 12:37:18 +03:00			`database_import.create_country_names(conn, tokenizer,`
			`args.config.LANGUAGES)`
call freeze after running and non-updateable import Some of the tables will have already been removed but the tables for indexing are still there and should be dropped. 2021-06-02 12:08:48 +03:00			`conn.commit()`
			`if args.no_updates:`
			`freeze.drop_update_tables(conn)`
move index creation for word table to tokenizer This introduces a finalization routing for the tokenizer where it can post-process the import if necessary. 2021-04-30 18:28:34 +03:00			`tokenizer.finalize_import(args.config)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
call freeze after running and non-updateable import Some of the tables will have already been removed but the tables for indexing are still there and should be dropped. 2021-06-02 12:08:48 +03:00
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`webdir = args.project_dir / 'website'`
			`LOG.warning('Setup website at %s', webdir)`
feat: Added reverse-only-search validation 2021-05-13 00:44:37 +03:00			`with connect(args.config.get_libpq_dsn()) as conn:`
			`refresh.setup_website(webdir, args.config, conn)`
move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00
			`with connect(args.config.get_libpq_dsn()) as conn:`
			`try:`
			`dbdate = status.compute_database_date(conn)`
			`status.set_status(conn, dbdate)`
			`LOG.info('Database is at %s.', dbdate)`
			`except Exception as exc: # pylint: disable=broad-except`
			`LOG.error('Cannot determine date of database: %s', exc)`

save software version in the database The version represents the software version that was used to import the data. 2021-03-01 22:35:15 +03:00			`properties.set_property(conn, 'database_version',`
			`'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))`

move setup function to python There are still back-calls to PHP for some of the sub-steps. These needs some larger refactoring to be moved to Python. 2021-02-26 17:02:39 +03:00			`return 0`
add support index when continuing import at index phase Indexing scans the placex table sequentially during indexing on the initial import. That is okay because we know that all rows need to be processed anywhere. When continuing the import, however, a large part might already be indexed, so that the process spends a lot of time going through rows that are no longer of interest. Create a supporting index for all unindexed rows to speed up the scan. This is the same index as used later for updates. 2021-04-17 12:07:04 +03:00

			`@staticmethod`
			`def _create_pending_index(conn, tablespace):`
			`""" Add a supporting index for finding places still to be indexed.`

			`This index is normally created at the end of the import process`
			`for later updates. When indexing was partially done, then this`
			`index can greatly improve speed going through already indexed data.`
			`"""`
			`if conn.index_exists('idx_placex_pendingsector'):`
			`return`

			`with conn.cursor() as cur:`
			`LOG.warning('Creating support index')`
			`if tablespace:`
			`tablespace = 'TABLESPACE ' + tablespace`
			`cur.execute("""CREATE INDEX idx_placex_pendingsector`
			`ON placex USING BTREE (rank_address,geometry_sector)`
			`{} WHERE indexed_status > 0`
			`""".format(tablespace))`
			`conn.commit()`