Nominatim/nominatim/indexer/indexer.py
Sarah Hoffmann 20891abe1c indexer: fetch extra place data asynchronously
The indexer now fetches any extra data besides the place_id
asynchronously while processing the places from the last batch.
This also means that more places are now fetched at once.
2021-04-30 17:41:08 +02:00

288 lines
9.1 KiB
Python

"""
Main work horse for indexing (computing addresses) the database.
"""
import logging
import select
import time
import psycopg2.extras
from nominatim.indexer.progress import ProgressLogger
from nominatim.indexer import runners
from nominatim.db.async_connection import DBConnection
from nominatim.db.connection import connect
# Module-wide logger; getLogger() without a name returns the root logger.
LOG = logging.getLogger()
class PlaceFetcher:
    """ Asynchronous connection that fetches place details for processing.

        A batch is requested with `fetch_next_batch()` and collected
        afterwards with `get_batch()`. The fetcher may be used as a
        context manager.
    """

    def __init__(self, dsn, setup_conn):
        """ Open the asynchronous connection to `dsn`.

            `setup_conn` must be an open connection with a cursor that
            provides `scalar()`. It is only used to look up the OIDs of
            the hstore types.
        """
        # Accumulated seconds spent blocking in get_batch().
        self.wait_time = 0
        # None: no batch requested, []: details still pending on the
        # asynchronous connection, otherwise the rows of the batch.
        self.current_ids = None
        self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor)

        with setup_conn.cursor() as cur:
            # need to fetch those manually because register_hstore cannot
            # fetch them on an asynchronous connection below.
            hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
            hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")

        psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
                                        array_oid=hstore_array_oid)

    def close(self):
        """ Close the underlying asynchronous connection.
            Calling this function more than once is harmless.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def fetch_next_batch(self, cur, runner):
        """ Send a request for the next batch of places.
            If details for the places are required, they will be fetched
            asynchronously.

            `cur` is the cursor from which the next rows are read.
            When `runner` has a function `get_place_details`, it is called
            with the asynchronous connection and the rows.

            Returns true if there is still data available.
        """
        ids = cur.fetchmany(100)

        if not ids:
            self.current_ids = None
            return False

        if hasattr(runner, 'get_place_details'):
            runner.get_place_details(self.conn, ids)
            # Empty list marks that the result still has to be collected.
            self.current_ids = []
        else:
            self.current_ids = ids

        return True

    def get_batch(self):
        """ Get the next batch of data, previously requested with
            `fetch_next_batch`.

            Returns None when no batch is available.
        """
        if self.current_ids is not None and not self.current_ids:
            tstart = time.time()
            self.conn.wait()
            self.wait_time += time.time() - tstart
            self.current_ids = self.conn.cursor.fetchall()

        return self.current_ids

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # The connection may already have been closed explicitly.
        if self.conn is None:
            return

        try:
            self.conn.wait()
        finally:
            # Always release the connection, even when waiting fails.
            self.close()
class WorkerPool:
    """ A pool of asynchronous database connections.

        The pool may be used as a context manager.
    """
    # Number of workers handed out before all connections of the pool
    # are reconnected via their connect() function.
    REOPEN_CONNECTIONS_AFTER = 100000

    def __init__(self, dsn, pool_size):
        """ Create a pool with `pool_size` connections to `dsn`.
        """
        self.threads = [DBConnection(dsn) for _ in range(pool_size)]
        self.free_workers = self._yield_free_worker()
        # Accumulated seconds spent in select() waiting for a free worker.
        self.wait_time = 0

    def finish_all(self):
        """ Wait for all connection to finish.
        """
        for thread in self.threads:
            while not thread.is_done():
                thread.wait()

        # Start over with a fresh generator.
        self.free_workers = self._yield_free_worker()

    def close(self):
        """ Close all connections and clear the pool.
        """
        for thread in self.threads:
            thread.close()
        self.threads = []
        self.free_workers = None

    def next_free_worker(self):
        """ Get the next free connection.
        """
        return next(self.free_workers)

    def _yield_free_worker(self):
        """ Generator that endlessly yields connections that are done
            with their previous command.
        """
        ready = self.threads
        command_stat = 0
        while True:
            for thread in ready:
                if thread.is_done():
                    command_stat += 1
                    yield thread

            if command_stat > self.REOPEN_CONNECTIONS_AFTER:
                # Let every connection finish, then reconnect it.
                for thread in self.threads:
                    while not thread.is_done():
                        thread.wait()
                    thread.connect()
                ready = self.threads
                command_stat = 0
            else:
                # Block until at least one connection is reported ready.
                tstart = time.time()
                _, ready, _ = select.select([], self.threads, [])
                self.wait_time += time.time() - tstart

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            self.finish_all()
        finally:
            # Close the connections even when waiting for them fails.
            self.close()
class Indexer:
    """ Main indexing routine.
    """

    def __init__(self, dsn, tokenizer, num_threads):
        """ Set up an indexer for the database at `dsn`.

            `tokenizer` must provide `name_analyzer()`. `num_threads` is
            the size of the worker pool used for indexing.
        """
        self.dsn = dsn
        self.tokenizer = tokenizer
        self.num_threads = num_threads

    def index_full(self, analyse=True):
        """ Index the complete database. This will first index boundaries
            followed by all other objects. When `analyse` is True, then the
            database will be analysed at the appropriate places to
            ensure that database statistics are updated.
        """
        with connect(self.dsn) as conn:
            conn.autocommit = True

            def _analyze():
                if analyse:
                    with conn.cursor() as cur:
                        cur.execute('ANALYZE')

            self.index_by_rank(0, 4)
            _analyze()

            self.index_boundaries(0, 30)
            _analyze()

            self.index_by_rank(5, 25)
            _analyze()

            self.index_by_rank(26, 30)
            _analyze()

            self.index_postcodes()
            _analyze()

    def index_boundaries(self, minrank, maxrank):
        """ Index only administrative boundaries within the given rank range.

            The ranks processed are
            `range(max(minrank, 4), min(maxrank, 26))`, i.e. the upper
            bound is exclusive and never exceeds rank 25.
        """
        LOG.warning("Starting indexing boundaries using %s threads",
                    self.num_threads)

        with self.tokenizer.name_analyzer() as analyzer:
            for rank in range(max(minrank, 4), min(maxrank, 26)):
                self._index(runners.BoundaryRunner(rank, analyzer))

    def index_by_rank(self, minrank, maxrank):
        """ Index all entries of placex in the given rank range (inclusive)
            in order of their address rank.

            When rank 30 is requested then also interpolations and
            places with address rank 0 will be indexed.
        """
        maxrank = min(maxrank, 30)
        LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                    minrank, maxrank, self.num_threads)

        with self.tokenizer.name_analyzer() as analyzer:
            # Rank 0 is never part of the loop, it is handled together
            # with rank 30 below.
            for rank in range(max(1, minrank), maxrank):
                self._index(runners.RankRunner(rank, analyzer))

            if maxrank == 30:
                self._index(runners.RankRunner(0, analyzer))
                self._index(runners.InterpolationRunner(analyzer), 20)
                self._index(runners.RankRunner(30, analyzer), 20)
            else:
                self._index(runners.RankRunner(maxrank, analyzer))

    def index_postcodes(self):
        """ Index the entries of the location_postcode table.
        """
        LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)

        self._index(runners.PostcodeRunner(), 20)

    def update_status_table(self):
        """ Update the status in the status table to 'indexed'.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.execute('UPDATE import_status SET indexed = true')

            conn.commit()

    def _index(self, runner, batch=1):
        """ Index a single rank or table. `runner` describes the SQL to use
            for indexing. `batch` describes the number of objects that
            should be processed with a single SQL statement.
        """
        LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)

        with connect(self.dsn) as conn:
            psycopg2.extras.register_hstore(conn)
            with conn.cursor() as cur:
                total_tuples = cur.scalar(runner.sql_count_objects())
                LOG.debug("Total number of rows: %i", total_tuples)

            conn.commit()

            progress = ProgressLogger(runner.name(), total_tuples)

            if total_tuples > 0:
                with conn.cursor(name='places') as cur:
                    cur.execute(runner.sql_get_objects())

                    with PlaceFetcher(self.dsn, conn) as fetcher:
                        with WorkerPool(self.dsn, self.num_threads) as pool:
                            has_more = fetcher.fetch_next_batch(cur, runner)
                            while has_more:
                                places = fetcher.get_batch()

                                # asynchronously get the next batch
                                has_more = fetcher.fetch_next_batch(cur, runner)

                                # And insert the current batch
                                for idx in range(0, len(places), batch):
                                    part = places[idx:idx+batch]
                                    # Hand over the list itself so that it is
                                    # only formatted when debug logging is on.
                                    LOG.debug("Processing places: %s", part)
                                    runner.index_places(pool.next_free_worker(), part)
                                    progress.add(len(part))

                            LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs",
                                     fetcher.wait_time, pool.wait_time)

                conn.commit()

        progress.done()