Only run ANALYZE during indexing when work was done

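This speeds up processing when indexing is resumed after an
interruption: each indexing stage now reports how many places it
processed, and ANALYZE is only run afterwards when that count exceeds
a per-stage threshold.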
This commit is contained in:
Sarah Hoffmann 2022-09-28 10:22:54 +02:00
parent 051f3720ce
commit a2ee58d8a1
2 changed files with 29 additions and 21 deletions

View File

@ -128,58 +128,64 @@ class Indexer:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute('ANALYZE') cur.execute('ANALYZE')
self.index_by_rank(0, 4) if self.index_by_rank(0, 4) > 0:
_analyze() _analyze()
self.index_boundaries(0, 30) if self.index_boundaries(0, 30) > 100:
_analyze() _analyze()
self.index_by_rank(5, 25) if self.index_by_rank(5, 25) > 100:
_analyze() _analyze()
self.index_by_rank(26, 30) if self.index_by_rank(26, 30) > 1000:
_analyze() _analyze()
self.index_postcodes() if self.index_postcodes() > 100:
_analyze() _analyze()
def index_boundaries(self, minrank: int, maxrank: int) -> None: def index_boundaries(self, minrank: int, maxrank: int) -> int:
""" Index only administrative boundaries within the given rank range. """ Index only administrative boundaries within the given rank range.
""" """
total = 0
LOG.warning("Starting indexing boundaries using %s threads", LOG.warning("Starting indexing boundaries using %s threads",
self.num_threads) self.num_threads)
with self.tokenizer.name_analyzer() as analyzer: with self.tokenizer.name_analyzer() as analyzer:
for rank in range(max(minrank, 4), min(maxrank, 26)): for rank in range(max(minrank, 4), min(maxrank, 26)):
self._index(runners.BoundaryRunner(rank, analyzer)) total += self._index(runners.BoundaryRunner(rank, analyzer))
def index_by_rank(self, minrank: int, maxrank: int) -> None: return total
def index_by_rank(self, minrank: int, maxrank: int) -> int:
""" Index all entries of placex in the given rank range (inclusive) """ Index all entries of placex in the given rank range (inclusive)
in order of their address rank. in order of their address rank.
When rank 30 is requested then also interpolations and When rank 30 is requested then also interpolations and
places with address rank 0 will be indexed. places with address rank 0 will be indexed.
""" """
total = 0
maxrank = min(maxrank, 30) maxrank = min(maxrank, 30)
LOG.warning("Starting indexing rank (%i to %i) using %i threads", LOG.warning("Starting indexing rank (%i to %i) using %i threads",
minrank, maxrank, self.num_threads) minrank, maxrank, self.num_threads)
with self.tokenizer.name_analyzer() as analyzer: with self.tokenizer.name_analyzer() as analyzer:
for rank in range(max(1, minrank), maxrank + 1): for rank in range(max(1, minrank), maxrank + 1):
self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1) total += self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1)
if maxrank == 30: if maxrank == 30:
self._index(runners.RankRunner(0, analyzer)) total += self._index(runners.RankRunner(0, analyzer))
self._index(runners.InterpolationRunner(analyzer), 20) total += self._index(runners.InterpolationRunner(analyzer), 20)
return total
def index_postcodes(self) -> None: def index_postcodes(self) -> int:
"""Index the entries of the location_postcode table. """Index the entries of the location_postcode table.
""" """
LOG.warning("Starting indexing postcodes using %s threads", self.num_threads) LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)
self._index(runners.PostcodeRunner(), 20) return self._index(runners.PostcodeRunner(), 20)
def update_status_table(self) -> None: def update_status_table(self) -> None:
@ -191,7 +197,7 @@ class Indexer:
conn.commit() conn.commit()
def _index(self, runner: runners.Runner, batch: int = 1) -> None: def _index(self, runner: runners.Runner, batch: int = 1) -> int:
""" Index a single rank or table. `runner` describes the SQL to use """ Index a single rank or table. `runner` describes the SQL to use
for indexing. `batch` describes the number of objects that for indexing. `batch` describes the number of objects that
should be processed with a single SQL statement should be processed with a single SQL statement
@ -233,4 +239,4 @@ class Indexer:
conn.commit() conn.commit()
progress.done() return progress.done()

View File

@ -55,7 +55,7 @@ class ProgressLogger:
self.next_info += int(places_per_sec) * self.log_interval self.next_info += int(places_per_sec) * self.log_interval
def done(self) -> None: def done(self) -> int:
""" Print final statistics about the progress. """ Print final statistics about the progress.
""" """
rank_end_time = datetime.now() rank_end_time = datetime.now()
@ -70,3 +70,5 @@ class ProgressLogger:
LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n", LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n",
self.done_places, self.total_places, int(diff_seconds), self.done_places, self.total_places, int(diff_seconds),
places_per_sec, self.name) places_per_sec, self.name)
return self.done_places