plotting: Reduce Path.exists calls (#8999)

* plotting: Move plot removals out of `PlotManager.refresh_batch`

Moving this checks up in the trace so that it only runs once for each
refresh cycle reduces `plot_removed` calls per cycle from `batch_count *
plot_count` to `plot_count` only. This checks can be quite expensive
depending on how/where the plots are stored so this should noticeably
improve refresh times in slow setups and large farms even for the
initial refresh cycle. I don't see any downside here considered that
even if a plot gets removed during a refresh cycle (after the checks
run) it will catch it in the next cycle.

**Note**: This has become an issue now (e.g. in #8972) because prior to
way, that it only really processed in batches for the very first run
after a restart. For each repeated refresh cycle it just walked through
all files in one `refresh_batch` call. Now after #8385 fixed this
behavior the checks are called `batch_count * plot_count` for all
refresh cycles, even for each repeated cycle where it was `plot_count`
before.

* plotting: Move `file_path.exists` call down 

There is no point in testing it before here, it only leads to redundant 
calls for repeated refresh cycles. The important point here is just that 
it runs after 

```
if file_path in self.plots:
    return self.plots[file_path]
```

so it could still be moved around differently if there are other 
suggestions, tell me. I even think it might be possible to just drop it 
with no bad impact but lets just move it for now.
This commit is contained in:
dustinface 2021-10-29 18:29:46 +02:00 committed by GitHub
parent 3d1aa2b01d
commit d9143063ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 34 deletions

View File

@ -231,6 +231,33 @@ class PlotManager:
self._refresh_callback(PlotRefreshEvents.started, PlotRefreshResult(remaining=total_size))
# First drop all plots we have in plot_filename_paths but not longer in the filesystem or set in config
def plot_removed(test_path: Path):
return not test_path.exists() or test_path.parent not in plot_directories
filenames_to_remove: List[str] = []
for plot_filename, paths_entry in self.plot_filename_paths.items():
loaded_path, duplicated_paths = paths_entry
loaded_plot = Path(loaded_path) / Path(plot_filename)
if plot_removed(loaded_plot):
filenames_to_remove.append(plot_filename)
if loaded_plot in self.plots:
del self.plots[loaded_plot]
total_result.removed += 1
# No need to check the duplicates here since we drop the whole entry
continue
paths_to_remove: List[str] = []
for path in duplicated_paths:
if plot_removed(Path(path) / Path(plot_filename)):
paths_to_remove.append(path)
total_result.removed += 1
for path in paths_to_remove:
duplicated_paths.remove(path)
for filename in filenames_to_remove:
del self.plot_filename_paths[filename]
def batches() -> Iterator[Tuple[int, List[Path]]]:
if total_size > 0:
for batch_start in range(0, total_size, self.refresh_parameter.batch_size):
@ -247,7 +274,6 @@ class PlotManager:
# Set the remaining files since `refresh_batch()` doesn't know them but we want to report it
batch_result.remaining = remaining
total_result.loaded += batch_result.loaded
total_result.removed += batch_result.removed
total_result.processed += batch_result.processed
total_result.duration += batch_result.duration
@ -294,8 +320,6 @@ class PlotManager:
filename_str = str(file_path)
if self.match_str is not None and self.match_str not in filename_str:
return None
if not file_path.exists():
return None
if (
file_path in self.failed_to_open_filenames
and (time.time() - self.failed_to_open_filenames[file_path])
@ -314,6 +338,9 @@ class PlotManager:
log.debug(f"Skip duplicated plot {str(file_path)}")
return None
try:
if not file_path.exists():
return None
prover = DiskProver(str(file_path))
log.debug(f"process_file {str(file_path)}")
@ -413,35 +440,6 @@ class PlotManager:
return new_plot_info
with self, ThreadPoolExecutor() as executor:
# First drop all plots we have in plot_filename_paths but not longer in the filesystem or set in config
def plot_removed(test_path: Path):
return not test_path.exists() or test_path.parent not in plot_directories
with self.plot_filename_paths_lock:
filenames_to_remove: List[str] = []
for plot_filename, paths_entry in self.plot_filename_paths.items():
loaded_path, duplicated_paths = paths_entry
loaded_plot = Path(loaded_path) / Path(plot_filename)
if plot_removed(loaded_plot):
filenames_to_remove.append(plot_filename)
if loaded_plot in self.plots:
del self.plots[loaded_plot]
result.removed += 1
# No need to check the duplicates here since we drop the whole entry
continue
paths_to_remove: List[str] = []
for path in duplicated_paths:
if plot_removed(Path(path) / Path(plot_filename)):
paths_to_remove.append(path)
result.removed += 1
for path in paths_to_remove:
duplicated_paths.remove(path)
for filename in filenames_to_remove:
del self.plot_filename_paths[filename]
plots_refreshed: Dict[Path, PlotInfo] = {}
for new_plot in executor.map(process_file, plot_paths):
if new_plot is not None:

View File

@ -172,7 +172,6 @@ class BlockTools:
if event == PlotRefreshEvents.batch_processed:
self.total_result.loaded += update_result.loaded
self.total_result.removed += update_result.removed
self.total_result.processed += update_result.processed
self.total_result.duration += update_result.duration
assert update_result.remaining == len(self.expected_plots) - self.total_result.processed
@ -180,7 +179,6 @@ class BlockTools:
if event == PlotRefreshEvents.done:
assert self.total_result.loaded == update_result.loaded
assert self.total_result.removed == update_result.removed
assert self.total_result.processed == update_result.processed
assert self.total_result.duration == update_result.duration
assert update_result.remaining == 0