From d9143063ffc90baf416256c686c1791684011c4b Mon Sep 17 00:00:00 2001 From: dustinface <35775977+xdustinface@users.noreply.github.com> Date: Fri, 29 Oct 2021 18:29:46 +0200 Subject: [PATCH] plotting: Reduce `Path.exists` calls (#8999) * plotting: Move plot removals out of `PlotManager.refresh_batch` Moving this checks up in the trace so that it only runs once for each refresh cycle reduces `plot_removed` calls per cycle from `batch_count * plot_count` to `plot_count` only. This checks can be quite expensive depending on how/where the plots are stored so this should noticeably improve refresh times in slow setups and large farms even for the initial refresh cycle. I don't see any downside here considered that even if a plot gets removed during a refresh cycle (after the checks run) it will catch it in the next cycle. **Note**: This has become an issue now (e.g. in #8972) because prior to way, that it only really processed in batches for the very first run after a restart. For each repeated refresh cycle it just walked through all files in one `refresh_batch` call. Now after #8385 fixed this behavior the checks are called `batch_count * plot_count` for all refresh cycles, even for each repeated cycle where it was `plot_count` before. * plotting: Move `file_path.exists` call down There is no point in testing it before here, it only leads to redundant calls for repeated refresh cycles. The important point here is just that it runs after ``` if file_path in self.plots: return self.plots[file_path] ``` so it could still be moved around differently if there are other suggestions, tell me. I even think it might be possible to just drop it with no bad impact but lets just move it for now. --- chia/plotting/manager.py | 62 +++++++++++++++++++--------------------- tests/block_tools.py | 2 -- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/chia/plotting/manager.py b/chia/plotting/manager.py index 88c20a331a808..8e37bcd0b7c23 100644 --- a/chia/plotting/manager.py +++ b/chia/plotting/manager.py @@ -231,6 +231,33 @@ class PlotManager: self._refresh_callback(PlotRefreshEvents.started, PlotRefreshResult(remaining=total_size)) + # First drop all plots we have in plot_filename_paths but not longer in the filesystem or set in config + def plot_removed(test_path: Path): + return not test_path.exists() or test_path.parent not in plot_directories + + filenames_to_remove: List[str] = [] + for plot_filename, paths_entry in self.plot_filename_paths.items(): + loaded_path, duplicated_paths = paths_entry + loaded_plot = Path(loaded_path) / Path(plot_filename) + if plot_removed(loaded_plot): + filenames_to_remove.append(plot_filename) + if loaded_plot in self.plots: + del self.plots[loaded_plot] + total_result.removed += 1 + # No need to check the duplicates here since we drop the whole entry + continue + + paths_to_remove: List[str] = [] + for path in duplicated_paths: + if plot_removed(Path(path) / Path(plot_filename)): + paths_to_remove.append(path) + total_result.removed += 1 + for path in paths_to_remove: + duplicated_paths.remove(path) + + for filename in filenames_to_remove: + del self.plot_filename_paths[filename] + def batches() -> Iterator[Tuple[int, List[Path]]]: if total_size > 0: for batch_start in range(0, total_size, self.refresh_parameter.batch_size): @@ -247,7 +274,6 @@ class PlotManager: # Set the remaining files since `refresh_batch()` doesn't know them but we want to report it batch_result.remaining = remaining total_result.loaded += batch_result.loaded - total_result.removed += batch_result.removed total_result.processed += batch_result.processed total_result.duration += batch_result.duration @@ -294,8 +320,6 @@ class PlotManager: filename_str = str(file_path) if self.match_str is not None and self.match_str not in filename_str: return None - if not file_path.exists(): - return None if ( file_path in self.failed_to_open_filenames and (time.time() - self.failed_to_open_filenames[file_path]) @@ -314,6 +338,9 @@ class PlotManager: log.debug(f"Skip duplicated plot {str(file_path)}") return None try: + if not file_path.exists(): + return None + prover = DiskProver(str(file_path)) log.debug(f"process_file {str(file_path)}") @@ -413,35 +440,6 @@ class PlotManager: return new_plot_info with self, ThreadPoolExecutor() as executor: - - # First drop all plots we have in plot_filename_paths but not longer in the filesystem or set in config - def plot_removed(test_path: Path): - return not test_path.exists() or test_path.parent not in plot_directories - - with self.plot_filename_paths_lock: - filenames_to_remove: List[str] = [] - for plot_filename, paths_entry in self.plot_filename_paths.items(): - loaded_path, duplicated_paths = paths_entry - loaded_plot = Path(loaded_path) / Path(plot_filename) - if plot_removed(loaded_plot): - filenames_to_remove.append(plot_filename) - if loaded_plot in self.plots: - del self.plots[loaded_plot] - result.removed += 1 - # No need to check the duplicates here since we drop the whole entry - continue - - paths_to_remove: List[str] = [] - for path in duplicated_paths: - if plot_removed(Path(path) / Path(plot_filename)): - paths_to_remove.append(path) - result.removed += 1 - for path in paths_to_remove: - duplicated_paths.remove(path) - - for filename in filenames_to_remove: - del self.plot_filename_paths[filename] - plots_refreshed: Dict[Path, PlotInfo] = {} for new_plot in executor.map(process_file, plot_paths): if new_plot is not None: diff --git a/tests/block_tools.py b/tests/block_tools.py index e0b920f65724e..cabe601f40ba5 100644 --- a/tests/block_tools.py +++ b/tests/block_tools.py @@ -172,7 +172,6 @@ class BlockTools: if event == PlotRefreshEvents.batch_processed: self.total_result.loaded += update_result.loaded - self.total_result.removed += update_result.removed self.total_result.processed += update_result.processed self.total_result.duration += update_result.duration assert update_result.remaining == len(self.expected_plots) - self.total_result.processed @@ -180,7 +179,6 @@ class BlockTools: if event == PlotRefreshEvents.done: assert self.total_result.loaded == update_result.loaded - assert self.total_result.removed == update_result.removed assert self.total_result.processed == update_result.processed assert self.total_result.duration == update_result.duration assert update_result.remaining == 0