From 8ec680cecb5030d269097de1a6aaca32a26e7aaa Mon Sep 17 00:00:00 2001 From: Thorsten Ball Date: Tue, 27 Aug 2024 11:59:59 +0200 Subject: [PATCH] project search: Increase perf up to 10x by batching `git status` calls (#16936) ***Update**: after rebasing on top of https://github.com/zed-industries/zed/pull/16915/ (that also changed how search worked), the results are still good, but not 10x. Instead of going from 10s to 500ms, it goes from 10s to 3s.* This improves the performance of project-wide search by an order of magnitude. After digging in, @as-cii and I found that opening buffers was the bottleneck for project-wide search (since Zed opens, parses, ... buffers when finding them, which is something VS Code doesn't do, for example). So this PR improves the performance of opening multiple buffers at once. It does this by doing two things: - It batches scan-requests in the worktree. When we search, we search files in chunks of 64. Previously we'd handle all 64 scan requests separately. The new code checks if the scan requests can be batched and if so it, it does that. - It batches `git status` calls when reloading the project entries for the opened buffers. Instead of calling `git status` for each file, it calls `git status` for a batch of files, and then extracts the status for each file. (It has to be said that I think the slow performance on `main` has been a regression introduced over the last few months with the changes made to project/worktree/git. I don't think it was this slow ~5 months ago. But I also don't think it was this fast ~5 months ago.) ## Benchmarks | Search | Before | After (without https://github.com/zed-industries/zed/pull/16915) | After (with https://github.com/zed-industries/zed/pull/16915) |--------|--------|-------|------| | `zed.dev` at `2b2a501192e78e`, searching for `<` (`4484` results) | 3.0s
2.9s
2.89s | 489ms
517ms
476ms | n/a | | `zed.dev` at `2b2a501192e78e`, searching for `:` (`25886+` results) | 3.9s
3.9s
3.8s | 70ms
66ms
72ms | n/a | | `zed` at `55dda0e6af`, searching for `<` (`10937+` results) | 10s
11s
12s | 500m
499ms
543ms | 3.4s
3.1s
| (All results recorded after doing a warm-up run that would start language servers etc.) Release Notes: - Performance of project-wide search has been improved by up to 10x. --------- Co-authored-by: Antonio --- Cargo.lock | 1 + crates/git/src/repository.rs | 17 +++-- crates/git/src/status.rs | 14 ++-- crates/worktree/Cargo.toml | 1 + crates/worktree/src/worktree.rs | 114 ++++++++++++++++++++++++-------- 5 files changed, 106 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b350529290..03e7630ab2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13856,6 +13856,7 @@ dependencies = [ "serde", "serde_json", "settings", + "smallvec", "smol", "sum_tree", "text", diff --git a/crates/git/src/repository.rs b/crates/git/src/repository.rs index 6a3e4dcb1e..1b3686f021 100644 --- a/crates/git/src/repository.rs +++ b/crates/git/src/repository.rs @@ -36,11 +36,7 @@ pub trait GitRepository: Send + Sync { /// Returns the SHA of the current HEAD. fn head_sha(&self) -> Option; - fn statuses(&self, path_prefix: &Path) -> Result; - - fn status(&self, path: &Path) -> Option { - Some(self.statuses(path).ok()?.entries.first()?.1) - } + fn status(&self, path_prefixes: &[PathBuf]) -> Result; fn branches(&self) -> Result>; fn change_branch(&self, _: &str) -> Result<()>; @@ -126,14 +122,14 @@ impl GitRepository for RealGitRepository { Some(self.repository.lock().head().ok()?.target()?.to_string()) } - fn statuses(&self, path_prefix: &Path) -> Result { + fn status(&self, path_prefixes: &[PathBuf]) -> Result { let working_directory = self .repository .lock() .workdir() .context("failed to read git work directory")? .to_path_buf(); - GitStatus::new(&self.git_binary_path, &working_directory, path_prefix) + GitStatus::new(&self.git_binary_path, &working_directory, path_prefixes) } fn branches(&self) -> Result> { @@ -245,13 +241,16 @@ impl GitRepository for FakeGitRepository { None } - fn statuses(&self, path_prefix: &Path) -> Result { + fn status(&self, path_prefixes: &[PathBuf]) -> Result { let state = self.state.lock(); let mut entries = state .worktree_statuses .iter() .filter_map(|(repo_path, status)| { - if repo_path.0.starts_with(path_prefix) { + if path_prefixes + .iter() + .any(|path_prefix| repo_path.0.starts_with(path_prefix)) + { Some((repo_path.to_owned(), *status)) } else { None diff --git a/crates/git/src/status.rs b/crates/git/src/status.rs index 13a6919907..e6098ffd3c 100644 --- a/crates/git/src/status.rs +++ b/crates/git/src/status.rs @@ -15,14 +15,10 @@ impl GitStatus { pub(crate) fn new( git_binary: &Path, working_directory: &Path, - mut path_prefix: &Path, + path_prefixes: &[PathBuf], ) -> Result { let mut child = Command::new(git_binary); - if path_prefix == Path::new("") { - path_prefix = Path::new("."); - } - child .current_dir(working_directory) .args([ @@ -32,7 +28,13 @@ impl GitStatus { "--untracked-files=all", "-z", ]) - .arg(path_prefix) + .args(path_prefixes.iter().map(|path_prefix| { + if *path_prefix == Path::new("") { + Path::new(".") + } else { + path_prefix + } + })) .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::piped()); diff --git a/crates/worktree/Cargo.toml b/crates/worktree/Cargo.toml index 2e88832031..1186c988ad 100644 --- a/crates/worktree/Cargo.toml +++ b/crates/worktree/Cargo.toml @@ -41,6 +41,7 @@ schemars.workspace = true serde.workspace = true serde_json.workspace = true settings.workspace = true +smallvec.workspace = true smol.workspace = true sum_tree.workspace = true text.workspace = true diff --git a/crates/worktree/src/worktree.rs b/crates/worktree/src/worktree.rs index 1a11f9b27d..f304a29325 100644 --- a/crates/worktree/src/worktree.rs +++ b/crates/worktree/src/worktree.rs @@ -38,6 +38,7 @@ use postage::{ }; use rpc::proto::{self, AnyProtoClient}; use settings::{Settings, SettingsLocation, SettingsStore}; +use smallvec::{smallvec, SmallVec}; use smol::channel::{self, Sender}; use std::{ any::Any, @@ -124,7 +125,7 @@ pub struct LocalWorktree { struct ScanRequest { relative_paths: Vec>, - done: barrier::Sender, + done: SmallVec<[barrier::Sender; 1]>, } pub struct RemoteWorktree { @@ -339,7 +340,7 @@ enum ScanState { Updated { snapshot: LocalSnapshot, changes: UpdatedEntriesSet, - barrier: Option, + barrier: SmallVec<[barrier::Sender; 1]>, scanning: bool, }, } @@ -1659,7 +1660,7 @@ impl LocalWorktree { self.scan_requests_tx .try_send(ScanRequest { relative_paths: paths, - done: tx, + done: smallvec![tx], }) .ok(); rx @@ -2699,7 +2700,7 @@ impl BackgroundScannerState { work_directory: workdir_path, statuses: repo .repo_ptr - .statuses(&repo_path) + .status(&[repo_path.0]) .log_err() .unwrap_or_default(), }); @@ -3568,7 +3569,7 @@ impl BackgroundScanner { state.snapshot.completed_scan_id = state.snapshot.scan_id; } - self.send_status_update(false, None); + self.send_status_update(false, SmallVec::new()); // Process any any FS events that occurred while performing the initial scan. // For these events, update events cannot be as precise, because we didn't @@ -3588,7 +3589,7 @@ impl BackgroundScanner { select_biased! { // Process any path refresh requests from the worktree. Prioritize // these before handling changes reported by the filesystem. - request = self.scan_requests_rx.recv().fuse() => { + request = self.next_scan_request().fuse() => { let Ok(request) = request else { break }; if !self.process_scan_request(request, false).await { return; @@ -3669,7 +3670,7 @@ impl BackgroundScanner { ) .await; - self.send_status_update(scanning, Some(request.done)) + self.send_status_update(scanning, request.done) } async fn process_events(&mut self, mut abs_paths: Vec) { @@ -3777,7 +3778,7 @@ impl BackgroundScanner { #[cfg(test)] self.state.lock().snapshot.check_git_invariants(); - self.send_status_update(false, None); + self.send_status_update(false, SmallVec::new()); } async fn forcibly_load_paths(&self, paths: &[Arc]) -> bool { @@ -3834,7 +3835,7 @@ impl BackgroundScanner { select_biased! { // Process any path refresh requests before moving on to process // the scan queue, so that user operations are prioritized. - request = self.scan_requests_rx.recv().fuse() => { + request = self.next_scan_request().fuse() => { let Ok(request) = request else { break }; if !self.process_scan_request(request, true).await { return; @@ -3853,7 +3854,7 @@ impl BackgroundScanner { ) { Ok(_) => { last_progress_update_count += 1; - self.send_status_update(true, None); + self.send_status_update(true, SmallVec::new()); } Err(count) => { last_progress_update_count = count; @@ -3879,7 +3880,7 @@ impl BackgroundScanner { .await; } - fn send_status_update(&self, scanning: bool, barrier: Option) -> bool { + fn send_status_update(&self, scanning: bool, barrier: SmallVec<[barrier::Sender; 1]>) -> bool { let mut state = self.state.lock(); if state.changed_paths.is_empty() && scanning { return true; @@ -3954,7 +3955,7 @@ impl BackgroundScanner { if let Some((work_directory, repository)) = repo { let t0 = Instant::now(); let statuses = repository - .statuses(Path::new("")) + .status(&[PathBuf::from("")]) .log_err() .unwrap_or_default(); log::trace!("computed git status in {:?}", t0.elapsed()); @@ -4167,6 +4168,32 @@ impl BackgroundScanner { } } + // Group all relative paths by their git repository. + let mut paths_by_git_repo = HashMap::default(); + for relative_path in relative_paths.iter() { + if let Some((repo_entry, repo)) = state.snapshot.repo_for_path(relative_path) { + if let Ok(repo_path) = repo_entry.relativize(&state.snapshot, relative_path) { + paths_by_git_repo + .entry(repo.git_dir_path.clone()) + .or_insert_with(|| RepoPaths { + repo: repo.repo_ptr.clone(), + repo_paths: Vec::new(), + relative_paths: Vec::new(), + }) + .add_paths(&relative_path, repo_path); + } + } + } + + // Now call `git status` once per repository and collect each file's git status. + let mut git_statuses_by_relative_path = + paths_by_git_repo + .into_values() + .fold(HashMap::default(), |mut map, repo_paths| { + map.extend(repo_paths.into_git_file_statuses()); + map + }); + for (path, metadata) in relative_paths.iter().zip(metadata.into_iter()) { let abs_path: Arc = root_abs_path.join(&path).into(); match metadata { @@ -4192,15 +4219,7 @@ impl BackgroundScanner { fs_entry.is_external = is_external; fs_entry.is_private = self.is_path_private(path); - if !is_dir && !fs_entry.is_ignored && !fs_entry.is_external { - if let Some((repo_entry, repo)) = state.snapshot.repo_for_path(path) { - if let Ok(repo_path) = repo_entry.relativize(&state.snapshot, path) { - fs_entry.git_status = repo.repo_ptr.status(&repo_path); - } - } - } - - if let (Some(scan_queue_tx), true) = (&scan_queue_tx, fs_entry.is_dir()) { + if let (Some(scan_queue_tx), true) = (&scan_queue_tx, is_dir) { if state.should_scan_directory(&fs_entry) || (fs_entry.path.as_os_str().is_empty() && abs_path.file_name() == Some(*DOT_GIT)) @@ -4211,7 +4230,11 @@ impl BackgroundScanner { } } - state.insert_entry(fs_entry, self.fs.as_ref()); + if !is_dir && !fs_entry.is_ignored && !fs_entry.is_external { + fs_entry.git_status = git_statuses_by_relative_path.remove(path); + } + + state.insert_entry(fs_entry.clone(), self.fs.as_ref()); } Ok(None) => { self.remove_repo_path(path, &mut state.snapshot); @@ -4310,7 +4333,7 @@ impl BackgroundScanner { select_biased! { // Process any path refresh requests before moving on to process // the queue of ignore statuses. - request = self.scan_requests_rx.recv().fuse() => { + request = self.next_scan_request().fuse() => { let Ok(request) = request else { break }; if !self.process_scan_request(request, true).await { return; @@ -4380,7 +4403,12 @@ impl BackgroundScanner { if !entry.is_dir() && !entry.is_ignored && !entry.is_external { if let Some((ref repo_entry, local_repo)) = repo { if let Ok(repo_path) = repo_entry.relativize(&snapshot, &entry.path) { - entry.git_status = local_repo.repo_ptr.status(&repo_path); + let status = local_repo + .repo_ptr + .status(&[repo_path.0.clone()]) + .ok() + .and_then(|status| status.get(&repo_path)); + entry.git_status = status; } } } @@ -4519,7 +4547,7 @@ impl BackgroundScanner { select_biased! { // Process any path refresh requests before moving on to process // the queue of git statuses. - request = self.scan_requests_rx.recv().fuse() => { + request = self.next_scan_request().fuse() => { let Ok(request) = request else { break }; if !self.process_scan_request(request, true).await { return; @@ -4537,7 +4565,7 @@ impl BackgroundScanner { fn update_git_statuses(&self, job: UpdateGitStatusesJob) { log::trace!("updating git statuses for repo {:?}", job.work_directory.0); let t0 = Instant::now(); - let Some(statuses) = job.repository.statuses(Path::new("")).log_err() else { + let Some(statuses) = job.repository.status(&[PathBuf::from("")]).log_err() else { return; }; log::trace!( @@ -4726,6 +4754,15 @@ impl BackgroundScanner { fn is_path_private(&self, path: &Path) -> bool { !self.share_private_files && self.settings.is_path_private(path) } + + async fn next_scan_request(&self) -> Result { + let mut request = self.scan_requests_rx.recv().await?; + while let Ok(next_request) = self.scan_requests_rx.try_recv() { + request.relative_paths.extend(next_request.relative_paths); + request.done.extend(next_request.done); + } + Ok(request) + } } fn swap_to_front(child_paths: &mut Vec, file: &OsStr) { @@ -4748,6 +4785,31 @@ fn char_bag_for_path(root_char_bag: CharBag, path: &Path) -> CharBag { result } +struct RepoPaths { + repo: Arc, + relative_paths: Vec>, + repo_paths: Vec, +} + +impl RepoPaths { + fn add_paths(&mut self, relative_path: &Arc, repo_path: RepoPath) { + self.relative_paths.push(relative_path.clone()); + self.repo_paths.push(repo_path.0); + } + + fn into_git_file_statuses(self) -> HashMap, GitFileStatus> { + let mut statuses = HashMap::default(); + if let Ok(status) = self.repo.status(&self.repo_paths) { + for (repo_path, relative_path) in self.repo_paths.into_iter().zip(self.relative_paths) { + if let Some(path_status) = status.get(&repo_path) { + statuses.insert(relative_path, path_status); + } + } + } + statuses + } +} + struct ScanJob { abs_path: Arc, path: Arc,