From dd328efaa7e50617bc1efc624500a8e480f9f790 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 29 May 2024 14:31:24 -0700 Subject: [PATCH] Compute git statuses using the bundled git executable, not libgit2 (#12444) I realized that somehow, the `git` executable is able to compute `git status` much more quickly than libgit2, so I've switched our git status logic to use `git`. Follow-up to https://github.com/zed-industries/zed/pull/12266. Release Notes: - Improved the performance of git status updated when working in large git repositories. --- crates/git/src/blame.rs | 9 +- crates/git/src/git.rs | 1 + crates/git/src/repository.rs | 160 +++++-------------------- crates/git/src/status.rs | 99 ++++++++++++++++ crates/worktree/src/worktree.rs | 202 ++++++++++++++------------------ 5 files changed, 222 insertions(+), 249 deletions(-) create mode 100644 crates/git/src/status.rs diff --git a/crates/git/src/blame.rs b/crates/git/src/blame.rs index e6c76aa0b3..a76a99ad6a 100644 --- a/crates/git/src/blame.rs +++ b/crates/git/src/blame.rs @@ -8,15 +8,11 @@ use std::process::{Command, Stdio}; use std::sync::Arc; use std::{ops::Range, path::Path}; use text::Rope; -use time; use time::macros::format_description; use time::OffsetDateTime; use time::UtcOffset; use url::Url; -#[cfg(windows)] -use std::os::windows::process::CommandExt; - pub use git2 as libgit; #[derive(Debug, Clone, Default)] @@ -98,7 +94,10 @@ fn run_git_blame( .stderr(Stdio::piped()); #[cfg(windows)] - child.creation_flags(windows::Win32::System::Threading::CREATE_NO_WINDOW.0); + { + use std::os::windows::process::CommandExt; + child.creation_flags(windows::Win32::System::Threading::CREATE_NO_WINDOW.0); + } let child = child .spawn() diff --git a/crates/git/src/git.rs b/crates/git/src/git.rs index 92fb6ad843..3f5e0fd9f8 100644 --- a/crates/git/src/git.rs +++ b/crates/git/src/git.rs @@ -15,6 +15,7 @@ pub mod blame; pub mod commit; pub mod diff; pub mod repository; +pub mod status; lazy_static! { pub static ref DOT_GIT: &'static OsStr = OsStr::new(".git"); diff --git a/crates/git/src/repository.rs b/crates/git/src/repository.rs index ff6a4ce996..9221176b02 100644 --- a/crates/git/src/repository.rs +++ b/crates/git/src/repository.rs @@ -1,8 +1,8 @@ -use crate::blame::Blame; use crate::GitHostingProviderRegistry; +use crate::{blame::Blame, status::GitStatus}; use anyhow::{Context, Result}; use collections::HashMap; -use git2::{BranchType, StatusShow}; +use git2::BranchType; use parking_lot::Mutex; use rope::Rope; use serde::{Deserialize, Serialize}; @@ -10,10 +10,9 @@ use std::{ cmp::Ordering, path::{Component, Path, PathBuf}, sync::Arc, - time::SystemTime, }; -use sum_tree::{MapSeekTarget, TreeMap}; -use util::{paths::PathExt, ResultExt}; +use sum_tree::MapSeekTarget; +use util::ResultExt; pub use git2::Repository as LibGitRepository; @@ -39,23 +38,11 @@ pub trait GitRepository: Send { /// Returns the SHA of the current HEAD. fn head_sha(&self) -> Option; - /// Get the statuses of all of the files in the index that start with the given - /// path and have changes with respect to the HEAD commit. This is fast because - /// the index stores hashes of trees, so that unchanged directories can be skipped. - fn staged_statuses(&self, path_prefix: &Path) -> TreeMap; + fn statuses(&self, path_prefix: &Path) -> Result; - /// Get the status of a given file in the working directory with respect to - /// the index. In the common case, when there are no changes, this only requires - /// an index lookup. The index stores the mtime of each file when it was added, - /// so there's no work to do if the mtime matches. - fn unstaged_status(&self, path: &RepoPath, mtime: SystemTime) -> Option; - - /// Get the status of a given file in the working directory with respect to - /// the HEAD commit. In the common case, when there are no changes, this only - /// requires an index lookup and blob comparison between the index and the HEAD - /// commit. The index stores the mtime of each file when it was added, so there's - /// no need to consider the working directory file if the mtime matches. - fn status(&self, path: &RepoPath, mtime: SystemTime) -> Option; + fn status(&self, path: &Path) -> Option { + Some(self.statuses(path).ok()?.entries.first()?.1) + } fn branches(&self) -> Result>; fn change_branch(&self, _: &str) -> Result<()>; @@ -137,65 +124,12 @@ impl GitRepository for RealGitRepository { head.target().map(|oid| oid.to_string()) } - fn staged_statuses(&self, path_prefix: &Path) -> TreeMap { - let mut map = TreeMap::default(); - - let mut options = git2::StatusOptions::new(); - options.pathspec(path_prefix); - options.show(StatusShow::Index); - - if let Some(statuses) = self.repository.statuses(Some(&mut options)).log_err() { - for status in statuses.iter() { - let path = RepoPath(PathBuf::try_from_bytes(status.path_bytes()).unwrap()); - let status = status.status(); - if !status.contains(git2::Status::IGNORED) { - if let Some(status) = read_status(status) { - map.insert(path, status) - } - } - } - } - map - } - - fn unstaged_status(&self, path: &RepoPath, mtime: SystemTime) -> Option { - // If the file has not changed since it was added to the index, then - // there can't be any changes. - if matches_index(&self.repository, path, mtime) { - return None; - } - - let mut options = git2::StatusOptions::new(); - options.pathspec(&path.0); - options.disable_pathspec_match(true); - options.include_untracked(true); - options.recurse_untracked_dirs(true); - options.include_unmodified(true); - options.show(StatusShow::Workdir); - - let statuses = self.repository.statuses(Some(&mut options)).log_err()?; - let status = statuses.get(0).and_then(|s| read_status(s.status())); - status - } - - fn status(&self, path: &RepoPath, mtime: SystemTime) -> Option { - let mut options = git2::StatusOptions::new(); - options.pathspec(&path.0); - options.disable_pathspec_match(true); - options.include_untracked(true); - options.recurse_untracked_dirs(true); - options.include_unmodified(true); - - // If the file has not changed since it was added to the index, then - // there's no need to examine the working directory file: just compare - // the blob in the index to the one in the HEAD commit. - if matches_index(&self.repository, path, mtime) { - options.show(StatusShow::Index); - } - - let statuses = self.repository.statuses(Some(&mut options)).log_err()?; - let status = statuses.get(0).and_then(|s| read_status(s.status())); - status + fn statuses(&self, path_prefix: &Path) -> Result { + let working_directory = self + .repository + .workdir() + .context("failed to read git work directory")?; + GitStatus::new(&self.git_binary_path, working_directory, path_prefix) } fn branches(&self) -> Result> { @@ -222,6 +156,7 @@ impl GitRepository for RealGitRepository { .collect(); Ok(valid_branches) } + fn change_branch(&self, name: &str) -> Result<()> { let revision = self.repository.find_branch(name, BranchType::Local)?; let revision = revision.get(); @@ -261,38 +196,6 @@ impl GitRepository for RealGitRepository { } } -fn matches_index(repo: &LibGitRepository, path: &RepoPath, mtime: SystemTime) -> bool { - if let Some(index) = repo.index().log_err() { - if let Some(entry) = index.get_path(path, 0) { - if let Some(mtime) = mtime.duration_since(SystemTime::UNIX_EPOCH).log_err() { - if entry.mtime.seconds() == mtime.as_secs() as i32 - && entry.mtime.nanoseconds() == mtime.subsec_nanos() - { - return true; - } - } - } - } - false -} - -fn read_status(status: git2::Status) -> Option { - if status.contains(git2::Status::CONFLICTED) { - Some(GitFileStatus::Conflict) - } else if status.intersects( - git2::Status::WT_MODIFIED - | git2::Status::WT_RENAMED - | git2::Status::INDEX_MODIFIED - | git2::Status::INDEX_RENAMED, - ) { - Some(GitFileStatus::Modified) - } else if status.intersects(git2::Status::WT_NEW | git2::Status::INDEX_NEW) { - Some(GitFileStatus::Added) - } else { - None - } -} - #[derive(Debug, Clone, Default)] pub struct FakeGitRepository { state: Arc>, @@ -333,24 +236,23 @@ impl GitRepository for FakeGitRepository { None } - fn staged_statuses(&self, path_prefix: &Path) -> TreeMap { - let mut map = TreeMap::default(); + fn statuses(&self, path_prefix: &Path) -> Result { let state = self.state.lock(); - for (repo_path, status) in state.worktree_statuses.iter() { - if repo_path.0.starts_with(path_prefix) { - map.insert(repo_path.to_owned(), status.to_owned()); - } - } - map - } - - fn unstaged_status(&self, _path: &RepoPath, _mtime: SystemTime) -> Option { - None - } - - fn status(&self, path: &RepoPath, _mtime: SystemTime) -> Option { - let state = self.state.lock(); - state.worktree_statuses.get(path).cloned() + let mut entries = state + .worktree_statuses + .iter() + .filter_map(|(repo_path, status)| { + if repo_path.0.starts_with(path_prefix) { + Some((repo_path.to_owned(), *status)) + } else { + None + } + }) + .collect::>(); + entries.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + Ok(GitStatus { + entries: entries.into(), + }) } fn branches(&self) -> Result> { diff --git a/crates/git/src/status.rs b/crates/git/src/status.rs new file mode 100644 index 0000000000..13a6919907 --- /dev/null +++ b/crates/git/src/status.rs @@ -0,0 +1,99 @@ +use crate::repository::{GitFileStatus, RepoPath}; +use anyhow::{anyhow, Result}; +use std::{ + path::{Path, PathBuf}, + process::{Command, Stdio}, + sync::Arc, +}; + +#[derive(Clone)] +pub struct GitStatus { + pub entries: Arc<[(RepoPath, GitFileStatus)]>, +} + +impl GitStatus { + pub(crate) fn new( + git_binary: &Path, + working_directory: &Path, + mut path_prefix: &Path, + ) -> Result { + let mut child = Command::new(git_binary); + + if path_prefix == Path::new("") { + path_prefix = Path::new("."); + } + + child + .current_dir(working_directory) + .args([ + "--no-optional-locks", + "status", + "--porcelain=v1", + "--untracked-files=all", + "-z", + ]) + .arg(path_prefix) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + #[cfg(windows)] + { + use std::os::windows::process::CommandExt; + child.creation_flags(windows::Win32::System::Threading::CREATE_NO_WINDOW.0); + } + + let child = child + .spawn() + .map_err(|e| anyhow!("Failed to start git status process: {}", e))?; + + let output = child + .wait_with_output() + .map_err(|e| anyhow!("Failed to read git blame output: {}", e))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(anyhow!("git status process failed: {}", stderr)); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + let mut entries = stdout + .split('\0') + .filter_map(|entry| { + if entry.is_char_boundary(3) { + let (status, path) = entry.split_at(3); + let status = status.trim(); + Some(( + RepoPath(PathBuf::from(path)), + match status { + "A" | "??" => GitFileStatus::Added, + "M" => GitFileStatus::Modified, + _ => return None, + }, + )) + } else { + None + } + }) + .collect::>(); + entries.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + Ok(Self { + entries: entries.into(), + }) + } + + pub fn get(&self, path: &Path) -> Option { + self.entries + .binary_search_by(|(repo_path, _)| repo_path.0.as_path().cmp(path)) + .ok() + .map(|index| self.entries[index].1) + } +} + +impl Default for GitStatus { + fn default() -> Self { + Self { + entries: Arc::new([]), + } + } +} diff --git a/crates/worktree/src/worktree.rs b/crates/worktree/src/worktree.rs index 4818f70aa0..bbc7430f9f 100644 --- a/crates/worktree/src/worktree.rs +++ b/crates/worktree/src/worktree.rs @@ -21,6 +21,7 @@ use futures::{ FutureExt as _, Stream, StreamExt, }; use fuzzy::CharBag; +use git::status::GitStatus; use git::{ repository::{GitFileStatus, GitRepository, RepoPath}, DOT_GIT, GITIGNORE, @@ -77,7 +78,7 @@ pub const FS_WATCH_LATENCY: Duration = Duration::from_millis(100); #[cfg(not(feature = "test-support"))] pub const FS_WATCH_LATENCY: Duration = Duration::from_millis(100); -const GIT_STATUS_UPDATE_BATCH_SIZE: usize = 100; +const GIT_STATUS_UPDATE_BATCH_SIZE: usize = 1024; #[derive(Copy, Clone, PartialEq, Eq, Debug, Hash, PartialOrd, Ord)] pub struct WorktreeId(usize); @@ -2019,18 +2020,13 @@ impl Snapshot { include_ignored: bool, path: &Path, ) -> Traversal { - let mut cursor = self.entries_by_path.cursor(); - cursor.seek(&TraversalTarget::Path(path), Bias::Left, &()); - let mut traversal = Traversal { - cursor, + Traversal::new( + &self.entries_by_path, include_files, include_dirs, include_ignored, - }; - if traversal.end_offset() == traversal.start_offset() { - traversal.next(); - } - traversal + path, + ) } pub fn files(&self, include_ignored: bool, start: usize) -> Traversal { @@ -2558,8 +2554,12 @@ impl BackgroundScannerState { if let Ok(repo_path) = repo_entry.relativize(&self.snapshot, &path) { containing_repository = Some(ScanJobContainingRepository { work_directory: workdir_path, - repository: repo.repo_ptr.clone(), - staged_statuses: repo.repo_ptr.lock().staged_statuses(&repo_path), + statuses: repo + .repo_ptr + .lock() + .statuses(&repo_path) + .log_err() + .unwrap_or_default(), }); } } @@ -3832,11 +3832,14 @@ impl BackgroundScanner { .lock() .build_git_repository(child_path.clone(), self.fs.as_ref()) { - let staged_statuses = repository.lock().staged_statuses(Path::new("")); + let statuses = repository + .lock() + .statuses(Path::new("")) + .log_err() + .unwrap_or_default(); containing_repository = Some(ScanJobContainingRepository { work_directory, - repository, - staged_statuses, + statuses, }); } } else if child_name == *GITIGNORE { @@ -3946,13 +3949,8 @@ impl BackgroundScanner { if !child_entry.is_ignored { if let Some(repo) = &containing_repository { if let Ok(repo_path) = child_entry.path.strip_prefix(&repo.work_directory) { - if let Some(mtime) = child_entry.mtime { - let repo_path = RepoPath(repo_path.into()); - child_entry.git_status = combine_git_statuses( - repo.staged_statuses.get(&repo_path).copied(), - repo.repository.lock().unstaged_status(&repo_path, mtime), - ); - } + let repo_path = RepoPath(repo_path.into()); + child_entry.git_status = repo.statuses.get(&repo_path); } } } @@ -4079,10 +4077,7 @@ impl BackgroundScanner { if !is_dir && !fs_entry.is_ignored && !fs_entry.is_external { if let Some((repo_entry, repo)) = state.snapshot.repo_for_path(path) { if let Ok(repo_path) = repo_entry.relativize(&state.snapshot, path) { - if let Some(mtime) = fs_entry.mtime { - let repo = repo.repo_ptr.lock(); - fs_entry.git_status = repo.status(&repo_path, mtime); - } + fs_entry.git_status = repo.repo_ptr.lock().status(&repo_path); } } } @@ -4267,11 +4262,8 @@ impl BackgroundScanner { path_entry.is_ignored = entry.is_ignored; if !entry.is_dir() && !entry.is_ignored && !entry.is_external { if let Some((ref repo_entry, local_repo)) = repo { - if let Some(mtime) = &entry.mtime { - if let Ok(repo_path) = repo_entry.relativize(&snapshot, &entry.path) { - let repo = local_repo.repo_ptr.lock(); - entry.git_status = repo.status(&repo_path, *mtime); - } + if let Ok(repo_path) = repo_entry.relativize(&snapshot, &entry.path) { + entry.git_status = local_repo.repo_ptr.lock().status(&repo_path); } } } @@ -4351,7 +4343,17 @@ impl BackgroundScanner { } }; - let staged_statuses = repository.lock().staged_statuses(Path::new("")); + let statuses = repository + .lock() + .statuses(Path::new("")) + .log_err() + .unwrap_or_default(); + let entries = state.snapshot.entries_by_path.clone(); + let location_in_repo = state + .snapshot + .repository_entries + .get(&work_dir) + .and_then(|repo| repo.location_in_repo.clone()); let mut files = state .snapshot @@ -4363,10 +4365,11 @@ impl BackgroundScanner { smol::block_on(update_job_tx.send(UpdateGitStatusesJob { start_path: start_path.clone(), end_path: end_path.clone(), + entries: entries.clone(), + location_in_repo: location_in_repo.clone(), containing_repository: ScanJobContainingRepository { work_directory: work_dir.clone(), - repository: repository.clone(), - staged_statuses: staged_statuses.clone(), + statuses: statuses.clone(), }, })) .unwrap(); @@ -4414,16 +4417,12 @@ impl BackgroundScanner { self.executor .scoped(|scope| { - // Git status updates are currently not very parallelizable, - // because they need to lock the git repository. Limit the number - // of workers so that - for _ in 0..self.executor.num_cpus().min(3) { + for _ in 0..self.executor.num_cpus() { scope.spawn(async { - let mut entries = Vec::with_capacity(GIT_STATUS_UPDATE_BATCH_SIZE); loop { select_biased! { // Process any path refresh requests before moving on to process - // the queue of ignore statuses. + // the queue of git statuses. request = self.scan_requests_rx.recv().fuse() => { let Ok(request) = request else { break }; if !self.process_scan_request(request, true).await { @@ -4434,7 +4433,7 @@ impl BackgroundScanner { // Process git status updates in batches. job = update_job_rx.recv().fuse() => { let Ok(job) = job else { break }; - self.update_git_statuses(job, &mut entries); + self.update_git_statuses(job); } } } @@ -4445,61 +4444,30 @@ impl BackgroundScanner { } /// Update the git statuses for a given batch of entries. - fn update_git_statuses(&self, job: UpdateGitStatusesJob, entries: &mut Vec) { + fn update_git_statuses(&self, job: UpdateGitStatusesJob) { + // Determine which entries in this batch have changed their git status. let t0 = Instant::now(); - let repo_work_dir = &job.containing_repository.work_directory; - let state = self.state.lock(); - let Some(repo_entry) = state - .snapshot - .repository_entries - .get(&repo_work_dir) - .cloned() - else { - return; - }; - - // Retrieve a batch of entries for this job, and then release the state lock. - entries.clear(); - for entry in state - .snapshot - .traverse_from_path(true, false, false, &job.start_path) - { + let mut edits = Vec::new(); + for entry in Traversal::new(&job.entries, true, false, false, &job.start_path) { if job .end_path .as_ref() .map_or(false, |end| &entry.path >= end) - || !entry.path.starts_with(&repo_work_dir) { break; } - entries.push(entry.clone()); - } - drop(state); - - // Determine which entries in this batch have changed their git status. - let mut edits = vec![]; - for entry in entries.iter() { - let Ok(repo_path) = entry.path.strip_prefix(&repo_work_dir) else { + let Ok(repo_path) = entry + .path + .strip_prefix(&job.containing_repository.work_directory) + else { continue; }; - let Some(mtime) = entry.mtime else { - continue; - }; - let repo_path = RepoPath(if let Some(location) = &repo_entry.location_in_repo { + let repo_path = RepoPath(if let Some(location) = &job.location_in_repo { location.join(repo_path) } else { repo_path.to_path_buf() }); - let git_status = combine_git_statuses( - job.containing_repository - .staged_statuses - .get(&repo_path) - .copied(), - job.containing_repository - .repository - .lock() - .unstaged_status(&repo_path, mtime), - ); + let git_status = job.containing_repository.statuses.get(&repo_path); if entry.git_status != git_status { let mut entry = entry.clone(); entry.git_status = git_status; @@ -4508,20 +4476,22 @@ impl BackgroundScanner { } // Apply the git status changes. - let mut state = self.state.lock(); - let path_changes = edits.iter().map(|edit| { - if let Edit::Insert(entry) = edit { - entry.path.clone() - } else { - unreachable!() - } - }); - util::extend_sorted(&mut state.changed_paths, path_changes, usize::MAX, Ord::cmp); - state.snapshot.entries_by_path.edit(edits, &()); + if edits.len() > 0 { + let mut state = self.state.lock(); + let path_changes = edits.iter().map(|edit| { + if let Edit::Insert(entry) = edit { + entry.path.clone() + } else { + unreachable!() + } + }); + util::extend_sorted(&mut state.changed_paths, path_changes, usize::MAX, Ord::cmp); + state.snapshot.entries_by_path.edit(edits, &()); + } log::trace!( - "refreshed git status of {} entries starting with {} in {:?}", - entries.len(), + "refreshed git status of entries starting with {} in {:?}", + // entries.len(), job.start_path.display(), t0.elapsed() ); @@ -4682,8 +4652,7 @@ struct ScanJob { #[derive(Clone)] struct ScanJobContainingRepository { work_directory: RepositoryWorkDirectory, - repository: Arc>, - staged_statuses: TreeMap, + statuses: GitStatus, } struct UpdateIgnoreStatusJob { @@ -4694,9 +4663,11 @@ struct UpdateIgnoreStatusJob { } struct UpdateGitStatusesJob { + entries: SumTree, start_path: Arc, end_path: Option>, containing_repository: ScanJobContainingRepository, + location_in_repo: Option>, } pub trait WorktreeModelHandle { @@ -4906,6 +4877,26 @@ pub struct Traversal<'a> { } impl<'a> Traversal<'a> { + fn new( + entries: &'a SumTree, + include_files: bool, + include_dirs: bool, + include_ignored: bool, + start_path: &Path, + ) -> Self { + let mut cursor = entries.cursor(); + cursor.seek(&TraversalTarget::Path(start_path), Bias::Left, &()); + let mut traversal = Self { + cursor, + include_files, + include_dirs, + include_ignored, + }; + if traversal.end_offset() == traversal.start_offset() { + traversal.next(); + } + traversal + } pub fn advance(&mut self) -> bool { self.advance_by(1) } @@ -5079,25 +5070,6 @@ impl<'a> TryFrom<(&'a CharBag, proto::Entry)> for Entry { } } -fn combine_git_statuses( - staged: Option, - unstaged: Option, -) -> Option { - if let Some(staged) = staged { - if let Some(unstaged) = unstaged { - if unstaged == staged { - Some(staged) - } else { - Some(GitFileStatus::Modified) - } - } else { - Some(staged) - } - } else { - unstaged - } -} - fn git_status_from_proto(git_status: Option) -> Option { git_status.and_then(|status| { proto::GitStatus::from_i32(status).map(|status| match status {