From 79798c74074886ef466f9cbf4d0114d1d90e1541 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 28 Aug 2024 21:52:48 +0200 Subject: [PATCH] Exclude big files when performing a worktree diff. This was lost previously when switching it over to a read-only implementation. Implementing it with an ignore list will take time, 400ms in the GitLab repository, but it's not slower than it was before and it is always preferred to not dump objects into the ODB unnecessarily. --- Cargo.lock | 2 + crates/gitbutler-branch-actions/src/status.rs | 10 ++-- crates/gitbutler-command-context/Cargo.toml | 1 + crates/gitbutler-command-context/src/lib.rs | 3 + .../src/repository_ext.rs | 50 +++++++++++++++++ crates/gitbutler-diff/src/diff.rs | 3 +- crates/gitbutler-oplog/Cargo.toml | 1 + crates/gitbutler-oplog/src/oplog.rs | 56 ++----------------- 8 files changed, 69 insertions(+), 57 deletions(-) create mode 100644 crates/gitbutler-command-context/src/repository_ext.rs diff --git a/Cargo.lock b/Cargo.lock index b94b5110f..299146394 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2211,6 +2211,7 @@ name = "gitbutler-command-context" version = "0.0.0" dependencies = [ "anyhow", + "bstr", "git2", "gitbutler-project", "gix", @@ -2374,6 +2375,7 @@ dependencies = [ "anyhow", "git2", "gitbutler-branch", + "gitbutler-command-context", "gitbutler-diff", "gitbutler-fs", "gitbutler-project", diff --git a/crates/gitbutler-branch-actions/src/status.rs b/crates/gitbutler-branch-actions/src/status.rs index dc2f5ae28..65126359e 100644 --- a/crates/gitbutler-branch-actions/src/status.rs +++ b/crates/gitbutler-branch-actions/src/status.rs @@ -47,16 +47,16 @@ pub fn get_applied_status_cached( worktree_changes: Option, ) -> Result { assure_open_workspace_mode(ctx).context("ng applied status requires open workspace mode")?; - // TODO(ST): this was `get_workspace_head()`, which is slow and ideally, we don't dynamically - // calculate which should already be 'fixed' - why do we have the integration branch - // if we can't assume it's in the right state? So ideally, we assure that the code - // that affects the integration branch also updates it? - let integration_commit_id = ctx.repository().head_commit()?.id(); let mut virtual_branches = ctx .project() .virtual_branches() .list_branches_in_workspace()?; let base_file_diffs = worktree_changes.map(Ok).unwrap_or_else(|| { + // TODO(ST): this was `get_workspace_head()`, which is slow and ideally, we don't dynamically + // calculate which should already be 'fixed' - why do we have the integration branch + // if we can't assume it's in the right state? So ideally, we assure that the code + // that affects the integration branch also updates it? + let integration_commit_id = ctx.repository().head_commit()?.id(); gitbutler_diff::workdir(ctx.repository(), &integration_commit_id.to_owned()) .context("failed to diff workdir") })?; diff --git a/crates/gitbutler-command-context/Cargo.toml b/crates/gitbutler-command-context/Cargo.toml index df826abf7..cd5d07620 100644 --- a/crates/gitbutler-command-context/Cargo.toml +++ b/crates/gitbutler-command-context/Cargo.toml @@ -12,3 +12,4 @@ gix.workspace = true tracing.workspace = true gitbutler-project.workspace = true itertools = "0.13" +bstr = "1.10.0" diff --git a/crates/gitbutler-command-context/src/lib.rs b/crates/gitbutler-command-context/src/lib.rs index ca8a45422..c4dc09dc6 100644 --- a/crates/gitbutler-command-context/src/lib.rs +++ b/crates/gitbutler-command-context/src/lib.rs @@ -98,3 +98,6 @@ impl CommandContext { )?) } } + +mod repository_ext; +pub use repository_ext::RepositoryExtLite; diff --git a/crates/gitbutler-command-context/src/repository_ext.rs b/crates/gitbutler-command-context/src/repository_ext.rs new file mode 100644 index 000000000..58841ea89 --- /dev/null +++ b/crates/gitbutler-command-context/src/repository_ext.rs @@ -0,0 +1,50 @@ +use anyhow::{Context, Result}; +use gix::bstr::{BString, ByteVec}; +use tracing::instrument; + +/// An extension trait that should avoid pulling in large amounts of dependency so it can be used +/// in more places without causing cycles. +/// `gitbutler_repo::RepositoryExt` may not be usable everywhere due to that. +pub trait RepositoryExtLite { + /// Exclude files that are larger than `limit_in_bytes` (eg. database.sql which may never be intended to be committed) + /// so they don't show up in the next diff. + fn ignore_large_files_in_diffs(&self, limit_in_bytes: u64) -> Result<()>; +} + +impl RepositoryExtLite for git2::Repository { + #[instrument(level = tracing::Level::DEBUG, skip(self), err(Debug))] + fn ignore_large_files_in_diffs(&self, limit_in_bytes: u64) -> Result<()> { + use gix::bstr::ByteSlice; + let repo = gix::open(self.path())?; + let worktree_dir = repo + .work_dir() + .context("All repos are expected to have a worktree")?; + let files_to_exclude: Vec<_> = repo + .dirwalk_iter( + repo.index_or_empty()?, + None::, + Default::default(), + repo.dirwalk_options()? + .emit_ignored(None) + .emit_pruned(false) + .emit_untracked(gix::dir::walk::EmissionMode::Matching), + )? + .filter_map(Result::ok) + .filter_map(|item| { + let path = worktree_dir.join(gix::path::from_bstr(item.entry.rela_path.as_bstr())); + let file_is_too_large = path + .metadata() + .map_or(false, |md| md.is_file() && md.len() > limit_in_bytes); + file_is_too_large + .then(|| Vec::from(item.entry.rela_path).into_string().ok()) + .flatten() + }) + .collect(); + // TODO(ST): refactor this to be path-safe and ' ' save - the returned list is space separated (!!) + // Just make sure this isn't needed anymore. + let ignore_list = files_to_exclude.join(" "); + // In-memory, libgit2 internal ignore rule + self.add_ignore_rule(&ignore_list)?; + Ok(()) + } +} diff --git a/crates/gitbutler-diff/src/diff.rs b/crates/gitbutler-diff/src/diff.rs index 393f27c50..0315d8cdb 100644 --- a/crates/gitbutler-diff/src/diff.rs +++ b/crates/gitbutler-diff/src/diff.rs @@ -3,6 +3,7 @@ use std::{borrow::Cow, collections::HashMap, path::PathBuf, str}; use anyhow::{Context, Result}; use bstr::{BStr, BString, ByteSlice, ByteVec}; use gitbutler_cherry_pick::RepositoryExt; +use gitbutler_command_context::RepositoryExtLite; use gitbutler_serde::BStringForFrontend; use serde::{Deserialize, Serialize}; use tracing::instrument; @@ -155,8 +156,8 @@ pub fn workdir(repo: &git2::Repository, commit_oid: &git2::Oid) -> Result Result { - let repo = gix::open(repo.path())?; - let files_to_exclude: Vec<_> = repo - .dirwalk_iter( - repo.index_or_empty()?, - None::, - Default::default(), - repo.dirwalk_options()? - .emit_ignored(None) - .emit_pruned(false) - .emit_untracked(gix::dir::walk::EmissionMode::Matching), - )? - .filter_map(Result::ok) - .filter_map(|item| { - let path = worktree_dir.join(gix::path::from_bstr(item.entry.rela_path.as_bstr())); - let file_is_too_large = path.metadata().map_or(false, |md| { - md.is_file() && md.len() > SNAPSHOT_FILE_LIMIT_BYTES - }); - file_is_too_large - .then(|| Vec::from(item.entry.rela_path).into_string().ok()) - .flatten() - }) - .collect(); - Ok(files_to_exclude.join(" ")) -} - /// Returns the number of lines of code (added + removed) since the last snapshot in `project`. /// Includes untracked files. /// `repo` is an already opened project repository. @@ -752,13 +712,7 @@ fn lines_since_snapshot(project: &Project, repo: &git2::Repository) -> Result