Exclude big files when performing a worktree diff.

This was lost previously when switching it over to a read-only
implementation.
Implementing it with an ignore list will take time, 400ms in the GitLab
repository, but it's not slower than it was before and it is always
preferred to not dump objects into the ODB unnecessarily.
This commit is contained in:
Sebastian Thiel 2024-08-28 21:52:48 +02:00
parent a0e236110a
commit 79798c7407
No known key found for this signature in database
GPG Key ID: 9CB5EE7895E8268B
8 changed files with 69 additions and 57 deletions

2
Cargo.lock generated
View File

@ -2211,6 +2211,7 @@ name = "gitbutler-command-context"
version = "0.0.0"
dependencies = [
"anyhow",
"bstr",
"git2",
"gitbutler-project",
"gix",
@ -2374,6 +2375,7 @@ dependencies = [
"anyhow",
"git2",
"gitbutler-branch",
"gitbutler-command-context",
"gitbutler-diff",
"gitbutler-fs",
"gitbutler-project",

View File

@ -47,16 +47,16 @@ pub fn get_applied_status_cached(
worktree_changes: Option<gitbutler_diff::DiffByPathMap>,
) -> Result<VirtualBranchesStatus> {
assure_open_workspace_mode(ctx).context("ng applied status requires open workspace mode")?;
// TODO(ST): this was `get_workspace_head()`, which is slow and ideally, we don't dynamically
// calculate which should already be 'fixed' - why do we have the integration branch
// if we can't assume it's in the right state? So ideally, we assure that the code
// that affects the integration branch also updates it?
let integration_commit_id = ctx.repository().head_commit()?.id();
let mut virtual_branches = ctx
.project()
.virtual_branches()
.list_branches_in_workspace()?;
let base_file_diffs = worktree_changes.map(Ok).unwrap_or_else(|| {
// TODO(ST): this was `get_workspace_head()`, which is slow and ideally, we don't dynamically
// calculate which should already be 'fixed' - why do we have the integration branch
// if we can't assume it's in the right state? So ideally, we assure that the code
// that affects the integration branch also updates it?
let integration_commit_id = ctx.repository().head_commit()?.id();
gitbutler_diff::workdir(ctx.repository(), &integration_commit_id.to_owned())
.context("failed to diff workdir")
})?;

View File

@ -12,3 +12,4 @@ gix.workspace = true
tracing.workspace = true
gitbutler-project.workspace = true
itertools = "0.13"
bstr = "1.10.0"

View File

@ -98,3 +98,6 @@ impl CommandContext {
)?)
}
}
mod repository_ext;
pub use repository_ext::RepositoryExtLite;

View File

@ -0,0 +1,50 @@
use anyhow::{Context, Result};
use gix::bstr::{BString, ByteVec};
use tracing::instrument;
/// An extension trait that should avoid pulling in large amounts of dependency so it can be used
/// in more places without causing cycles.
/// `gitbutler_repo::RepositoryExt` may not be usable everywhere due to that.
pub trait RepositoryExtLite {
/// Exclude files that are larger than `limit_in_bytes` (eg. database.sql which may never be intended to be committed)
/// so they don't show up in the next diff.
fn ignore_large_files_in_diffs(&self, limit_in_bytes: u64) -> Result<()>;
}
impl RepositoryExtLite for git2::Repository {
#[instrument(level = tracing::Level::DEBUG, skip(self), err(Debug))]
fn ignore_large_files_in_diffs(&self, limit_in_bytes: u64) -> Result<()> {
use gix::bstr::ByteSlice;
let repo = gix::open(self.path())?;
let worktree_dir = repo
.work_dir()
.context("All repos are expected to have a worktree")?;
let files_to_exclude: Vec<_> = repo
.dirwalk_iter(
repo.index_or_empty()?,
None::<BString>,
Default::default(),
repo.dirwalk_options()?
.emit_ignored(None)
.emit_pruned(false)
.emit_untracked(gix::dir::walk::EmissionMode::Matching),
)?
.filter_map(Result::ok)
.filter_map(|item| {
let path = worktree_dir.join(gix::path::from_bstr(item.entry.rela_path.as_bstr()));
let file_is_too_large = path
.metadata()
.map_or(false, |md| md.is_file() && md.len() > limit_in_bytes);
file_is_too_large
.then(|| Vec::from(item.entry.rela_path).into_string().ok())
.flatten()
})
.collect();
// TODO(ST): refactor this to be path-safe and ' ' save - the returned list is space separated (!!)
// Just make sure this isn't needed anymore.
let ignore_list = files_to_exclude.join(" ");
// In-memory, libgit2 internal ignore rule
self.add_ignore_rule(&ignore_list)?;
Ok(())
}
}

View File

@ -3,6 +3,7 @@ use std::{borrow::Cow, collections::HashMap, path::PathBuf, str};
use anyhow::{Context, Result};
use bstr::{BStr, BString, ByteSlice, ByteVec};
use gitbutler_cherry_pick::RepositoryExt;
use gitbutler_command_context::RepositoryExtLite;
use gitbutler_serde::BStringForFrontend;
use serde::{Deserialize, Serialize};
use tracing::instrument;
@ -155,8 +156,8 @@ pub fn workdir(repo: &git2::Repository, commit_oid: &git2::Oid) -> Result<DiffBy
for conflict_path_to_resolve in paths_to_add {
index.add_path(conflict_path_to_resolve.as_ref())?;
}
repo.ignore_large_files_in_diffs(50_000_000)?;
let diff = repo.diff_tree_to_workdir_with_index(Some(&old_tree), Some(&mut diff_opts))?;
// TODO(ST): bring back support for skipped (large) files.
hunks_by_filepath(Some(repo), &diff)
}

View File

@ -15,6 +15,7 @@ strum = { version = "0.26", features = ["derive"] }
tracing.workspace = true
gix = { workspace = true, features = ["dirwalk", "credentials", "parallel"] }
toml.workspace = true
gitbutler-command-context.workspace = true
gitbutler-project.workspace = true
gitbutler-branch.workspace = true
gitbutler-serde.workspace = true

View File

@ -1,7 +1,7 @@
use std::{
collections::{hash_map::Entry, HashMap},
fs,
path::{Path, PathBuf},
path::PathBuf,
str::{from_utf8, FromStr},
time::Duration,
};
@ -9,13 +9,13 @@ use std::{
use anyhow::{anyhow, bail, Context, Result};
use git2::{DiffOptions, FileMode};
use gitbutler_branch::{Branch, SignaturePurpose, VirtualBranchesHandle, VirtualBranchesState};
use gitbutler_command_context::RepositoryExtLite;
use gitbutler_diff::{hunks_by_filepath, FileDiff};
use gitbutler_project::{
access::{WorktreeReadPermission, WorktreeWritePermission},
Project,
};
use gitbutler_repo::RepositoryExt;
use gix::bstr::{BString, ByteSlice, ByteVec};
use tracing::instrument;
use super::{
@ -286,11 +286,7 @@ impl OplogExt for Project {
let old_wd_tree_id = tree_from_applied_vbranches(&repo, commit.parent(0)?.id())?;
let old_wd_tree = repo.find_tree(old_wd_tree_id)?;
// Exclude files that are larger than the limit (eg. database.sql which may never be intended to be committed)
let files_to_exclude =
worktree_files_larger_than_limit_as_git2_ignore_rule(&repo, worktree_dir)?;
// In-memory, libgit2 internal ignore rule
repo.add_ignore_rule(&files_to_exclude)?;
repo.ignore_large_files_in_diffs(SNAPSHOT_FILE_LIMIT_BYTES)?;
let mut diff_opts = git2::DiffOptions::new();
diff_opts
@ -579,11 +575,7 @@ fn restore_snapshot(
let workdir_tree_id = tree_from_applied_vbranches(&repo, snapshot_commit_id)?;
let workdir_tree = repo.find_tree(workdir_tree_id)?;
// Exclude files that are larger than the limit (eg. database.sql which may never be intended to be committed)
let files_to_exclude =
worktree_files_larger_than_limit_as_git2_ignore_rule(&repo, worktree_dir)?;
// In-memory, libgit2 internal ignore rule
repo.add_ignore_rule(&files_to_exclude)?;
repo.ignore_large_files_in_diffs(SNAPSHOT_FILE_LIMIT_BYTES)?;
// Define the checkout builder
let mut checkout_builder = git2::build::CheckoutBuilder::new();
@ -711,38 +703,6 @@ fn write_conflicts_tree(
Ok(conflicts_tree)
}
/// Exclude files that are larger than the limit (eg. database.sql which may never be intended to be committed)
/// TODO(ST): refactor this to be path-safe and ' ' save - the returned list is space separated (!!)
#[instrument(level = tracing::Level::DEBUG, skip(repo), err(Debug))]
fn worktree_files_larger_than_limit_as_git2_ignore_rule(
repo: &git2::Repository,
worktree_dir: &Path,
) -> Result<String> {
let repo = gix::open(repo.path())?;
let files_to_exclude: Vec<_> = repo
.dirwalk_iter(
repo.index_or_empty()?,
None::<BString>,
Default::default(),
repo.dirwalk_options()?
.emit_ignored(None)
.emit_pruned(false)
.emit_untracked(gix::dir::walk::EmissionMode::Matching),
)?
.filter_map(Result::ok)
.filter_map(|item| {
let path = worktree_dir.join(gix::path::from_bstr(item.entry.rela_path.as_bstr()));
let file_is_too_large = path.metadata().map_or(false, |md| {
md.is_file() && md.len() > SNAPSHOT_FILE_LIMIT_BYTES
});
file_is_too_large
.then(|| Vec::from(item.entry.rela_path).into_string().ok())
.flatten()
})
.collect();
Ok(files_to_exclude.join(" "))
}
/// Returns the number of lines of code (added + removed) since the last snapshot in `project`.
/// Includes untracked files.
/// `repo` is an already opened project repository.
@ -752,13 +712,7 @@ fn lines_since_snapshot(project: &Project, repo: &git2::Repository) -> Result<us
// This looks at the diff between the tree of the currently selected as 'default' branch (where new changes go)
// and that same tree in the last snapshot. For some reason, comparing workdir to the workdir subree from
// the snapshot simply does not give us what we need here, so instead using tree to tree comparison.
let worktree_dir = project.path.as_path();
// Exclude files that are larger than the limit (eg. database.sql which may never be intended to be committed)
let files_to_exclude =
worktree_files_larger_than_limit_as_git2_ignore_rule(repo, worktree_dir)?;
// In-memory, libgit2 internal ignore rule
repo.add_ignore_rule(&files_to_exclude)?;
repo.ignore_large_files_in_diffs(SNAPSHOT_FILE_LIMIT_BYTES)?;
let oplog_state = OplogHandle::new(&project.gb_dir());
let Some(oplog_commit_id) = oplog_state.oplog_head()? else {