Adjust GitHunk to not require UTF8 for diffs

This will make the diffing engine more correct overall, as the
potentially lossy conversion to UTF-8 now only happens for display.

This will also prevent a diff from being considered binary even though it is not.
This commit is contained in:
Sebastian Thiel 2024-04-19 20:53:10 +02:00
parent fe950ec00a
commit 27714d8e0d
No known key found for this signature in database
GPG Key ID: 9CB5EE7895E8268B
6 changed files with 122 additions and 95 deletions

View File

@ -15,7 +15,7 @@ toml = "0.8.12"
anyhow = "1.0.81"
async-trait = "0.1.79"
backtrace = { version = "0.3.71", optional = true }
bstr = "1.9.1"
bstr = { version = "1.9.1", features = ["serde"] }
chrono = { version = "0.4.37", features = ["serde"] }
diffy = "0.3.0"
filetime = "0.2.23"

View File

@ -2,6 +2,7 @@ use std::path::PathBuf;
use std::{collections::HashMap, str};
use anyhow::{Context, Result};
use bstr::{BStr, BString, ByteSlice, ByteVec};
use serde::{Deserialize, Serialize};
use tracing::instrument;
@ -42,7 +43,7 @@ pub struct GitHunk {
pub old_lines: u32,
pub new_start: u32,
pub new_lines: u32,
pub diff: String,
pub diff: BString,
pub binary: bool,
pub change_type: ChangeType,
}
@ -179,32 +180,26 @@ fn hunks_by_filepath(
let old_start = hunk.as_ref().map_or(0, git2::DiffHunk::old_start);
let old_lines = hunk.as_ref().map_or(0, git2::DiffHunk::old_lines);
let assume_binary = || {
let full_path = repository.workdir().unwrap().join(file_path);
// save the file_path to the odb
if !delta.new_file().id().is_zero() && full_path.exists() {
// the binary file wasn't deleted
repository.blob_path(full_path.as_path()).unwrap();
}
Some((delta.new_file().id().to_string(), true))
};
let line = match line.origin() {
'+' | '-' | ' ' => {
if let Ok(content) = str::from_utf8(line.content()) {
Some((format!("{}{}", line.origin(), content), false))
} else {
assume_binary()
}
let mut buf = BString::new(Vec::with_capacity(line.content().len() + 1));
buf.push_char(line.origin());
buf.push_str(line.content());
Some((buf, false))
}
'B' => {
let full_path = repository.workdir().unwrap().join(file_path);
// save the file_path to the odb
if !delta.new_file().id().is_zero() && full_path.exists() {
// the binary file wasn't deleted
repository.blob_path(full_path.as_path()).unwrap();
}
Some((delta.new_file().id().to_string().into(), true))
}
'B' => assume_binary(),
'F' => None,
_ => {
if let Ok(content) = str::from_utf8(line.content()) {
Some((content.to_string(), false))
} else {
assume_binary()
}
let line: BString = line.content().into();
Some((line, false))
}
};
if let Some((line, is_binary)) = line {
@ -277,6 +272,12 @@ fn hunks_by_filepath(
.map(|(k, v)| {
if let Some(binary_hunk) = v.iter().find(|hunk| hunk.binary) {
if v.len() > 1 {
// TODO(ST): Would it be possible here to permanently discard lines because
// they are considered binary? After all, here we create a new change,
// turning multiple binary hunks into single line hunk (somehow).
// Probably answer: it's likely that this data is only created on the fly,
// and only the original source data is relevant - validate it.
// But: virtual branches definitely apply hunks.
// if there are multiple hunks with binary among them, then the binary hunk
// takes precedence
(
@ -303,7 +304,7 @@ fn hunks_by_filepath(
old_lines: 0,
new_start: 0,
new_lines: 0,
diff: String::new(),
diff: Default::default(),
binary: false,
change_type: ChangeType::Modified,
}],
@ -321,51 +322,58 @@ fn hunks_by_filepath(
}
// Returns `None` if the patch header cannot be reversed.
fn reverse_patch_header(header: &str) -> Option<String> {
use itertools::Itertools;
let mut parts = header.split_whitespace();
fn reverse_patch_header(header: &BStr) -> Option<BString> {
let mut parts = header.split(|b| b.is_ascii_whitespace());
match parts.next() {
Some("@@") => {}
Some(b"@@") => {}
_ => return None,
};
let old_range = parts.next()?;
let new_range = parts.next()?;
match parts.next() {
Some("@@") => {}
_ => return None,
if parts.next() != Some(b"@@") {
return None;
};
Some(format!(
"@@ {} {} @@ {}",
new_range.replace('+', "-"),
old_range.replace('-', "+"),
parts.join(" ")
))
let mut buf: BString = "@@ ".into();
buf.extend_from_slice(&new_range.replace(b"+", b"-"));
buf.push(b' ');
buf.extend_from_slice(&old_range.replace(b"-", b"+"));
buf.push_str(b" @@ ");
let mut at_least_one_part = false;
for part in parts {
buf.extend_from_slice(part);
buf.push(b' ');
at_least_one_part = true;
}
if at_least_one_part {
buf.pop();
}
Some(buf)
}
fn reverse_patch(patch: &str) -> Option<String> {
let mut reversed = String::new();
fn reverse_patch(patch: &BStr) -> Option<BString> {
let mut reversed = BString::default();
for line in patch.lines() {
if line.starts_with("@@") {
if let Some(header) = reverse_patch_header(line) {
if line.starts_with(b"@@") {
if let Some(header) = reverse_patch_header(line.as_ref()) {
reversed.push_str(&header);
reversed.push('\n');
reversed.push(b'\n');
} else {
return None;
}
} else if line.starts_with('+') {
reversed.push_str(&line.replacen('+', "-", 1));
reversed.push('\n');
} else if line.starts_with('-') {
reversed.push_str(&line.replacen('-', "+", 1));
reversed.push('\n');
} else if line.starts_with(b"+") {
reversed.push_str(&line.replacen(b"+", b"-", 1));
reversed.push(b'\n');
} else if line.starts_with(b"-") {
reversed.push_str(&line.replacen(b"-", b"+", 1));
reversed.push(b'\n');
} else {
reversed.push_str(line);
reversed.push('\n');
reversed.push(b'\n');
}
}
Some(reversed)
@ -376,7 +384,7 @@ pub fn reverse_hunk(hunk: &GitHunk) -> Option<GitHunk> {
if hunk.binary {
None
} else {
reverse_patch(&hunk.diff).map(|diff| GitHunk {
reverse_patch(hunk.diff.as_ref()).map(|diff| GitHunk {
old_start: hunk.new_start,
old_lines: hunk.new_lines,
new_start: hunk.old_start,

View File

@ -7,6 +7,14 @@ pub struct Oid {
oid: git2::Oid,
}
impl Oid {
/// Creates an `Oid` from a raw object-id byte slice by delegating to
/// [`git2::Oid::from_bytes`].
///
/// # Errors
///
/// Returns the underlying `git2::Error` if `bytes` does not have the
/// length of a valid object id.
pub fn from_bytes(bytes: &[u8]) -> Result<Self, git2::Error> {
Ok(Self {
oid: git2::Oid::from_bytes(bytes)?,
})
}
}
impl Default for Oid {
fn default() -> Self {
git2::Oid::zero().into()

View File

@ -1,6 +1,7 @@
use std::{fmt::Display, ops::RangeInclusive, str::FromStr};
use anyhow::{anyhow, Context, Result};
use bstr::{BStr, ByteSlice};
use crate::git::diff;
@ -17,7 +18,7 @@ impl From<&diff::GitHunk> for Hunk {
Hunk {
start: hunk.new_start,
end: hunk.new_start + hunk.new_lines,
hash: Some(Hunk::hash(&hunk.diff)),
hash: Some(Hunk::hash(hunk.diff.as_ref())),
timestamp_ms: None,
}
}
@ -120,6 +121,7 @@ impl Hunk {
}
}
// TODO(ST): self - prevent many unnecessary copies
pub fn with_hash(&self, hash: &str) -> Self {
Hunk {
start: self.start,
@ -129,6 +131,7 @@ impl Hunk {
}
}
// TODO(ST): self - prevent many unnecessary copies
pub fn with_timestamp(&self, timestamp_ms: u128) -> Self {
Hunk {
start: self.start,
@ -157,12 +160,12 @@ impl Hunk {
self.start == other.new_start && self.end == other.new_start + other.new_lines
}
pub fn hash(diff: &str) -> String {
let addition = diff
.lines()
.skip(1) // skip the first line which is the diff header
.collect::<Vec<_>>()
.join("\n");
format!("{:x}", md5::compute(addition))
// TODO(perf): keep the hash as digest to avoid allocation.
pub fn hash(diff: &BStr) -> String {
let mut ctx = md5::Context::new();
diff.lines()
.skip(1) // skip the first line which is the diff header.
.for_each(|line| ctx.consume(line));
format!("{:x}", ctx.compute())
}
}

View File

@ -1,5 +1,5 @@
#[cfg(target_family = "unix")]
use std::os::unix::prelude::*;
use std::os::unix::prelude::PermissionsExt;
use std::{
collections::HashMap,
hash::Hash,
@ -8,8 +8,8 @@ use std::{
};
use anyhow::{anyhow, bail, Context, Result};
use bstr::ByteSlice;
use diffy::{apply as diffy_apply, Line, Patch};
use bstr::{BStr, BString, ByteSlice, ByteVec};
use diffy::{apply_bytes as diffy_apply, Line, Patch};
use git2_hooks::HookResult;
use regex::Regex;
use serde::Serialize;
@ -126,7 +126,7 @@ pub struct VirtualBranchFile {
#[serde(rename_all = "camelCase")]
pub struct VirtualBranchHunk {
pub id: String,
pub diff: String,
pub diff: BString,
pub modified_at: u128,
pub file_path: PathBuf,
pub hash: String,
@ -909,9 +909,9 @@ fn branches_with_large_files_abridged(mut branches: Vec<VirtualBranch>) -> Vec<V
// Diffs larger than 500kb are considered large
if file.hunks.iter().any(|hunk| hunk.diff.len() > 500_000) {
file.large = true;
file.hunks
.iter_mut()
.for_each(|hunk| hunk.diff = String::new());
file.hunks.iter_mut().for_each(|hunk| {
hunk.diff.drain(..);
});
}
}
}
@ -1603,7 +1603,7 @@ pub fn virtual_hunks_by_filepath(
start: hunk.new_start,
end: hunk.new_start + hunk.new_lines,
binary: hunk.binary,
hash: Hunk::hash(&hunk.diff),
hash: Hunk::hash(hunk.diff.as_ref()),
locked: false,
locked_to: None,
change_type: hunk.change_type,
@ -1784,7 +1784,7 @@ fn get_applied_status(
committed_git_hunk.new_start,
committed_git_hunk.new_start + committed_git_hunk.new_lines,
) {
let hash = Hunk::hash(&uncommitted_git_hunk.diff);
let hash = Hunk::hash(uncommitted_git_hunk.diff.as_ref());
git_hunk_map.insert(hash, branch.id);
}
}
@ -1817,7 +1817,7 @@ fn get_applied_status(
.filter_map(|claimed_hunk| {
// if any of the current hunks intersects with the owned hunk, we want to keep it
for (i, git_diff_hunk) in git_diff_hunks.iter().enumerate() {
let hash = Hunk::hash(&git_diff_hunk.diff);
let hash = Hunk::hash(git_diff_hunk.diff.as_ref());
if let Some(locked_to) = git_hunk_map.get(&hash) {
if locked_to != &branch.id {
return None;
@ -1887,7 +1887,7 @@ fn get_applied_status(
for (filepath, hunks) in base_diffs {
for hunk in hunks {
let hash = Hunk::hash(&hunk.diff);
let hash = Hunk::hash(hunk.diff.as_ref());
let vbranch_pos = if let Some(locked_to) = git_hunk_map.get(&hash) {
let p = virtual_branches.iter().position(|vb| vb.id == *locked_to);
match p {
@ -1904,7 +1904,7 @@ fn get_applied_status(
file_path: filepath.clone(),
hunks: vec![Hunk::from(&hunk)
.with_timestamp(get_mtime(&mut mtimes, &filepath))
.with_hash(Hunk::hash(hunk.diff.as_str()).as_str())],
.with_hash(Hunk::hash(hunk.diff.as_ref()).as_str())],
});
diffs_by_branch
@ -2079,8 +2079,9 @@ pub fn write_tree_onto_tree(
let rel_path = Path::new(&filepath);
let full_path = project_repository.path().join(rel_path);
let is_submodule =
full_path.is_dir() && hunks.len() == 1 && hunks[0].diff.contains("Subproject commit");
let is_submodule = full_path.is_dir()
&& hunks.len() == 1
&& hunks[0].diff.contains_str(b"Subproject commit");
// if file exists
if full_path.exists() {
@ -2130,7 +2131,11 @@ pub fn write_tree_onto_tree(
if hunks.len() == 1 && hunks[0].binary {
let new_blob_oid = &hunks[0].diff;
// convert string to Oid
let new_blob_oid = new_blob_oid.parse().context("failed to diff as oid")?;
let new_blob_oid = new_blob_oid
.to_str()
.expect("hex-string")
.parse()
.context("failed to diff as oid")?;
builder.upsert(rel_path, new_blob_oid, filemode);
} else {
// blob from tree_entry
@ -2140,34 +2145,35 @@ pub fn write_tree_onto_tree(
.peel_to_blob()
.context("failed to get blob")?;
let mut blob_contents = blob.content().to_str()?.to_string();
let blob_contents = blob.content();
let mut hunks = hunks.clone();
hunks.sort_by_key(|hunk| hunk.new_start);
let mut all_diffs = String::new();
let mut all_diffs = BString::default();
for hunk in hunks {
all_diffs.push_str(&hunk.diff);
}
let patch = Patch::from_str(&all_diffs)?;
blob_contents = apply(&blob_contents, &patch).context(format!(
let patch = Patch::from_bytes(&all_diffs)?;
let blob_contents = apply(blob_contents.into(), &patch).context(format!(
"failed to apply\n{}\nonto:\n{}",
&all_diffs, &blob_contents
all_diffs.as_bstr(),
blob_contents.as_bstr()
))?;
// create a blob
let new_blob_oid = git_repository.blob(blob_contents.as_bytes())?;
let new_blob_oid = git_repository.blob(&blob_contents)?;
// upsert into the builder
builder.upsert(rel_path, new_blob_oid, filemode);
}
} else if is_submodule {
let mut blob_contents = String::new();
let mut blob_contents = BString::default();
let mut hunks = hunks.clone();
hunks.sort_by_key(|hunk| hunk.new_start);
for hunk in hunks {
let patch = Patch::from_str(&hunk.diff)?;
blob_contents = apply(&blob_contents, &patch)
let patch = Patch::from_bytes(&hunk.diff)?;
blob_contents = apply(blob_contents.as_ref(), &patch)
.context(format!("failed to apply {}", &hunk.diff))?;
}
@ -3640,7 +3646,7 @@ pub fn create_virtual_branch_from_branch(
}
/// Just like [`diffy::apply()`], but on error it will attach hashes of the input `base_image` and `patch`.
pub fn apply(base_image: &str, patch: &Patch<'_, str>) -> Result<String> {
pub fn apply(base_image: &BStr, patch: &Patch<'_, [u8]>) -> Result<BString> {
fn md5_hash_hex(b: impl AsRef<[u8]>) -> String {
format!("{:x}", md5::compute(b))
}
@ -3654,8 +3660,8 @@ pub fn apply(base_image: &str, patch: &Patch<'_, str>) -> Result<String> {
Insert(String),
}
impl<'a> From<&diffy::Line<'a, str>> for DebugLine {
fn from(line: &Line<'a, str>) -> Self {
impl<'a> From<&diffy::Line<'a, [u8]>> for DebugLine {
fn from(line: &Line<'a, [u8]>) -> Self {
match line {
Line::Context(s) => DebugLine::Context(md5_hash_hex(s)),
Line::Delete(s) => DebugLine::Delete(md5_hash_hex(s)),
@ -3672,8 +3678,8 @@ pub fn apply(base_image: &str, patch: &Patch<'_, str>) -> Result<String> {
lines: Vec<DebugLine>,
}
impl<'a> From<&diffy::Hunk<'a, str>> for DebugHunk {
fn from(hunk: &diffy::Hunk<'a, str>) -> Self {
impl<'a> From<&diffy::Hunk<'a, [u8]>> for DebugHunk {
fn from(hunk: &diffy::Hunk<'a, [u8]>) -> Self {
Self {
old_range: hunk.old_range(),
new_range: hunk.new_range(),
@ -3695,10 +3701,12 @@ pub fn apply(base_image: &str, patch: &Patch<'_, str>) -> Result<String> {
}
}
diffy_apply(base_image, patch).with_context(|| DebugContext {
base_image_hash: md5_hash_hex(base_image),
hunks: patch.hunks().iter().map(Into::into).collect(),
})
diffy_apply(base_image, patch)
.with_context(|| DebugContext {
base_image_hash: md5_hash_hex(base_image),
hunks: patch.hunks().iter().map(Into::into).collect(),
})
.map(Into::into)
}
// Goes through a set of changes and checks if conflicts are present. If no conflicts
@ -3714,10 +3722,10 @@ fn update_conflict_markers(
if conflicting_files.contains(&file_path.display().to_string()) {
// check file for conflict markers, resolve the file if there are none in any hunk
for hunk in non_commited_hunks {
if hunk.diff.contains("<<<<<<< ours") {
if hunk.diff.contains_str(b"<<<<<<< ours") {
conflicted = true;
}
if hunk.diff.contains(">>>>>>> theirs") {
if hunk.diff.contains_str(b">>>>>>> theirs") {
conflicted = true;
}
}

View File

@ -48,7 +48,7 @@ async fn should_unapply_with_commits() {
.unwrap(),
)
.await
.unwrap();
.unwrap_or_else(|err| panic!("{err:?}"));
let branch = controller
.list_virtual_branches(project_id)