Adjust GitHunk to not require UTF8 for diffs

This will make the diffing engine more correct overall, as the
potentially lossy conversion now happens only for display.

This will also prevent non-UTF-8 content from being considered binary even though it is not.
This commit is contained in:
Sebastian Thiel 2024-04-19 20:53:10 +02:00
parent fe950ec00a
commit 27714d8e0d
No known key found for this signature in database
GPG Key ID: 9CB5EE7895E8268B
6 changed files with 122 additions and 95 deletions

View File

@ -15,7 +15,7 @@ toml = "0.8.12"
anyhow = "1.0.81" anyhow = "1.0.81"
async-trait = "0.1.79" async-trait = "0.1.79"
backtrace = { version = "0.3.71", optional = true } backtrace = { version = "0.3.71", optional = true }
bstr = "1.9.1" bstr = { version = "1.9.1", features = ["serde"] }
chrono = { version = "0.4.37", features = ["serde"] } chrono = { version = "0.4.37", features = ["serde"] }
diffy = "0.3.0" diffy = "0.3.0"
filetime = "0.2.23" filetime = "0.2.23"

View File

@ -2,6 +2,7 @@ use std::path::PathBuf;
use std::{collections::HashMap, str}; use std::{collections::HashMap, str};
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use bstr::{BStr, BString, ByteSlice, ByteVec};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tracing::instrument; use tracing::instrument;
@ -42,7 +43,7 @@ pub struct GitHunk {
pub old_lines: u32, pub old_lines: u32,
pub new_start: u32, pub new_start: u32,
pub new_lines: u32, pub new_lines: u32,
pub diff: String, pub diff: BString,
pub binary: bool, pub binary: bool,
pub change_type: ChangeType, pub change_type: ChangeType,
} }
@ -179,32 +180,26 @@ fn hunks_by_filepath(
let old_start = hunk.as_ref().map_or(0, git2::DiffHunk::old_start); let old_start = hunk.as_ref().map_or(0, git2::DiffHunk::old_start);
let old_lines = hunk.as_ref().map_or(0, git2::DiffHunk::old_lines); let old_lines = hunk.as_ref().map_or(0, git2::DiffHunk::old_lines);
let assume_binary = || { let line = match line.origin() {
'+' | '-' | ' ' => {
let mut buf = BString::new(Vec::with_capacity(line.content().len() + 1));
buf.push_char(line.origin());
buf.push_str(line.content());
Some((buf, false))
}
'B' => {
let full_path = repository.workdir().unwrap().join(file_path); let full_path = repository.workdir().unwrap().join(file_path);
// save the file_path to the odb // save the file_path to the odb
if !delta.new_file().id().is_zero() && full_path.exists() { if !delta.new_file().id().is_zero() && full_path.exists() {
// the binary file wasn't deleted // the binary file wasn't deleted
repository.blob_path(full_path.as_path()).unwrap(); repository.blob_path(full_path.as_path()).unwrap();
} }
Some((delta.new_file().id().to_string(), true)) Some((delta.new_file().id().to_string().into(), true))
};
let line = match line.origin() {
'+' | '-' | ' ' => {
if let Ok(content) = str::from_utf8(line.content()) {
Some((format!("{}{}", line.origin(), content), false))
} else {
assume_binary()
} }
}
'B' => assume_binary(),
'F' => None, 'F' => None,
_ => { _ => {
if let Ok(content) = str::from_utf8(line.content()) { let line: BString = line.content().into();
Some((content.to_string(), false)) Some((line, false))
} else {
assume_binary()
}
} }
}; };
if let Some((line, is_binary)) = line { if let Some((line, is_binary)) = line {
@ -277,6 +272,12 @@ fn hunks_by_filepath(
.map(|(k, v)| { .map(|(k, v)| {
if let Some(binary_hunk) = v.iter().find(|hunk| hunk.binary) { if let Some(binary_hunk) = v.iter().find(|hunk| hunk.binary) {
if v.len() > 1 { if v.len() > 1 {
// TODO(ST): Would it be possible here to permanently discard lines because
// they are considered binary? After all, here we create a new change,
// turning multiple binary hunks into single line hunk (somehow).
// Probably answer: it's likely that this data is only created on the fly,
// and only the original source data is relevant - validate it.
// But: virtual branches definitely apply hunks.
// if there are multiple hunks with binary among them, then the binary hunk // if there are multiple hunks with binary among them, then the binary hunk
// takes precedence // takes precedence
( (
@ -303,7 +304,7 @@ fn hunks_by_filepath(
old_lines: 0, old_lines: 0,
new_start: 0, new_start: 0,
new_lines: 0, new_lines: 0,
diff: String::new(), diff: Default::default(),
binary: false, binary: false,
change_type: ChangeType::Modified, change_type: ChangeType::Modified,
}], }],
@ -321,51 +322,58 @@ fn hunks_by_filepath(
} }
// returns None if cannot reverse the patch header // returns None if cannot reverse the patch header
fn reverse_patch_header(header: &str) -> Option<String> { fn reverse_patch_header(header: &BStr) -> Option<BString> {
use itertools::Itertools; let mut parts = header.split(|b| b.is_ascii_whitespace());
let mut parts = header.split_whitespace();
match parts.next() { match parts.next() {
Some("@@") => {} Some(b"@@") => {}
_ => return None, _ => return None,
}; };
let old_range = parts.next()?; let old_range = parts.next()?;
let new_range = parts.next()?; let new_range = parts.next()?;
match parts.next() { if parts.next() != Some(b"@@") {
Some("@@") => {} return None;
_ => return None,
}; };
Some(format!( let mut buf: BString = "@@ ".into();
"@@ {} {} @@ {}", buf.extend_from_slice(&new_range.replace(b"+", b"-"));
new_range.replace('+', "-"), buf.push(b' ');
old_range.replace('-', "+"), buf.extend_from_slice(&old_range.replace(b"-", b"+"));
parts.join(" ") buf.push_str(b" @@ ");
))
let mut at_least_one_part = false;
for part in parts {
buf.extend_from_slice(part);
buf.push(b' ');
at_least_one_part = true;
}
if at_least_one_part {
buf.pop();
}
Some(buf)
} }
fn reverse_patch(patch: &str) -> Option<String> { fn reverse_patch(patch: &BStr) -> Option<BString> {
let mut reversed = String::new(); let mut reversed = BString::default();
for line in patch.lines() { for line in patch.lines() {
if line.starts_with("@@") { if line.starts_with(b"@@") {
if let Some(header) = reverse_patch_header(line) { if let Some(header) = reverse_patch_header(line.as_ref()) {
reversed.push_str(&header); reversed.push_str(&header);
reversed.push('\n'); reversed.push(b'\n');
} else { } else {
return None; return None;
} }
} else if line.starts_with('+') { } else if line.starts_with(b"+") {
reversed.push_str(&line.replacen('+', "-", 1)); reversed.push_str(&line.replacen(b"+", b"-", 1));
reversed.push('\n'); reversed.push(b'\n');
} else if line.starts_with('-') { } else if line.starts_with(b"-") {
reversed.push_str(&line.replacen('-', "+", 1)); reversed.push_str(&line.replacen(b"-", b"+", 1));
reversed.push('\n'); reversed.push(b'\n');
} else { } else {
reversed.push_str(line); reversed.push_str(line);
reversed.push('\n'); reversed.push(b'\n');
} }
} }
Some(reversed) Some(reversed)
@ -376,7 +384,7 @@ pub fn reverse_hunk(hunk: &GitHunk) -> Option<GitHunk> {
if hunk.binary { if hunk.binary {
None None
} else { } else {
reverse_patch(&hunk.diff).map(|diff| GitHunk { reverse_patch(hunk.diff.as_ref()).map(|diff| GitHunk {
old_start: hunk.new_start, old_start: hunk.new_start,
old_lines: hunk.new_lines, old_lines: hunk.new_lines,
new_start: hunk.old_start, new_start: hunk.old_start,

View File

@ -7,6 +7,14 @@ pub struct Oid {
oid: git2::Oid, oid: git2::Oid,
} }
impl Oid {
pub fn from_bytes(bytes: &[u8]) -> Result<Self, git2::Error> {
Ok(Self {
oid: git2::Oid::from_bytes(bytes)?,
})
}
}
impl Default for Oid { impl Default for Oid {
fn default() -> Self { fn default() -> Self {
git2::Oid::zero().into() git2::Oid::zero().into()

View File

@ -1,6 +1,7 @@
use std::{fmt::Display, ops::RangeInclusive, str::FromStr}; use std::{fmt::Display, ops::RangeInclusive, str::FromStr};
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use bstr::{BStr, ByteSlice};
use crate::git::diff; use crate::git::diff;
@ -17,7 +18,7 @@ impl From<&diff::GitHunk> for Hunk {
Hunk { Hunk {
start: hunk.new_start, start: hunk.new_start,
end: hunk.new_start + hunk.new_lines, end: hunk.new_start + hunk.new_lines,
hash: Some(Hunk::hash(&hunk.diff)), hash: Some(Hunk::hash(hunk.diff.as_ref())),
timestamp_ms: None, timestamp_ms: None,
} }
} }
@ -120,6 +121,7 @@ impl Hunk {
} }
} }
// TODO(ST): self - prevent many unnecessary copies
pub fn with_hash(&self, hash: &str) -> Self { pub fn with_hash(&self, hash: &str) -> Self {
Hunk { Hunk {
start: self.start, start: self.start,
@ -129,6 +131,7 @@ impl Hunk {
} }
} }
// TODO(ST): self - prevent many unnecessary copies
pub fn with_timestamp(&self, timestamp_ms: u128) -> Self { pub fn with_timestamp(&self, timestamp_ms: u128) -> Self {
Hunk { Hunk {
start: self.start, start: self.start,
@ -157,12 +160,12 @@ impl Hunk {
self.start == other.new_start && self.end == other.new_start + other.new_lines self.start == other.new_start && self.end == other.new_start + other.new_lines
} }
pub fn hash(diff: &str) -> String { // TODO(perf): keep the hash as digest to avoid allocation.
let addition = diff pub fn hash(diff: &BStr) -> String {
.lines() let mut ctx = md5::Context::new();
.skip(1) // skip the first line which is the diff header diff.lines()
.collect::<Vec<_>>() .skip(1) // skip the first line which is the diff header.
.join("\n"); .for_each(|line| ctx.consume(line));
format!("{:x}", md5::compute(addition)) format!("{:x}", ctx.compute())
} }
} }

View File

@ -1,5 +1,5 @@
#[cfg(target_family = "unix")] #[cfg(target_family = "unix")]
use std::os::unix::prelude::*; use std::os::unix::prelude::PermissionsExt;
use std::{ use std::{
collections::HashMap, collections::HashMap,
hash::Hash, hash::Hash,
@ -8,8 +8,8 @@ use std::{
}; };
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{anyhow, bail, Context, Result};
use bstr::ByteSlice; use bstr::{BStr, BString, ByteSlice, ByteVec};
use diffy::{apply as diffy_apply, Line, Patch}; use diffy::{apply_bytes as diffy_apply, Line, Patch};
use git2_hooks::HookResult; use git2_hooks::HookResult;
use regex::Regex; use regex::Regex;
use serde::Serialize; use serde::Serialize;
@ -126,7 +126,7 @@ pub struct VirtualBranchFile {
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct VirtualBranchHunk { pub struct VirtualBranchHunk {
pub id: String, pub id: String,
pub diff: String, pub diff: BString,
pub modified_at: u128, pub modified_at: u128,
pub file_path: PathBuf, pub file_path: PathBuf,
pub hash: String, pub hash: String,
@ -909,9 +909,9 @@ fn branches_with_large_files_abridged(mut branches: Vec<VirtualBranch>) -> Vec<V
// Diffs larger than 500kb are considered large // Diffs larger than 500kb are considered large
if file.hunks.iter().any(|hunk| hunk.diff.len() > 500_000) { if file.hunks.iter().any(|hunk| hunk.diff.len() > 500_000) {
file.large = true; file.large = true;
file.hunks file.hunks.iter_mut().for_each(|hunk| {
.iter_mut() hunk.diff.drain(..);
.for_each(|hunk| hunk.diff = String::new()); });
} }
} }
} }
@ -1603,7 +1603,7 @@ pub fn virtual_hunks_by_filepath(
start: hunk.new_start, start: hunk.new_start,
end: hunk.new_start + hunk.new_lines, end: hunk.new_start + hunk.new_lines,
binary: hunk.binary, binary: hunk.binary,
hash: Hunk::hash(&hunk.diff), hash: Hunk::hash(hunk.diff.as_ref()),
locked: false, locked: false,
locked_to: None, locked_to: None,
change_type: hunk.change_type, change_type: hunk.change_type,
@ -1784,7 +1784,7 @@ fn get_applied_status(
committed_git_hunk.new_start, committed_git_hunk.new_start,
committed_git_hunk.new_start + committed_git_hunk.new_lines, committed_git_hunk.new_start + committed_git_hunk.new_lines,
) { ) {
let hash = Hunk::hash(&uncommitted_git_hunk.diff); let hash = Hunk::hash(uncommitted_git_hunk.diff.as_ref());
git_hunk_map.insert(hash, branch.id); git_hunk_map.insert(hash, branch.id);
} }
} }
@ -1817,7 +1817,7 @@ fn get_applied_status(
.filter_map(|claimed_hunk| { .filter_map(|claimed_hunk| {
// if any of the current hunks intersects with the owned hunk, we want to keep it // if any of the current hunks intersects with the owned hunk, we want to keep it
for (i, git_diff_hunk) in git_diff_hunks.iter().enumerate() { for (i, git_diff_hunk) in git_diff_hunks.iter().enumerate() {
let hash = Hunk::hash(&git_diff_hunk.diff); let hash = Hunk::hash(git_diff_hunk.diff.as_ref());
if let Some(locked_to) = git_hunk_map.get(&hash) { if let Some(locked_to) = git_hunk_map.get(&hash) {
if locked_to != &branch.id { if locked_to != &branch.id {
return None; return None;
@ -1887,7 +1887,7 @@ fn get_applied_status(
for (filepath, hunks) in base_diffs { for (filepath, hunks) in base_diffs {
for hunk in hunks { for hunk in hunks {
let hash = Hunk::hash(&hunk.diff); let hash = Hunk::hash(hunk.diff.as_ref());
let vbranch_pos = if let Some(locked_to) = git_hunk_map.get(&hash) { let vbranch_pos = if let Some(locked_to) = git_hunk_map.get(&hash) {
let p = virtual_branches.iter().position(|vb| vb.id == *locked_to); let p = virtual_branches.iter().position(|vb| vb.id == *locked_to);
match p { match p {
@ -1904,7 +1904,7 @@ fn get_applied_status(
file_path: filepath.clone(), file_path: filepath.clone(),
hunks: vec![Hunk::from(&hunk) hunks: vec![Hunk::from(&hunk)
.with_timestamp(get_mtime(&mut mtimes, &filepath)) .with_timestamp(get_mtime(&mut mtimes, &filepath))
.with_hash(Hunk::hash(hunk.diff.as_str()).as_str())], .with_hash(Hunk::hash(hunk.diff.as_ref()).as_str())],
}); });
diffs_by_branch diffs_by_branch
@ -2079,8 +2079,9 @@ pub fn write_tree_onto_tree(
let rel_path = Path::new(&filepath); let rel_path = Path::new(&filepath);
let full_path = project_repository.path().join(rel_path); let full_path = project_repository.path().join(rel_path);
let is_submodule = let is_submodule = full_path.is_dir()
full_path.is_dir() && hunks.len() == 1 && hunks[0].diff.contains("Subproject commit"); && hunks.len() == 1
&& hunks[0].diff.contains_str(b"Subproject commit");
// if file exists // if file exists
if full_path.exists() { if full_path.exists() {
@ -2130,7 +2131,11 @@ pub fn write_tree_onto_tree(
if hunks.len() == 1 && hunks[0].binary { if hunks.len() == 1 && hunks[0].binary {
let new_blob_oid = &hunks[0].diff; let new_blob_oid = &hunks[0].diff;
// convert string to Oid // convert string to Oid
let new_blob_oid = new_blob_oid.parse().context("failed to diff as oid")?; let new_blob_oid = new_blob_oid
.to_str()
.expect("hex-string")
.parse()
.context("failed to diff as oid")?;
builder.upsert(rel_path, new_blob_oid, filemode); builder.upsert(rel_path, new_blob_oid, filemode);
} else { } else {
// blob from tree_entry // blob from tree_entry
@ -2140,34 +2145,35 @@ pub fn write_tree_onto_tree(
.peel_to_blob() .peel_to_blob()
.context("failed to get blob")?; .context("failed to get blob")?;
let mut blob_contents = blob.content().to_str()?.to_string(); let blob_contents = blob.content();
let mut hunks = hunks.clone(); let mut hunks = hunks.clone();
hunks.sort_by_key(|hunk| hunk.new_start); hunks.sort_by_key(|hunk| hunk.new_start);
let mut all_diffs = String::new(); let mut all_diffs = BString::default();
for hunk in hunks { for hunk in hunks {
all_diffs.push_str(&hunk.diff); all_diffs.push_str(&hunk.diff);
} }
let patch = Patch::from_str(&all_diffs)?; let patch = Patch::from_bytes(&all_diffs)?;
blob_contents = apply(&blob_contents, &patch).context(format!( let blob_contents = apply(blob_contents.into(), &patch).context(format!(
"failed to apply\n{}\nonto:\n{}", "failed to apply\n{}\nonto:\n{}",
&all_diffs, &blob_contents all_diffs.as_bstr(),
blob_contents.as_bstr()
))?; ))?;
// create a blob // create a blob
let new_blob_oid = git_repository.blob(blob_contents.as_bytes())?; let new_blob_oid = git_repository.blob(&blob_contents)?;
// upsert into the builder // upsert into the builder
builder.upsert(rel_path, new_blob_oid, filemode); builder.upsert(rel_path, new_blob_oid, filemode);
} }
} else if is_submodule { } else if is_submodule {
let mut blob_contents = String::new(); let mut blob_contents = BString::default();
let mut hunks = hunks.clone(); let mut hunks = hunks.clone();
hunks.sort_by_key(|hunk| hunk.new_start); hunks.sort_by_key(|hunk| hunk.new_start);
for hunk in hunks { for hunk in hunks {
let patch = Patch::from_str(&hunk.diff)?; let patch = Patch::from_bytes(&hunk.diff)?;
blob_contents = apply(&blob_contents, &patch) blob_contents = apply(blob_contents.as_ref(), &patch)
.context(format!("failed to apply {}", &hunk.diff))?; .context(format!("failed to apply {}", &hunk.diff))?;
} }
@ -3640,7 +3646,7 @@ pub fn create_virtual_branch_from_branch(
} }
/// Just like [`diffy::apply()`], but on error it will attach hashes of the input `base_image` and `patch`. /// Just like [`diffy::apply()`], but on error it will attach hashes of the input `base_image` and `patch`.
pub fn apply(base_image: &str, patch: &Patch<'_, str>) -> Result<String> { pub fn apply(base_image: &BStr, patch: &Patch<'_, [u8]>) -> Result<BString> {
fn md5_hash_hex(b: impl AsRef<[u8]>) -> String { fn md5_hash_hex(b: impl AsRef<[u8]>) -> String {
format!("{:x}", md5::compute(b)) format!("{:x}", md5::compute(b))
} }
@ -3654,8 +3660,8 @@ pub fn apply(base_image: &str, patch: &Patch<'_, str>) -> Result<String> {
Insert(String), Insert(String),
} }
impl<'a> From<&diffy::Line<'a, str>> for DebugLine { impl<'a> From<&diffy::Line<'a, [u8]>> for DebugLine {
fn from(line: &Line<'a, str>) -> Self { fn from(line: &Line<'a, [u8]>) -> Self {
match line { match line {
Line::Context(s) => DebugLine::Context(md5_hash_hex(s)), Line::Context(s) => DebugLine::Context(md5_hash_hex(s)),
Line::Delete(s) => DebugLine::Delete(md5_hash_hex(s)), Line::Delete(s) => DebugLine::Delete(md5_hash_hex(s)),
@ -3672,8 +3678,8 @@ pub fn apply(base_image: &str, patch: &Patch<'_, str>) -> Result<String> {
lines: Vec<DebugLine>, lines: Vec<DebugLine>,
} }
impl<'a> From<&diffy::Hunk<'a, str>> for DebugHunk { impl<'a> From<&diffy::Hunk<'a, [u8]>> for DebugHunk {
fn from(hunk: &diffy::Hunk<'a, str>) -> Self { fn from(hunk: &diffy::Hunk<'a, [u8]>) -> Self {
Self { Self {
old_range: hunk.old_range(), old_range: hunk.old_range(),
new_range: hunk.new_range(), new_range: hunk.new_range(),
@ -3695,10 +3701,12 @@ pub fn apply(base_image: &str, patch: &Patch<'_, str>) -> Result<String> {
} }
} }
diffy_apply(base_image, patch).with_context(|| DebugContext { diffy_apply(base_image, patch)
.with_context(|| DebugContext {
base_image_hash: md5_hash_hex(base_image), base_image_hash: md5_hash_hex(base_image),
hunks: patch.hunks().iter().map(Into::into).collect(), hunks: patch.hunks().iter().map(Into::into).collect(),
}) })
.map(Into::into)
} }
// Goes through a set of changes and checks if conflicts are present. If no conflicts // Goes through a set of changes and checks if conflicts are present. If no conflicts
@ -3714,10 +3722,10 @@ fn update_conflict_markers(
if conflicting_files.contains(&file_path.display().to_string()) { if conflicting_files.contains(&file_path.display().to_string()) {
// check file for conflict markers, resolve the file if there are none in any hunk // check file for conflict markers, resolve the file if there are none in any hunk
for hunk in non_commited_hunks { for hunk in non_commited_hunks {
if hunk.diff.contains("<<<<<<< ours") { if hunk.diff.contains_str(b"<<<<<<< ours") {
conflicted = true; conflicted = true;
} }
if hunk.diff.contains(">>>>>>> theirs") { if hunk.diff.contains_str(b">>>>>>> theirs") {
conflicted = true; conflicted = true;
} }
} }

View File

@ -48,7 +48,7 @@ async fn should_unapply_with_commits() {
.unwrap(), .unwrap(),
) )
.await .await
.unwrap(); .unwrap_or_else(|err| panic!("{err:?}"));
let branch = controller let branch = controller
.list_virtual_branches(project_id) .list_virtual_branches(project_id)