Merge pull request #2934 from MichaelOwenDyer/Refactor-signature-module

Refactor diff/signature.rs
2024-12-29 20:43:37 +03:00 · 2024-02-29 11:49:36 +01:00 · 2024-02-29 11:49:36 +01:00 · e66b19f1b1
commit e66b19f1b1
parent 8ba697482e 4892272a08
1 changed files with 78 additions and 89 deletions
--- a/gitbutler-diff/src/signature.rs
+++ b/gitbutler-diff/src/signature.rs
@ -14,15 +14,11 @@
 //! Otherwise, neither the length prefix imposed by `(de)serialize_bytes()` nor the
 //! terrible compaction and optimization of `(de)serialize_tuple()` are acceptable.

-// FIXME(qix-): There are a ton of identifiers in here that make no sense and
-// FIXME(qix-): were copied over from the exploratory data-science-ey code that
-// FIXME(qix-): need to be cleaned up. PR welcome!
-
 const BITS: usize = 3;
 const SHIFT: usize = 8 - BITS;
-const SIG_ENTRIES: usize = (1 << BITS) * (1 << BITS);
-const SIG_BYTES: usize = SIG_ENTRIES * ::core::mem::size_of::<SigBucket>();
-const TOTAL_BYTES: usize = SIG_BYTES + 4 + 1; // we encode a 4-byte length at the beginning, along with a version byte
+const FINGERPRINT_ENTRIES: usize = (1 << BITS) * (1 << BITS);
+const FINGERPRINT_BYTES: usize = FINGERPRINT_ENTRIES * ::core::mem::size_of::<SigBucket>();
+const TOTAL_BYTES: usize = 1 + 4 + FINGERPRINT_BYTES; // we encode a version byte and a 4-byte length at the beginning

 // NOTE: This is not efficient if `SigBucket` is 1 byte (u8).
 // NOTE: If `SigBucket` is changed to a u8, then the implementation
@ -80,42 +76,44 @@ impl Signature {
    /// about the signature or the original file contents.
    ///
    /// Do not use for any security-related purposes.
-    pub fn score_str<S: AsRef<str>>(&self, s: S) -> f64 {
+    pub fn score_str<S: AsRef<str>>(&self, other: S) -> f64 {
        if self.0[0] != 0 {
            panic!("unsupported signature version");
        }

-        let original_length = u32::from_le_bytes(self.0[1..5].try_into().unwrap());
-
-        let s = s.as_ref();
-
-        let s_s: String = s.chars().filter(|&x| !char::is_whitespace(x)).collect();
-        let s = s_s.as_bytes();
-
-        if original_length < 2 || s.len() < 2 {
+        let original_length = u32::from_le_bytes(self.0[1..5].try_into().expect("invalid length"));
+        if original_length < 2 {
            return 0.0;
        }

-        let mut intersection_size = 0usize;
+        let other = other.as_ref();
+        let other_string: String = other.chars().filter(|&x| !char::is_whitespace(x)).collect();
+        let other = other_string.as_bytes();

-        let mut wb = self.bucket_iter().collect::<Vec<_>>();
+        if other.len() < 2 {
+            return 0.0;
+        }

-        for (b1, b2) in bigrams(s) {
-            let b1 = b1 >> SHIFT;
-            let b2 = b2 >> SHIFT;
-            let ix = ((b1 as usize) << BITS) | (b2 as usize);
-            if wb[ix] > 0 {
-                wb[ix] = wb[ix].saturating_sub(1);
-                intersection_size += 1;
+        let mut matching_bigrams: usize = 0;
+
+        let mut self_buckets = self.bucket_iter().collect::<Vec<_>>();
+
+        for (left, right) in bigrams(other) {
+            let left = left >> SHIFT;
+            let right = right >> SHIFT;
+            let index = ((left as usize) << BITS) | (right as usize);
+            if self_buckets[index] > 0 {
+                self_buckets[index] = self_buckets[index] - 1;
+                matching_bigrams += 1;
            }
        }

-        (2 * intersection_size) as f64 / (original_length as usize + s.len() - 2) as f64
+        (2 * matching_bigrams) as f64 / (original_length as usize + other.len() - 2) as f64
    }

    fn bucket_iter(&self) -> impl Iterator<Item = SigBucket> + '_ {
        unsafe {
-            self.0[(TOTAL_BYTES - SIG_BYTES)..]
+            self.0[(TOTAL_BYTES - FINGERPRINT_BYTES)..]
                .as_chunks_unchecked::<{ ::core::mem::size_of::<SigBucket>() }>()
                .iter()
                .map(|ch: &[u8; ::core::mem::size_of::<SigBucket>()]| SigBucket::from_le_bytes(*ch))
@ -125,45 +123,50 @@ impl Signature {

 impl<S: AsRef<str>> From<S> for Signature {
    #[inline]
-    fn from(s: S) -> Self {
-        let s = s.as_ref();
+    fn from(source: S) -> Self {
+        let source = source.as_ref();
+        let source_string: String = source
+            .chars()
+            .filter(|&x| !char::is_whitespace(x))
+            .collect();
+        let source = source_string.as_bytes();

-        let a_s: String = s.chars().filter(|&x| !char::is_whitespace(x)).collect();
-        let a = a_s.as_bytes();
-
-        let a_len: u32 = a
+        let source_len: u32 = source
            .len()
            .try_into()
            .expect("strings with a byte-length above u32::MAX are not supported");

-        let mut a_res = [0; TOTAL_BYTES];
-        a_res[0] = 0; // version byte
-        a_res[1..5].copy_from_slice(&a_len.to_le_bytes()); // length
+        let mut bytes = [0; TOTAL_BYTES];
+        bytes[0] = 0; // version byte (0)
+        bytes[1..5].copy_from_slice(&source_len.to_le_bytes()); // next 4 bytes are the length

-        if a_len >= 2 {
-            let mut a_bigrams = [0 as SigBucket; SIG_ENTRIES];
+        if source_len >= 2 {
+            let mut buckets = [0 as SigBucket; FINGERPRINT_ENTRIES];

-            for (b1, b2) in bigrams(a) {
-                let b1 = b1 >> SHIFT;
-                let b2 = b2 >> SHIFT;
-                let encoded_bigram = ((b1 as usize) << BITS) | (b2 as usize);
-                a_bigrams[encoded_bigram] = a_bigrams[encoded_bigram].saturating_add(1);
+            for (left, right) in bigrams(source) {
+                let left = left >> SHIFT;
+                let right = right >> SHIFT;
+                let index = ((left as usize) << BITS) | (right as usize);
+                buckets[index] = buckets[index].saturating_add(1);
            }

            // NOTE: This is not efficient if `SigBucket` is 1 byte (u8).
-            let mut offset = TOTAL_BYTES - SIG_BYTES;
-            for bucket in a_bigrams {
+            let mut offset = TOTAL_BYTES - FINGERPRINT_BYTES;
+            for bucket in buckets {
                let start = offset;
                let end = start + ::core::mem::size_of::<SigBucket>();
-                a_res[start..end].copy_from_slice(&bucket.to_le_bytes());
+                bytes[start..end].copy_from_slice(&bucket.to_le_bytes());
                offset = end;
            }
        }

-        Self(a_res)
+        Self(bytes)
    }
 }

+/// Copies the passed bytes twice and zips them together with a one-byte offset.
+/// This produces an iterator of the bigrams (pairs of consecutive bytes) in the input.
+/// For example, the bigrams of 1, 2, 3, 4, 5 would be (1, 2), (2, 3), (3, 4), (4, 5).
 #[inline]
 fn bigrams(s: &[u8]) -> impl Iterator<Item = (u8, u8)> + '_ {
    s.iter().copied().zip(s.iter().skip(1).copied())
@ -173,61 +176,47 @@ fn bigrams(s: &[u8]) -> impl Iterator<Item = (u8, u8)> + '_ {
 mod tests {
    use super::*;

-    #[test]
-    fn score_signature() {
-        let sig = Signature::from("hello world");
-
    macro_rules! assert_score {
-            ($s:expr, $e:expr) => {
-                if (sig.score_str($s) - $e).abs() >= 0.1 {
+        ($sig:ident, $s:expr, $e:expr) => {
+            let score = $sig.score_str($s);
+            if (score - $e).abs() >= 0.1 {
                panic!(
                    "expected score of {} for string {:?}, got {}",
-                        $e,
-                        $s,
-                        sig.score_str($s)
+                    $e, $s, score
                );
            }
        };
    }

+    #[test]
+    fn score_signature() {
+        let sig = Signature::from("hello world");
+
        // NOTE: The scores here are not exact, but are close enough
        // to be useful for testing purposes, hence why some have the same
        // "score" but different strings.
-        assert_score!("hello world", 1.0);
-        assert_score!("hello world!", 0.95);
-        assert_score!("hello world!!", 0.9);
-        assert_score!("hello world!!!", 0.85);
-        assert_score!("hello world!!!!", 0.8);
-        assert_score!("hello world!!!!!", 0.75);
-        assert_score!("hello world!!!!!!", 0.7);
-        assert_score!("hello world!!!!!!!", 0.65);
-        assert_score!("hello world!!!!!!!!", 0.62);
-        assert_score!("hello world!!!!!!!!!", 0.6);
-        assert_score!("hello world!!!!!!!!!!", 0.55);
+        assert_score!(sig, "hello world", 1.0);
+        assert_score!(sig, "hello world!", 0.95);
+        assert_score!(sig, "hello world!!", 0.9);
+        assert_score!(sig, "hello world!!!", 0.85);
+        assert_score!(sig, "hello world!!!!", 0.8);
+        assert_score!(sig, "hello world!!!!!", 0.75);
+        assert_score!(sig, "hello world!!!!!!", 0.7);
+        assert_score!(sig, "hello world!!!!!!!", 0.65);
+        assert_score!(sig, "hello world!!!!!!!!", 0.62);
+        assert_score!(sig, "hello world!!!!!!!!!", 0.6);
+        assert_score!(sig, "hello world!!!!!!!!!!", 0.55);
    }

    #[test]
    fn score_ignores_whitespace() {
        let sig = Signature::from("hello world");

-        macro_rules! assert_score {
-            ($s:expr, $e:expr) => {
-                if (sig.score_str($s) - $e).abs() >= 0.1 {
-                    panic!(
-                        "expected score of {} for string {:?}, got {}",
-                        $e,
-                        $s,
-                        sig.score_str($s)
-                    );
-                }
-            };
-        }
-
-        assert_score!("hello world", 1.0);
-        assert_score!("hello world ", 1.0);
-        assert_score!("hello\nworld ", 1.0);
-        assert_score!("hello\n\tworld ", 1.0);
-        assert_score!("\t\t  hel lo\n\two rld \t\t", 1.0);
+        assert_score!(sig, "hello world", 1.0);
+        assert_score!(sig, "hello world ", 1.0);
+        assert_score!(sig, "hello\nworld ", 1.0);
+        assert_score!(sig, "hello\n\tworld ", 1.0);
+        assert_score!(sig, "\t\t  hel lo\n\two rld \t\t", 1.0);
    }

    const TEXT1: &str = include_str!("../fixture/text1.txt");