Rename Sha1 to DocumentDigest

Co-Authored-By: Kyle Caverly <kyle@zed.dev>
2024-11-08 07:35:01 +03:00 · 2023-08-31 18:00:36 +02:00 · 2023-08-31 18:00:36 +02:00 · 2503d54d19
commit 2503d54d19
parent 3001a46f69
4 changed files with 28 additions and 27 deletions
--- a/crates/semantic_index/src/db.rs
+++ b/crates/semantic_index/src/db.rs
@@ -1,4 +1,8 @@
-use crate::{embedding::Embedding, parsing::Document, SEMANTIC_INDEX_VERSION};
+use crate::{
+    embedding::Embedding,
+    parsing::{Document, DocumentDigest},
+    SEMANTIC_INDEX_VERSION,
+};
 use anyhow::{anyhow, Context, Result};
 use futures::channel::oneshot;
 use gpui::executor;
@@ -165,7 +169,7 @@ impl VectorDatabase {
                    end_byte INTEGER NOT NULL,
                    name VARCHAR NOT NULL,
                    embedding BLOB NOT NULL,
-                    sha1 BLOB NOT NULL,
+                    digest BLOB NOT NULL,
                    FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
                )",
                [],
@@ -225,14 +229,14 @@ impl VectorDatabase {
            // I imagine we can speed this up with a bulk insert of some kind.
            for document in documents {
                db.execute(
-                    "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, sha1) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+                    "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, digest) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
                    params![
                        file_id,
                        document.range.start.to_string(),
                        document.range.end.to_string(),
                        document.name,
                        document.embedding,
-                        document.sha1
+                        document.digest
                    ],
                )?;
           }
--- a/crates/semantic_index/src/parsing.rs
+++ b/crates/semantic_index/src/parsing.rs
@@ -1,11 +1,11 @@
-use crate::embedding::{EmbeddingProvider, Embedding};
+use crate::embedding::{Embedding, EmbeddingProvider};
 use anyhow::{anyhow, Result};
 use language::{Grammar, Language};
 use rusqlite::{
    types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
    ToSql,
 };
-use sha1::Digest;
+use sha1::{Digest, Sha1};
 use std::{
    cmp::{self, Reverse},
    collections::HashSet,
@@ -15,10 +15,10 @@ use std::{
 };
 use tree_sitter::{Parser, QueryCursor};

-#[derive(Debug, PartialEq, Clone)]
-pub struct Sha1([u8; 20]);
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub struct DocumentDigest([u8; 20]);

-impl FromSql for Sha1 {
+impl FromSql for DocumentDigest {
    fn column_result(value: ValueRef) -> FromSqlResult<Self> {
        let blob = value.as_blob()?;
        let bytes =
@@ -27,19 +27,19 @@ impl FromSql for Sha1 {
                    expected_size: 20,
                    blob_size: blob.len(),
                })?;
-        return Ok(Sha1(bytes));
+        return Ok(DocumentDigest(bytes));
    }
 }

-impl ToSql for Sha1 {
+impl ToSql for DocumentDigest {
    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
        self.0.to_sql()
    }
 }

-impl From<&'_ str> for Sha1 {
+impl From<&'_ str> for DocumentDigest {
    fn from(value: &'_ str) -> Self {
-        let mut sha1 = sha1::Sha1::new();
+        let mut sha1 = Sha1::new();
        sha1.update(value);
        Self(sha1.finalize().into())
    }
@@ -51,7 +51,7 @@ pub struct Document {
    pub range: Range<usize>,
    pub content: String,
    pub embedding: Option<Embedding>,
-    pub sha1: Sha1,
+    pub digest: DocumentDigest,
    pub token_count: usize,
 }

@@ -102,17 +102,14 @@ impl CodeContextRetriever {
            .replace("<path>", relative_path.to_string_lossy().as_ref())
            .replace("<language>", language_name.as_ref())
            .replace("<item>", &content);
-
-        let sha1 = Sha1::from(document_span.as_str());
-
+        let digest = DocumentDigest::from(document_span.as_str());
        let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
-
        Ok(vec![Document {
            range: 0..content.len(),
            content: document_span,
            embedding: Default::default(),
            name: language_name.to_string(),
-            sha1,
+            digest,
            token_count,
        }])
    }
@@ -121,14 +118,14 @@ impl CodeContextRetriever {
        let document_span = MARKDOWN_CONTEXT_TEMPLATE
            .replace("<path>", relative_path.to_string_lossy().as_ref())
            .replace("<item>", &content);
-        let sha1 = Sha1::from(document_span.as_str());
+        let digest = DocumentDigest::from(document_span.as_str());
        let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
        Ok(vec![Document {
            range: 0..content.len(),
            content: document_span,
            embedding: None,
            name: "Markdown".to_string(),
-            sha1,
+            digest,
            token_count,
        }])
    }
@@ -308,13 +305,13 @@ impl CodeContextRetriever {
                );
            }

-            let sha1 = Sha1::from(document_content.as_str());
+            let sha1 = DocumentDigest::from(document_content.as_str());
            documents.push(Document {
                name,
                content: document_content,
                range: item_range.clone(),
                embedding: None,
-                sha1,
+                digest: sha1,
                token_count: 0,
            })
        }
--- a/crates/semantic_index/src/semantic_index.rs
+++ b/crates/semantic_index/src/semantic_index.rs
@@ -37,7 +37,7 @@ use util::{
 };
 use workspace::WorkspaceCreated;

-const SEMANTIC_INDEX_VERSION: usize = 7;
+const SEMANTIC_INDEX_VERSION: usize = 8;
 const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600);

 pub fn init(
--- a/crates/semantic_index/src/semantic_index_tests.rs
+++ b/crates/semantic_index/src/semantic_index_tests.rs
@@ -1,7 +1,7 @@
 use crate::{
    embedding::{DummyEmbeddings, Embedding, EmbeddingProvider},
    embedding_queue::EmbeddingQueue,
-    parsing::{subtract_ranges, CodeContextRetriever, Document, Sha1},
+    parsing::{subtract_ranges, CodeContextRetriever, Document, DocumentDigest},
    semantic_index_settings::SemanticIndexSettings,
    FileToEmbed, JobHandle, SearchResult, SemanticIndex,
 };
@@ -220,13 +220,13 @@ async fn test_embedding_batching(cx: &mut TestAppContext, mut rng: StdRng) {
                        .with_simple_text()
                        .take(content_len)
                        .collect::<String>();
-                    let sha1 = Sha1::from(content.as_str());
+                    let digest = DocumentDigest::from(content.as_str());
                    Document {
                        range: 0..10,
                        embedding: None,
                        name: format!("document {document_ix}"),
                        content,
-                        sha1,
+                        digest,
                        token_count: rng.gen_range(10..30),
                    }
                })