Rename Sha1 to DocumentDigest

Co-Authored-By: Kyle Caverly <kyle@zed.dev>
This commit is contained in:
Antonio Scandurra 2023-08-31 18:00:36 +02:00
parent 3001a46f69
commit 2503d54d19
4 changed files with 28 additions and 27 deletions

View File

@ -1,4 +1,8 @@
use crate::{embedding::Embedding, parsing::Document, SEMANTIC_INDEX_VERSION};
use crate::{
embedding::Embedding,
parsing::{Document, DocumentDigest},
SEMANTIC_INDEX_VERSION,
};
use anyhow::{anyhow, Context, Result};
use futures::channel::oneshot;
use gpui::executor;
@ -165,7 +169,7 @@ impl VectorDatabase {
end_byte INTEGER NOT NULL,
name VARCHAR NOT NULL,
embedding BLOB NOT NULL,
sha1 BLOB NOT NULL,
digest BLOB NOT NULL,
FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
)",
[],
@ -225,14 +229,14 @@ impl VectorDatabase {
// I imagine we can speed this up with a bulk insert of some kind.
for document in documents {
db.execute(
"INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, sha1) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
"INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, digest) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
params![
file_id,
document.range.start.to_string(),
document.range.end.to_string(),
document.name,
document.embedding,
document.sha1
document.digest
],
)?;
}

View File

@ -1,11 +1,11 @@
use crate::embedding::{EmbeddingProvider, Embedding};
use crate::embedding::{Embedding, EmbeddingProvider};
use anyhow::{anyhow, Result};
use language::{Grammar, Language};
use rusqlite::{
types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
ToSql,
};
use sha1::Digest;
use sha1::{Digest, Sha1};
use std::{
cmp::{self, Reverse},
collections::HashSet,
@ -15,10 +15,10 @@ use std::{
};
use tree_sitter::{Parser, QueryCursor};
#[derive(Debug, PartialEq, Clone)]
pub struct Sha1([u8; 20]);
/// A fixed-size, 20-byte digest identifying a document's content.
///
/// Built from a string slice by hashing it with SHA-1 (see the `From<&str>`
/// impl), and converted to/from a SQLite value through the `ToSql` and
/// `FromSql` impls, the latter rejecting blobs that are not exactly 20 bytes.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub struct DocumentDigest([u8; 20]);
impl FromSql for Sha1 {
impl FromSql for DocumentDigest {
fn column_result(value: ValueRef) -> FromSqlResult<Self> {
let blob = value.as_blob()?;
let bytes =
@ -27,19 +27,19 @@ impl FromSql for Sha1 {
expected_size: 20,
blob_size: blob.len(),
})?;
return Ok(Sha1(bytes));
return Ok(DocumentDigest(bytes));
}
}
impl ToSql for Sha1 {
/// Lets a `DocumentDigest` be passed directly as a SQL parameter.
impl ToSql for DocumentDigest {
/// Converts the digest to a SQL value by delegating to the `ToSql` impl of
/// the wrapped `[u8; 20]` array; any error from that impl is returned as-is.
fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
self.0.to_sql()
}
}
impl From<&'_ str> for Sha1 {
impl From<&'_ str> for DocumentDigest {
fn from(value: &'_ str) -> Self {
let mut sha1 = sha1::Sha1::new();
let mut sha1 = Sha1::new();
sha1.update(value);
Self(sha1.finalize().into())
}
@ -51,7 +51,7 @@ pub struct Document {
pub range: Range<usize>,
pub content: String,
pub embedding: Option<Embedding>,
pub sha1: Sha1,
pub digest: DocumentDigest,
pub token_count: usize,
}
@ -102,17 +102,14 @@ impl CodeContextRetriever {
.replace("<path>", relative_path.to_string_lossy().as_ref())
.replace("<language>", language_name.as_ref())
.replace("<item>", &content);
let sha1 = Sha1::from(document_span.as_str());
let digest = DocumentDigest::from(document_span.as_str());
let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
Ok(vec![Document {
range: 0..content.len(),
content: document_span,
embedding: Default::default(),
name: language_name.to_string(),
sha1,
digest,
token_count,
}])
}
@ -121,14 +118,14 @@ impl CodeContextRetriever {
let document_span = MARKDOWN_CONTEXT_TEMPLATE
.replace("<path>", relative_path.to_string_lossy().as_ref())
.replace("<item>", &content);
let sha1 = Sha1::from(document_span.as_str());
let digest = DocumentDigest::from(document_span.as_str());
let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
Ok(vec![Document {
range: 0..content.len(),
content: document_span,
embedding: None,
name: "Markdown".to_string(),
sha1,
digest,
token_count,
}])
}
@ -308,13 +305,13 @@ impl CodeContextRetriever {
);
}
let sha1 = Sha1::from(document_content.as_str());
let sha1 = DocumentDigest::from(document_content.as_str());
documents.push(Document {
name,
content: document_content,
range: item_range.clone(),
embedding: None,
sha1,
digest: sha1,
token_count: 0,
})
}

View File

@ -37,7 +37,7 @@ use util::{
};
use workspace::WorkspaceCreated;
const SEMANTIC_INDEX_VERSION: usize = 7;
// Version number of the semantic index; also imported by the vector database module.
// NOTE(review): how this value is consumed is not visible here — presumably a
// bump invalidates previously stored index data; confirm against the database code.
const SEMANTIC_INDEX_VERSION: usize = 8;
const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600);
pub fn init(

View File

@ -1,7 +1,7 @@
use crate::{
embedding::{DummyEmbeddings, Embedding, EmbeddingProvider},
embedding_queue::EmbeddingQueue,
parsing::{subtract_ranges, CodeContextRetriever, Document, Sha1},
parsing::{subtract_ranges, CodeContextRetriever, Document, DocumentDigest},
semantic_index_settings::SemanticIndexSettings,
FileToEmbed, JobHandle, SearchResult, SemanticIndex,
};
@ -220,13 +220,13 @@ async fn test_embedding_batching(cx: &mut TestAppContext, mut rng: StdRng) {
.with_simple_text()
.take(content_len)
.collect::<String>();
let sha1 = Sha1::from(content.as_str());
let digest = DocumentDigest::from(content.as_str());
Document {
range: 0..10,
embedding: None,
name: format!("document {document_ix}"),
content,
sha1,
digest,
token_count: rng.gen_range(10..30),
}
})