Updated file comparison in the semantic indexing engine to work off of modified system times instead of file hashes.
Co-authored-by: maxbrunsfeld <max@zed.dev>

Parent: 36907bb4dc
Commit: 3408b98167

Changed files include Cargo.lock (generated), 25 lines changed.
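The change replaces content hashing with modification-time comparison: instead of reading and SHA-1-hashing every file to decide whether it is already indexed, the stored mtime is compared against the one reported by the filesystem. A minimal sketch of that check (the helper and map are hypothetical, not the commit's exact code):

    use std::{
        collections::HashMap,
        fs,
        path::{Path, PathBuf},
        time::SystemTime,
    };

    /// Decide whether `path` needs (re)indexing by comparing its current
    /// mtime against the one recorded at indexing time.
    /// Hypothetical helper illustrating the commit's strategy.
    fn needs_reindex(
        stored: &HashMap<PathBuf, SystemTime>,
        path: &Path,
    ) -> std::io::Result<bool> {
        let current = fs::metadata(path)?.modified()?;
        // Re-index when there is no record or the recorded mtime differs.
        Ok(stored.get(path) != Some(&current))
    }

The trade-off: an mtime check needs only a metadata call, so unchanged files are never read at all (the old hash check had to load every file's contents), but it trusts the filesystem's timestamps; a tool that rewrites a file while preserving its mtime would be missed.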
Cargo.lock:

@@ -4232,19 +4232,6 @@ dependencies = [
  "tempfile",
 ]

-[[package]]
-name = "ndarray"
-version = "0.15.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
-dependencies = [
- "matrixmultiply",
- "num-complex",
- "num-integer",
- "num-traits",
- "rawpointer",
-]
-
 [[package]]
 name = "net2"
 version = "0.2.38"
@@ -4353,15 +4340,6 @@ dependencies = [
  "zeroize",
 ]

-[[package]]
-name = "num-complex"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
-dependencies = [
- "num-traits",
-]
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -8050,14 +8028,13 @@ dependencies = [
  "lazy_static",
  "log",
- "matrixmultiply",
- "ndarray",
  "picker",
  "project",
  "rand 0.8.5",
+ "rpc",
  "rusqlite",
  "serde",
  "serde_json",
  "sha-1 0.10.1",
  "smol",
  "tempdir",
  "theme",
crates/vector_store/Cargo.toml:

@@ -17,6 +17,7 @@ util = { path = "../util" }
 picker = { path = "../picker" }
 theme = { path = "../theme" }
 editor = { path = "../editor" }
+rpc = { path = "../rpc" }
 anyhow.workspace = true
 futures.workspace = true
 smol.workspace = true
@@ -29,14 +30,13 @@ serde.workspace = true
 serde_json.workspace = true
 async-trait.workspace = true
 bincode = "1.3.3"
-ndarray = "0.15.6"
 sha-1 = "0.10.1"
-matrixmultiply = "0.3.7"

 [dev-dependencies]
 gpui = { path = "../gpui", features = ["test-support"] }
 language = { path = "../language", features = ["test-support"] }
 project = { path = "../project", features = ["test-support"] }
+rpc = { path = "../rpc", features = ["test-support"] }
 workspace = { path = "../workspace", features = ["test-support"] }
 tree-sitter-rust = "*"
 rand.workspace = true
crates/vector_store/src/db.rs:

@@ -2,18 +2,17 @@ use std::{
     collections::HashMap,
     path::{Path, PathBuf},
     rc::Rc,
+    time::SystemTime,
 };

 use anyhow::{anyhow, Result};

+use crate::IndexedFile;
+use rpc::proto::Timestamp;
 use rusqlite::{
     params,
-    types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
-    ToSql,
+    types::{FromSql, FromSqlResult, ValueRef},
 };
-use sha1::{Digest, Sha1};
-
-use crate::IndexedFile;

 // Note this is not an appropriate document
 #[derive(Debug)]
@@ -29,60 +28,7 @@ pub struct DocumentRecord {
 pub struct FileRecord {
     pub id: usize,
     pub relative_path: String,
-    pub sha1: FileSha1,
-}
-
-#[derive(Debug)]
-pub struct FileSha1(pub Vec<u8>);
-
-impl FileSha1 {
-    pub fn from_str(content: String) -> Self {
-        let mut hasher = Sha1::new();
-        hasher.update(content);
-        let sha1 = hasher.finalize()[..]
-            .into_iter()
-            .map(|val| val.to_owned())
-            .collect::<Vec<u8>>();
-        return FileSha1(sha1);
-    }
-
-    pub fn equals(&self, content: &String) -> bool {
-        let mut hasher = Sha1::new();
-        hasher.update(content);
-        let sha1 = hasher.finalize()[..]
-            .into_iter()
-            .map(|val| val.to_owned())
-            .collect::<Vec<u8>>();
-
-        let equal = self
-            .0
-            .clone()
-            .into_iter()
-            .zip(sha1)
-            .filter(|&(a, b)| a == b)
-            .count()
-            == self.0.len();
-
-        equal
-    }
-}
-
-impl ToSql for FileSha1 {
-    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
-        return self.0.to_sql();
-    }
-}
-
-impl FromSql for FileSha1 {
-    fn column_result(value: ValueRef) -> FromSqlResult<Self> {
-        let bytes = value.as_blob()?;
-        Ok(FileSha1(
-            bytes
-                .into_iter()
-                .map(|val| val.to_owned())
-                .collect::<Vec<u8>>(),
-        ))
-    }
+    pub mtime: Timestamp,
 }

 #[derive(Debug)]
@@ -133,7 +79,8 @@ impl VectorDatabase {
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             worktree_id INTEGER NOT NULL,
             relative_path VARCHAR NOT NULL,
-            sha1 BLOB NOT NULL,
+            mtime_seconds INTEGER NOT NULL,
+            mtime_nanos INTEGER NOT NULL,
             FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
             )",
             [],
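SQLite has no dedicated type for nanosecond-precision timestamps, so the mtime is stored across two INTEGER columns. A round-trip sketch of that split, assuming plain Unix-epoch math (the commit itself routes this through rpc::proto::Timestamp):

    use std::time::{Duration, SystemTime, UNIX_EPOCH};

    /// Split a SystemTime into (seconds, subsecond nanos) since the Unix
    /// epoch, matching the mtime_seconds / mtime_nanos columns above.
    fn to_columns(mtime: SystemTime) -> (u64, u32) {
        let d = mtime.duration_since(UNIX_EPOCH).unwrap_or(Duration::ZERO);
        (d.as_secs(), d.subsec_nanos())
    }

    /// Rebuild the SystemTime from the two stored columns.
    fn from_columns(seconds: u64, nanos: u32) -> SystemTime {
        UNIX_EPOCH + Duration::new(seconds, nanos)
    }

Keeping both halves as integers preserves exact equality across the round trip, which matters because the later comparison is `==`, not a tolerance check.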
@@ -170,11 +117,20 @@ impl VectorDatabase {
             ",
             params![worktree_id, indexed_file.path.to_str()],
         )?;
+        let mtime = Timestamp::from(indexed_file.mtime);
         self.db.execute(
             "
-            INSERT INTO files (worktree_id, relative_path, sha1) VALUES (?1, ?2, $3);
+            INSERT INTO files
+            (worktree_id, relative_path, mtime_seconds, mtime_nanos)
+            VALUES
+            (?1, ?2, $3, $4);
             ",
-            params![worktree_id, indexed_file.path.to_str(), indexed_file.sha1],
+            params![
+                worktree_id,
+                indexed_file.path.to_str(),
+                mtime.seconds,
+                mtime.nanos
+            ],
         )?;

         let file_id = self.db.last_insert_rowid();
@@ -224,13 +180,24 @@ impl VectorDatabase {
         Ok(self.db.last_insert_rowid())
     }

-    pub fn get_file_hashes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, FileSha1>> {
+    pub fn get_file_mtimes(&self, worktree_id: i64) -> Result<HashMap<PathBuf, SystemTime>> {
         let mut statement = self.db.prepare(
-            "SELECT relative_path, sha1 FROM files WHERE worktree_id = ?1 ORDER BY relative_path",
+            "
+            SELECT relative_path, mtime_seconds, mtime_nanos
+            FROM files
+            WHERE worktree_id = ?1
+            ORDER BY relative_path",
         )?;
-        let mut result: HashMap<PathBuf, FileSha1> = HashMap::new();
+        let mut result: HashMap<PathBuf, SystemTime> = HashMap::new();
         for row in statement.query_map(params![worktree_id], |row| {
-            Ok((row.get::<_, String>(0)?.into(), row.get(1)?))
+            Ok((
+                row.get::<_, String>(0)?.into(),
+                Timestamp {
+                    seconds: row.get(1)?,
+                    nanos: row.get(2)?,
+                }
+                .into(),
+            ))
         })? {
             let row = row?;
             result.insert(row.0, row.1);
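Reading the table back produces the per-worktree map that the scanner later compares against. A self-contained sketch of the same query with rusqlite, reconstructing SystemTime directly from the two columns (names follow the hunk above; error handling simplified):

    use std::{
        collections::HashMap,
        path::PathBuf,
        time::{Duration, SystemTime, UNIX_EPOCH},
    };

    use rusqlite::{params, Connection, Result};

    /// Load the stored mtime of every file in one worktree, keyed by
    /// relative path. Sketch of the new get_file_mtimes query.
    fn file_mtimes(db: &Connection, worktree_id: i64) -> Result<HashMap<PathBuf, SystemTime>> {
        let mut stmt = db.prepare(
            "SELECT relative_path, mtime_seconds, mtime_nanos
             FROM files
             WHERE worktree_id = ?1",
        )?;
        let rows = stmt.query_map(params![worktree_id], |row| {
            let path: String = row.get(0)?;
            let seconds: u64 = row.get(1)?;
            let nanos: u32 = row.get(2)?;
            Ok((PathBuf::from(path), UNIX_EPOCH + Duration::new(seconds, nanos)))
        })?;
        rows.collect()
    }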
crates/vector_store/src/vector_store.rs:

@@ -6,7 +6,7 @@ mod modal;
 mod vector_store_tests;

 use anyhow::{anyhow, Result};
-use db::{FileSha1, VectorDatabase};
+use db::VectorDatabase;
 use embedding::{EmbeddingProvider, OpenAIEmbeddings};
 use gpui::{AppContext, Entity, ModelContext, ModelHandle, Task, ViewContext};
 use language::{Language, LanguageRegistry};
@@ -15,9 +15,10 @@ use project::{Fs, Project, WorktreeId};
 use smol::channel;
 use std::{
     cmp::Ordering,
-    collections::{HashMap, HashSet},
+    collections::HashMap,
     path::{Path, PathBuf},
     sync::Arc,
+    time::SystemTime,
 };
 use tree_sitter::{Parser, QueryCursor};
 use util::{
@@ -46,6 +47,7 @@ pub fn init(
         VectorStore::new(
             fs,
             db_file_path,
+            // Arc::new(embedding::DummyEmbeddings {}),
             Arc::new(OpenAIEmbeddings {
                 client: http_client,
             }),
@@ -91,7 +93,7 @@ pub fn init(
 #[derive(Debug)]
 pub struct IndexedFile {
     path: PathBuf,
-    sha1: FileSha1,
+    mtime: SystemTime,
     documents: Vec<Document>,
 }

@@ -131,9 +133,10 @@ impl VectorStore {
         cursor: &mut QueryCursor,
         parser: &mut Parser,
         embedding_provider: &dyn EmbeddingProvider,
+        fs: &Arc<dyn Fs>,
         language: Arc<Language>,
         file_path: PathBuf,
-        content: String,
+        mtime: SystemTime,
     ) -> Result<IndexedFile> {
         let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?;
         let embedding_config = grammar
@@ -141,6 +144,8 @@ impl VectorStore {
             .as_ref()
             .ok_or_else(|| anyhow!("no outline query"))?;

+        let content = fs.load(&file_path).await?;
+
         parser.set_language(grammar.ts_language).unwrap();
         let tree = parser
             .parse(&content, None)
@@ -184,11 +189,9 @@ impl VectorStore {
             }
         }

-        let sha1 = FileSha1::from_str(content);
-
         return Ok(IndexedFile {
             path: file_path,
-            sha1,
+            mtime,
             documents,
         });
     }
@@ -231,38 +234,36 @@ impl VectorStore {

            // Here we query the worktree ids, and yet we dont have them elsewhere
            // We likely want to clean up these datastructures
-           let (db, worktree_hashes, worktree_db_ids) = cx
+           let (db, mut worktree_file_times, worktree_db_ids) = cx
                .background()
                .spawn({
                    let worktrees = worktrees.clone();
                    async move {
                        let mut worktree_db_ids: HashMap<WorktreeId, i64> = HashMap::new();
-                       let mut hashes: HashMap<WorktreeId, HashMap<PathBuf, FileSha1>> =
+                       let mut file_times: HashMap<WorktreeId, HashMap<PathBuf, SystemTime>> =
                            HashMap::new();
                        for worktree in worktrees {
                            let worktree_db_id =
                                db.find_or_create_worktree(worktree.abs_path().as_ref())?;
                            worktree_db_ids.insert(worktree.id(), worktree_db_id);
-                           hashes.insert(worktree.id(), db.get_file_hashes(worktree_db_id)?);
+                           file_times.insert(worktree.id(), db.get_file_mtimes(worktree_db_id)?);
                        }
-                       anyhow::Ok((db, hashes, worktree_db_ids))
+                       anyhow::Ok((db, file_times, worktree_db_ids))
                    }
                })
                .await?;

            let (paths_tx, paths_rx) =
-               channel::unbounded::<(i64, PathBuf, String, Arc<Language>)>();
+               channel::unbounded::<(i64, PathBuf, Arc<Language>, SystemTime)>();
            let (delete_paths_tx, delete_paths_rx) = channel::unbounded::<(i64, PathBuf)>();
            let (indexed_files_tx, indexed_files_rx) = channel::unbounded::<(i64, IndexedFile)>();
            cx.background()
                .spawn({
                    let fs = fs.clone();
                    let worktree_db_ids = worktree_db_ids.clone();
                    async move {
                        for worktree in worktrees.into_iter() {
-                           let file_hashes = &worktree_hashes[&worktree.id()];
-                           let mut files_included =
-                               file_hashes.keys().collect::<HashSet<&PathBuf>>();
+                           let mut file_mtimes =
+                               worktree_file_times.remove(&worktree.id()).unwrap();
                            for file in worktree.files(false, 0) {
                                let absolute_path = worktree.absolutize(&file.path);

@@ -278,30 +279,26 @@ impl VectorStore {
                                    continue;
                                }

-                               if let Some(content) = fs.load(&absolute_path).await.log_err() {
-                                   let path_buf = file.path.to_path_buf();
-                                   let already_stored = file_hashes.get(&path_buf).map_or(
-                                       false,
-                                       |existing_hash| {
-                                           files_included.remove(&path_buf);
-                                           existing_hash.equals(&content)
-                                       },
-                                   );
+                               let path_buf = file.path.to_path_buf();
+                               let stored_mtime = file_mtimes.remove(&file.path.to_path_buf());
+                               let already_stored = stored_mtime
+                                   .map_or(false, |existing_mtime| {
+                                       existing_mtime == file.mtime
+                                   });

-                                   if !already_stored {
-                                       paths_tx
-                                           .try_send((
-                                               worktree_db_ids[&worktree.id()],
-                                               path_buf,
-                                               content,
-                                               language,
-                                           ))
-                                           .unwrap();
-                                   }
-                               }
+                               if !already_stored {
+                                   paths_tx
+                                       .try_send((
+                                           worktree_db_ids[&worktree.id()],
+                                           path_buf,
+                                           language,
+                                           file.mtime,
+                                       ))
+                                       .unwrap();
+                               }
                            }
-                           for file in files_included {
+                           for file in file_mtimes.keys() {
                                delete_paths_tx
                                    .try_send((worktree_db_ids[&worktree.id()], file.to_owned()))
                                    .unwrap();
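Note how the scan doubles as deletion detection: every file found on disk is removed from the stored-mtime map, so whatever remains afterwards must have been deleted and is sent down delete_paths_tx. Condensed into a standalone function (hypothetical types, same logic):

    use std::{collections::HashMap, path::PathBuf, time::SystemTime};

    /// Partition one worktree scan into files that need (re)indexing and
    /// files to purge from the index. Condensed, hypothetical version of
    /// the loop above.
    fn diff_worktree(
        mut stored: HashMap<PathBuf, SystemTime>,
        on_disk: &[(PathBuf, SystemTime)],
    ) -> (Vec<PathBuf>, Vec<PathBuf>) {
        let mut to_index = Vec::new();
        for (path, mtime) in on_disk {
            // Removing as we go leaves only deleted files behind.
            let already_stored = stored.remove(path).map_or(false, |m| m == *mtime);
            if !already_stored {
                to_index.push(path.clone());
            }
        }
        let to_delete = stored.into_keys().collect();
        (to_index, to_delete)
    }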
@@ -336,16 +333,17 @@ impl VectorStore {
                    scope.spawn(async {
                        let mut parser = Parser::new();
                        let mut cursor = QueryCursor::new();
-                       while let Ok((worktree_id, file_path, content, language)) =
+                       while let Ok((worktree_id, file_path, language, mtime)) =
                            paths_rx.recv().await
                        {
                            if let Some(indexed_file) = Self::index_file(
                                &mut cursor,
                                &mut parser,
                                embedding_provider.as_ref(),
+                               &fs,
                                language,
                                file_path,
-                               content,
+                               mtime,
                            )
                            .await
                            .log_err()
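Because content is no longer loaded up front, the channel payload shrinks to (worktree_id, path, language, mtime) and each worker loads the file itself inside index_file. A toy version of this fan-out over smol channels (the worker body is a stand-in for load + parse + embed + store):

    use smol::channel;

    fn main() {
        smol::block_on(async {
            let (tx, rx) = channel::unbounded::<(i64, String)>();

            // Several workers drain the same receiver concurrently.
            let workers: Vec<_> = (0..2)
                .map(|_| {
                    let rx = rx.clone();
                    smol::spawn(async move {
                        while let Ok((worktree_id, path)) = rx.recv().await {
                            // Stand-in for: load file, parse, embed, store.
                            println!("indexing {path} in worktree {worktree_id}");
                        }
                    })
                })
                .collect();

            tx.send((1, "src/db.rs".into())).await.unwrap();
            drop(tx); // Closing the sender lets workers exit their loops.
            for worker in workers {
                worker.await;
            }
        });
    }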
@@ -395,6 +393,8 @@ impl VectorStore {
                })
                .collect::<Vec<_>>();

+        log::info!("Searching for: {:?}", phrase);
+
        let embedding_provider = self.embedding_provider.clone();
        let database_url = self.database_url.clone();
        cx.spawn(|this, cx| async move {