mirror of
https://github.com/zed-industries/zed.git
synced 2024-11-08 15:44:31 +03:00
WIP: started DB creating and naive inserts
This commit is contained in:
parent
80a894b829
commit
d4a4db42aa
19
Cargo.lock
generated
19
Cargo.lock
generated
@ -1389,6 +1389,15 @@ dependencies = [
|
||||
"theme",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "conv"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ff10625fd0ac447827aa30ea8b861fead473bb60aeb73af6c1c58caf0d1299"
|
||||
dependencies = [
|
||||
"custom_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "copilot"
|
||||
version = "0.1.0"
|
||||
@ -1766,6 +1775,12 @@ dependencies = [
|
||||
"winapi 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "custom_derive"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9"
|
||||
|
||||
[[package]]
|
||||
name = "cxx"
|
||||
version = "1.0.94"
|
||||
@ -7882,11 +7897,15 @@ name = "vector_store"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compat",
|
||||
"conv",
|
||||
"futures 0.3.28",
|
||||
"gpui",
|
||||
"language",
|
||||
"project",
|
||||
"rand 0.8.5",
|
||||
"smol",
|
||||
"sqlx",
|
||||
"util",
|
||||
"workspace",
|
||||
]
|
||||
|
@ -17,6 +17,10 @@ util = { path = "../util" }
|
||||
anyhow.workspace = true
|
||||
futures.workspace = true
|
||||
smol.workspace = true
|
||||
sqlx = { version = "0.6", features = ["sqlite","runtime-tokio-rustls"] }
|
||||
async-compat = "0.2.1"
|
||||
conv = "0.3.3"
|
||||
rand.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
gpui = { path = "../gpui", features = ["test-support"] }
|
||||
|
107
crates/vector_store/src/db.rs
Normal file
107
crates/vector_store/src/db.rs
Normal file
@ -0,0 +1,107 @@
|
||||
use anyhow::Result;
|
||||
use async_compat::{Compat, CompatExt};
|
||||
use conv::ValueFrom;
|
||||
use sqlx::{migrate::MigrateDatabase, Pool, Sqlite, SqlitePool};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::IndexedFile;
|
||||
|
||||
// This saves to a local database store within the user's dev Zed path.
|
||||
// Where do we want this to sit?
|
||||
// Assuming near where the workspace DB sits.
|
||||
const VECTOR_DB_URL: &str = "embeddings_db";
|
||||
|
||||
/// Stateless type that groups the embeddings-database operations.
///
/// It carries no fields; each associated function opens its own connection
/// to `VECTOR_DB_URL`.
pub struct VectorDatabase {}
|
||||
|
||||
impl VectorDatabase {
|
||||
pub async fn initialize_database() -> Result<()> {
|
||||
// If database doesnt exist create database
|
||||
if !Sqlite::database_exists(VECTOR_DB_URL)
|
||||
.compat()
|
||||
.await
|
||||
.unwrap_or(false)
|
||||
{
|
||||
Sqlite::create_database(VECTOR_DB_URL).compat().await?;
|
||||
}
|
||||
|
||||
let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;
|
||||
|
||||
// Initialize Vector Databasing Tables
|
||||
// We may be able to skip this assuming the database is never created
|
||||
// without creating the tables at the same time.
|
||||
sqlx::query(
|
||||
"CREATE TABLE IF NOT EXISTS files (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
path NVARCHAR(100) NOT NULL,
|
||||
sha1 NVARCHAR(40) NOT NULL
|
||||
)",
|
||||
)
|
||||
.execute(&db)
|
||||
.compat()
|
||||
.await?;
|
||||
|
||||
sqlx::query(
|
||||
"CREATE TABLE IF NOT EXISTS documents (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
file_id INTEGER NOT NULL,
|
||||
offset INTEGER NOT NULL,
|
||||
name NVARCHAR(100) NOT NULL,
|
||||
embedding BLOB NOT NULL,
|
||||
FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
|
||||
)",
|
||||
)
|
||||
.execute(&db)
|
||||
.compat()
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn insert_file(indexed_file: IndexedFile) -> Result<()> {
|
||||
// Write to files table, and return generated id.
|
||||
let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;
|
||||
|
||||
let files_insert = sqlx::query("INSERT INTO files (path, sha1) VALUES ($1, $2)")
|
||||
.bind(indexed_file.path.to_str())
|
||||
.bind(indexed_file.sha1)
|
||||
.execute(&db)
|
||||
.compat()
|
||||
.await?;
|
||||
|
||||
let inserted_id = files_insert.last_insert_rowid();
|
||||
|
||||
// I stole this from https://stackoverflow.com/questions/71829931/how-do-i-convert-a-negative-f32-value-to-binary-string-and-back-again
|
||||
// I imagine there is a better way to serialize to/from blob
|
||||
fn get_binary_from_values(values: Vec<f32>) -> String {
|
||||
let bits: Vec<_> = values.iter().map(|v| v.to_bits().to_string()).collect();
|
||||
bits.join(";")
|
||||
}
|
||||
|
||||
fn get_values_from_binary(bin: &str) -> Vec<f32> {
|
||||
(0..bin.len() / 32)
|
||||
.map(|i| {
|
||||
let start = i * 32;
|
||||
let end = start + 32;
|
||||
f32::from_bits(u32::from_str_radix(&bin[start..end], 2).unwrap())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Currently inserting at approximately 3400 documents a second
|
||||
// I imagine we can speed this up with a bulk insert of some kind.
|
||||
for document in indexed_file.documents {
|
||||
sqlx::query(
|
||||
"INSERT INTO documents (file_id, offset, name, embedding) VALUES ($1, $2, $3, $4)",
|
||||
)
|
||||
.bind(inserted_id)
|
||||
.bind(document.offset.to_string())
|
||||
.bind(document.name)
|
||||
.bind(get_binary_from_values(document.embedding))
|
||||
.execute(&db)
|
||||
.compat()
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
@ -1,9 +1,12 @@
|
||||
use anyhow::{anyhow, Result};
|
||||
mod db;
|
||||
use anyhow::Result;
|
||||
use db::VectorDatabase;
|
||||
use gpui::{AppContext, Entity, ModelContext, ModelHandle};
|
||||
use language::LanguageRegistry;
|
||||
use project::{Fs, Project};
|
||||
use rand::Rng;
|
||||
use smol::channel;
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
use std::{path::PathBuf, sync::Arc, time::Instant};
|
||||
use util::ResultExt;
|
||||
use workspace::WorkspaceCreated;
|
||||
|
||||
@ -27,13 +30,15 @@ pub fn init(fs: Arc<dyn Fs>, language_registry: Arc<LanguageRegistry>, cx: &mut
|
||||
.detach();
|
||||
}
|
||||
|
||||
/// One embedded entry belonging to an indexed file; `insert_file` writes
/// each of these as a row of the `documents` table.
#[derive(Debug, sqlx::FromRow)]
|
||||
struct Document {
|
||||
// Stored in the `offset` column. Presumably the entry's position within
// the file, but the unit is not shown here — TODO confirm.
offset: usize,
|
||||
// Stored in the `name` column. The placeholder indexer uses "test symbol",
// so presumably a symbol name — TODO confirm.
name: String,
|
||||
// Embedding vector stored in the `embedding` column; the placeholder
// indexer fills it with 768 values.
embedding: Vec<f32>,
|
||||
}
|
||||
|
||||
struct IndexedFile {
|
||||
#[derive(Debug, sqlx::FromRow)]
|
||||
pub struct IndexedFile {
|
||||
path: PathBuf,
|
||||
sha1: String,
|
||||
documents: Vec<Document>,
|
||||
@ -64,9 +69,24 @@ impl VectorStore {
|
||||
language_registry: &Arc<LanguageRegistry>,
|
||||
file_path: PathBuf,
|
||||
) -> Result<IndexedFile> {
|
||||
eprintln!("indexing file {file_path:?}");
|
||||
Err(anyhow!("not implemented"))
|
||||
// todo!();
|
||||
// This is creating dummy documents to test the database writes.
|
||||
let mut documents = vec![];
|
||||
let mut rng = rand::thread_rng();
|
||||
let rand_num_of_documents: u8 = rng.gen_range(0..200);
|
||||
for _ in 0..rand_num_of_documents {
|
||||
let doc = Document {
|
||||
offset: 0,
|
||||
name: "test symbol".to_string(),
|
||||
embedding: vec![0.32 as f32; 768],
|
||||
};
|
||||
documents.push(doc);
|
||||
}
|
||||
|
||||
return Ok(IndexedFile {
|
||||
path: file_path,
|
||||
sha1: "asdfasdfasdf".to_string(),
|
||||
documents,
|
||||
});
|
||||
}
|
||||
|
||||
fn add_project(&mut self, project: ModelHandle<Project>, cx: &mut ModelContext<Self>) {
|
||||
@ -100,13 +120,17 @@ impl VectorStore {
|
||||
}
|
||||
})
|
||||
.detach();
|
||||
|
||||
cx.background()
|
||||
.spawn(async move {
|
||||
// Initialize Database, creates database and tables if not exists
|
||||
VectorDatabase::initialize_database().await.log_err();
|
||||
while let Ok(indexed_file) = indexed_files_rx.recv().await {
|
||||
// write document to database
|
||||
VectorDatabase::insert_file(indexed_file).await.log_err();
|
||||
}
|
||||
})
|
||||
.detach();
|
||||
|
||||
cx.background()
|
||||
.scoped(|scope| {
|
||||
for _ in 0..cx.background().num_cpus() {
|
||||
|
Loading…
Reference in New Issue
Block a user