WIP: started DB creation and naive inserts

This commit is contained in:
KCaverly 2023-06-22 13:25:33 -04:00
parent 80a894b829
commit d4a4db42aa
4 changed files with 161 additions and 7 deletions

19
Cargo.lock generated
View File

@ -1389,6 +1389,15 @@ dependencies = [
"theme",
]
[[package]]
name = "conv"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ff10625fd0ac447827aa30ea8b861fead473bb60aeb73af6c1c58caf0d1299"
dependencies = [
"custom_derive",
]
[[package]]
name = "copilot"
version = "0.1.0"
@ -1766,6 +1775,12 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "custom_derive"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9"
[[package]]
name = "cxx"
version = "1.0.94"
@ -7882,11 +7897,15 @@ name = "vector_store"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compat",
"conv",
"futures 0.3.28",
"gpui",
"language",
"project",
"rand 0.8.5",
"smol",
"sqlx",
"util",
"workspace",
]

View File

@ -17,6 +17,10 @@ util = { path = "../util" }
anyhow.workspace = true
futures.workspace = true
smol.workspace = true
sqlx = { version = "0.6", features = ["sqlite","runtime-tokio-rustls"] }
async-compat = "0.2.1"
conv = "0.3.3"
rand.workspace = true
[dev-dependencies]
gpui = { path = "../gpui", features = ["test-support"] }

View File

@ -0,0 +1,107 @@
use anyhow::Result;
use async_compat::{Compat, CompatExt};
use conv::ValueFrom;
use sqlx::{migrate::MigrateDatabase, Pool, Sqlite, SqlitePool};
use std::time::{Duration, Instant};
use crate::IndexedFile;
// This is saving to a local database store within the user's dev Zed path.
// Where do we want this to sit?
// Assuming near where the workspace DB sits.
const VECTOR_DB_URL: &str = "embeddings_db";
/// Thin namespace around the embeddings SQLite database: creates the
/// schema and writes indexed files plus their per-document embeddings.
pub struct VectorDatabase {}

impl VectorDatabase {
    /// Creates the SQLite database at `VECTOR_DB_URL` if it does not exist,
    /// then ensures the `files` and `documents` tables are present.
    ///
    /// Returns any sqlx error encountered while creating the database,
    /// connecting, or running the DDL statements.
    pub async fn initialize_database() -> Result<()> {
        // If the database doesn't exist yet, create it. A failed existence
        // check is treated as "missing" so we still attempt creation.
        if !Sqlite::database_exists(VECTOR_DB_URL)
            .compat()
            .await
            .unwrap_or(false)
        {
            Sqlite::create_database(VECTOR_DB_URL).compat().await?;
        }

        let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;

        // Initialize vector database tables.
        // We may be able to skip this assuming the database is never created
        // without creating the tables at the same time.
        sqlx::query(
            "CREATE TABLE IF NOT EXISTS files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            path NVARCHAR(100) NOT NULL,
            sha1 NVARCHAR(40) NOT NULL
            )",
        )
        .execute(&db)
        .compat()
        .await?;

        sqlx::query(
            "CREATE TABLE IF NOT EXISTS documents (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_id INTEGER NOT NULL,
            offset INTEGER NOT NULL,
            name NVARCHAR(100) NOT NULL,
            embedding BLOB NOT NULL,
            FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
            )",
        )
        .execute(&db)
        .compat()
        .await?;

        Ok(())
    }

    /// Inserts `indexed_file` into the `files` table and each of its
    /// documents (with a serialized embedding blob) into `documents`,
    /// linked via the generated file row id.
    pub async fn insert_file(indexed_file: IndexedFile) -> Result<()> {
        // NOTE(review): this opens a fresh connection pool on every call;
        // consider sharing one pool across inserts once the API settles.
        let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;

        // Write to files table, and keep the generated id so the documents
        // below can reference their parent row.
        let files_insert = sqlx::query("INSERT INTO files (path, sha1) VALUES ($1, $2)")
            .bind(indexed_file.path.to_str())
            .bind(indexed_file.sha1)
            .execute(&db)
            .compat()
            .await?;
        let inserted_id = files_insert.last_insert_rowid();

        // Serialize an embedding as the concatenation of each f32's bit
        // pattern rendered as a fixed-width 32-character binary string, so
        // the decoder below can slice it apart without any separator.
        // (The previous encoding joined *decimal* bit strings with ';',
        // which `get_values_from_binary` could not parse back — the
        // round-trip was broken.)
        // I imagine there is a better way to serialize to/from blob.
        fn get_binary_from_values(values: Vec<f32>) -> String {
            values
                .iter()
                .map(|v| format!("{:032b}", v.to_bits()))
                .collect()
        }

        // Inverse of `get_binary_from_values`: slice into 32-char chunks and
        // reinterpret each as an f32 bit pattern. Unused for now; kept for
        // the upcoming read path.
        #[allow(dead_code)]
        fn get_values_from_binary(bin: &str) -> Vec<f32> {
            (0..bin.len() / 32)
                .map(|i| {
                    let start = i * 32;
                    let end = start + 32;
                    f32::from_bits(u32::from_str_radix(&bin[start..end], 2).unwrap())
                })
                .collect()
        }

        // Currently inserting at approximately 3400 documents a second.
        // I imagine we can speed this up with a bulk insert or a single
        // transaction of some kind.
        for document in indexed_file.documents {
            sqlx::query(
                "INSERT INTO documents (file_id, offset, name, embedding) VALUES ($1, $2, $3, $4)",
            )
            .bind(inserted_id)
            // Bind as INTEGER to match the schema (was previously bound as
            // a decimal string and coerced by SQLite).
            .bind(i64::try_from(document.offset)?)
            .bind(document.name)
            .bind(get_binary_from_values(document.embedding))
            .execute(&db)
            .compat()
            .await?;
        }
        Ok(())
    }
}

View File

@ -1,9 +1,12 @@
use anyhow::{anyhow, Result};
mod db;
use anyhow::Result;
use db::VectorDatabase;
use gpui::{AppContext, Entity, ModelContext, ModelHandle};
use language::LanguageRegistry;
use project::{Fs, Project};
use rand::Rng;
use smol::channel;
use std::{path::PathBuf, sync::Arc};
use std::{path::PathBuf, sync::Arc, time::Instant};
use util::ResultExt;
use workspace::WorkspaceCreated;
@ -27,13 +30,15 @@ pub fn init(fs: Arc<dyn Fs>, language_registry: Arc<LanguageRegistry>, cx: &mut
.detach();
}
// A single embeddable unit extracted from an indexed file; one row in the
// `documents` table (see db.rs), keyed to its parent file.
#[derive(Debug, sqlx::FromRow)]
struct Document {
    // Position of the symbol within the file — assumed to be a byte offset;
    // TODO confirm (currently only ever set to 0 by the dummy indexer).
    offset: usize,
    // Symbol name this document represents.
    name: String,
    // Embedding vector for the document; serialized to a blob on insert.
    embedding: Vec<f32>,
}
struct IndexedFile {
#[derive(Debug, sqlx::FromRow)]
pub struct IndexedFile {
path: PathBuf,
sha1: String,
documents: Vec<Document>,
@ -64,9 +69,24 @@ impl VectorStore {
language_registry: &Arc<LanguageRegistry>,
file_path: PathBuf,
) -> Result<IndexedFile> {
eprintln!("indexing file {file_path:?}");
Err(anyhow!("not implemented"))
// todo!();
// This is creating dummy documents to test the database writes.
let mut documents = vec![];
let mut rng = rand::thread_rng();
let rand_num_of_documents: u8 = rng.gen_range(0..200);
for _ in 0..rand_num_of_documents {
let doc = Document {
offset: 0,
name: "test symbol".to_string(),
embedding: vec![0.32 as f32; 768],
};
documents.push(doc);
}
return Ok(IndexedFile {
path: file_path,
sha1: "asdfasdfasdf".to_string(),
documents,
});
}
fn add_project(&mut self, project: ModelHandle<Project>, cx: &mut ModelContext<Self>) {
@ -100,13 +120,17 @@ impl VectorStore {
}
})
.detach();
cx.background()
.spawn(async move {
// Initialize Database, creates database and tables if not exists
VectorDatabase::initialize_database().await.log_err();
while let Ok(indexed_file) = indexed_files_rx.recv().await {
// write document to database
VectorDatabase::insert_file(indexed_file).await.log_err();
}
})
.detach();
cx.background()
.scoped(|scope| {
for _ in 0..cx.background().num_cpus() {