From cbe487ca3a646694c8613f0d35934dd9cbc7c342 Mon Sep 17 00:00:00 2001
From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com>
Date: Sun, 16 Jun 2024 19:33:09 -0700
Subject: [PATCH] Almost working RAG

---
 Cargo.lock                                    |  1 -
 crates/lsp-ai/Cargo.toml                      |  2 +-
 crates/lsp-ai/src/config.rs                   | 10 ++++-
 crates/lsp-ai/src/crawl.rs                    | 13 +++++-
 .../src/memory_backends/postgresml/mod.rs     | 42 +++++++++++--------
 crates/lsp-ai/src/splitters/mod.rs            |  4 ++
 crates/lsp-ai/src/splitters/text_splitter.rs  | 40 ++++++++++++++++++
 crates/lsp-ai/src/splitters/tree_sitter.rs    | 15 +++----
 8 files changed, 97 insertions(+), 30 deletions(-)
 create mode 100644 crates/lsp-ai/src/splitters/text_splitter.rs

diff --git a/Cargo.lock b/Cargo.lock
index 20cdfff..9d6149a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3182,7 +3182,6 @@ dependencies = [
  "regex",
  "strum",
  "thiserror",
- "tree-sitter",
  "unicode-segmentation",
 ]
 
diff --git a/crates/lsp-ai/Cargo.toml b/crates/lsp-ai/Cargo.toml
index 1379ac0..bdb6223 100644
--- a/crates/lsp-ai/Cargo.toml
+++ b/crates/lsp-ai/Cargo.toml
@@ -35,7 +35,7 @@ async-trait = "0.1.78"
 tree-sitter = "0.22"
 utils-tree-sitter = { workspace = true, features = ["all"] }
 splitter-tree-sitter = { workspace = true }
-text-splitter = { version = "0.13.3", features = ["code"] }
+text-splitter = { version = "0.13.3" }
 
 [build-dependencies]
 cc="*"
diff --git a/crates/lsp-ai/src/config.rs b/crates/lsp-ai/src/config.rs
index 49f8e54..0c2e764 100644
--- a/crates/lsp-ai/src/config.rs
+++ b/crates/lsp-ai/src/config.rs
@@ -28,6 +28,8 @@ impl Default for PostProcess {
 pub enum ValidSplitter {
     #[serde(rename = "tree_sitter")]
     TreeSitter(TreeSitter),
+    #[serde(rename = "text_splitter")]
+    TextSplitter(TextSplitter),
 }
 
 impl Default for ValidSplitter {
@@ -61,6 +63,12 @@ impl Default for TreeSitter {
     }
 }
 
+#[derive(Debug, Clone, Deserialize)]
+pub struct TextSplitter {
+    #[serde(default = "chunk_size_default")]
+    pub chunk_size: usize,
+}
+
 #[derive(Debug, Clone, Deserialize)]
 pub enum ValidMemoryBackend {
     #[serde(rename = "file_store")]
@@ -123,7 +131,7 @@ pub struct FIM {
 }
 
 const fn max_crawl_memory_default() -> u64 {
-    42
+    100_000_000
 }
 
 const fn max_crawl_file_size_default() -> u64 {
diff --git a/crates/lsp-ai/src/crawl.rs b/crates/lsp-ai/src/crawl.rs
index beade33..edbd56c 100644
--- a/crates/lsp-ai/src/crawl.rs
+++ b/crates/lsp-ai/src/crawl.rs
@@ -8,6 +8,7 @@ pub struct Crawl {
     crawl_config: config::Crawl,
     config: Config,
     crawled_file_types: HashSet<String>,
+    crawled_all: bool,
 }
 
 impl Crawl {
@@ -16,6 +17,7 @@ impl Crawl {
             crawl_config,
             config,
             crawled_file_types: HashSet::new(),
+            crawled_all: false,
         }
     }
 
@@ -25,6 +27,10 @@ impl Crawl {
         triggered_file: Option<String>,
        mut f: impl FnMut(&config::Crawl, &str) -> anyhow::Result<bool>,
     ) -> anyhow::Result<()> {
+        if self.crawled_all {
+            return Ok(());
+        }
+
         if let Some(root_uri) = &self.config.client_params.root_uri {
             if !root_uri.starts_with("file://") {
                 anyhow::bail!("Skipping crawling as root_uri does not begin with file://")
             }
@@ -51,13 +57,14 @@
             for result in WalkBuilder::new(&root_uri[7..]).build() {
                 let result = result?;
                 let path = result.path();
+                eprintln!("CRAWLING: {}", path.display());
                 if !path.is_dir() {
                     if let Some(path_str) = path.to_str() {
                         if self.crawl_config.all_files {
                             match f(&self.crawl_config, path_str) {
                                 Ok(c) => {
                                     if !c {
-                                        return Ok(());
+                                        break;
                                     }
                                 }
                                 Err(e) => error!("{e:?}"),
@@ -72,7 +79,7 @@
                             match f(&self.crawl_config, path_str) {
                                 Ok(c) => {
                                     if !c {
-                                        return Ok(());
+                                        break;
                                     }
                                 }
                                 Err(e) => error!("{e:?}"),
@@ -88,6 +95,8 @@ impl Crawl {
             if let Some(extension_to_match) = extension_to_match {
                 self.crawled_file_types.insert(extension_to_match);
+            } else {
+                self.crawled_all = true
             }
         }
         Ok(())
diff --git a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs
index 2281efc..a849aa8 100644
--- a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs
+++ b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs
@@ -245,7 +245,7 @@ impl PostgresML {
 
     fn maybe_do_crawl(&self, triggered_file: Option<String>) -> anyhow::Result<()> {
         if let Some(crawl) = &self.crawl {
-            let mut documents: Vec<(String, Vec<Chunk>)> = vec![];
+            let mut documents = vec![];
             let mut total_bytes = 0;
             let mut current_bytes = 0;
             crawl
@@ -253,7 +253,7 @@
                 .maybe_do_crawl(triggered_file, |config, path| {
                     // Break if total bytes is over the max crawl memory
                     if total_bytes as u64 >= config.max_crawl_memory {
-                        warn!("Ending crawl early due to `max_crawl_memory` resetraint");
+                        warn!("Ending crawl early due to `max_crawl_memory` restraint");
                         return Ok(false);
                     }
                     // This means it has been opened before
@@ -274,26 +274,19 @@
                     let contents = String::from_utf8(contents)?;
                     current_bytes += contents.len();
                     total_bytes += contents.len();
-                    let chunks = self.splitter.split_file_contents(&uri, &contents);
-                    documents.push((uri, chunks));
+                    let chunks: Vec<pgml::types::Json> = self
+                        .splitter
+                        .split_file_contents(&uri, &contents)
+                        .into_iter()
+                        .map(|chunk| chunk_to_document(&uri, chunk).into())
+                        .collect();
+                    documents.extend(chunks);
                     // If we have over 100 mega bytes of data do the upsert
                     if current_bytes >= 100_000_000
                         || total_bytes as u64 >= config.max_crawl_memory {
-                        // Prepare our chunks
-                        let to_upsert_documents: Vec<pgml::types::Json> =
-                            std::mem::take(&mut documents)
-                                .into_iter()
-                                .map(|(uri, chunks)| {
-                                    chunks
-                                        .into_iter()
-                                        .map(|chunk| chunk_to_document(&uri, chunk))
-                                        .collect::<Vec<Value>>()
-                                })
-                                .flatten()
-                                .map(|f: Value| f.into())
-                                .collect();
-                        // Do the upsert
+                        // Upsert the documents
                         let mut collection = self.collection.clone();
+                        let to_upsert_documents = std::mem::take(&mut documents);
                         TOKIO_RUNTIME.spawn(async move {
                             if let Err(e) = collection
                                 .upsert_documents(to_upsert_documents, None)
@@ -309,6 +302,19 @@
                     }
                     Ok(true)
                 })?;
+            // Upsert any remaining documents
+            if documents.len() > 0 {
+                let mut collection = self.collection.clone();
+                TOKIO_RUNTIME.spawn(async move {
+                    if let Err(e) = collection
+                        .upsert_documents(documents, None)
+                        .await
+                        .context("PGML - Error upserting changed files")
+                    {
+                        error!("{e}");
+                    }
+                });
+            }
         }
         Ok(())
     }
diff --git a/crates/lsp-ai/src/splitters/mod.rs b/crates/lsp-ai/src/splitters/mod.rs
index ed5c15a..8e310f0 100644
--- a/crates/lsp-ai/src/splitters/mod.rs
+++ b/crates/lsp-ai/src/splitters/mod.rs
@@ -2,6 +2,7 @@ use serde::Serialize;
 
 use crate::{config::ValidSplitter, memory_backends::file_store::File};
 
+mod text_splitter;
 mod tree_sitter;
 
 #[derive(Serialize)]
@@ -48,6 +49,9 @@ impl TryFrom<ValidSplitter> for Box<dyn Splitter> {
             ValidSplitter::TreeSitter(config) => {
                 Ok(Box::new(tree_sitter::TreeSitter::new(config)?))
             }
+            ValidSplitter::TextSplitter(config) => {
+                Ok(Box::new(text_splitter::TextSplitter::new(config)))
+            }
         }
     }
 }
diff --git a/crates/lsp-ai/src/splitters/text_splitter.rs b/crates/lsp-ai/src/splitters/text_splitter.rs
new file mode 100644
index 0000000..d4ad404
--- /dev/null
+++ b/crates/lsp-ai/src/splitters/text_splitter.rs
@@ -0,0 +1,40 @@
+use crate::{config, memory_backends::file_store::File};
+
+use super::{ByteRange, Chunk, Splitter};
+
+pub struct TextSplitter {
+    splitter: text_splitter::TextSplitter<text_splitter::Characters>,
+}
+
+impl TextSplitter {
+    pub fn new(config: config::TextSplitter) -> Self {
+        Self {
+            splitter: text_splitter::TextSplitter::new(config.chunk_size),
+        }
+    }
+
+    pub fn new_with_chunk_size(chunk_size: usize) -> Self {
+        Self {
+            splitter: text_splitter::TextSplitter::new(chunk_size),
+        }
+    }
+}
+
+impl Splitter for TextSplitter {
+    fn split(&self, file: &File) -> Vec<Chunk> {
+        self.split_file_contents("", &file.rope().to_string())
+    }
+
+    fn split_file_contents(&self, _uri: &str, contents: &str) -> Vec<Chunk> {
+        self.splitter
+            .chunk_indices(contents)
+            .fold(vec![], |mut acc, (start_byte, text)| {
+                let end_byte = start_byte + text.len();
+                acc.push(Chunk::new(
+                    text.to_string(),
+                    ByteRange::new(start_byte, end_byte),
+                ));
+                acc
+            })
+    }
+}
diff --git a/crates/lsp-ai/src/splitters/tree_sitter.rs b/crates/lsp-ai/src/splitters/tree_sitter.rs
index ce44185..a804d86 100644
--- a/crates/lsp-ai/src/splitters/tree_sitter.rs
+++ b/crates/lsp-ai/src/splitters/tree_sitter.rs
@@ -4,18 +4,19 @@ use tree_sitter::Tree;
 
 use crate::{config, memory_backends::file_store::File, utils::parse_tree};
 
-use super::{ByteRange, Chunk, Splitter};
+use super::{text_splitter::TextSplitter, ByteRange, Chunk, Splitter};
 
 pub struct TreeSitter {
-    _config: config::TreeSitter,
     splitter: TreeSitterCodeSplitter,
+    text_splitter: TextSplitter,
 }
 
 impl TreeSitter {
     pub fn new(config: config::TreeSitter) -> anyhow::Result<Self> {
+        let text_splitter = TextSplitter::new_with_chunk_size(config.chunk_size);
         Ok(Self {
             splitter: TreeSitterCodeSplitter::new(config.chunk_size, config.chunk_overlap)?,
-            _config: config,
+            text_splitter,
         })
     }
 
@@ -43,11 +44,11 @@
                     error!(
                         "Failed to parse tree for file with error: {e:?}. Falling back to default splitter.",
                     );
-                    todo!()
+                    self.text_splitter.split(file)
                 }
             }
         } else {
-            panic!("TreeSitter splitter requires a tree to split")
+            self.text_splitter.split(file)
         }
     }
 
@@ -59,14 +60,14 @@
                     error!(
                         "Failed to parse tree for file: {uri} with error: {e:?}. Falling back to default splitter.",
                     );
-                    todo!()
+                    self.text_splitter.split_file_contents(uri, contents)
                 }
             },
             Err(e) => {
                 error!(
                     "Failed to parse tree for file {uri} with error: {e:?}. Falling back to default splitter.",
                 );
-                todo!()
+                self.text_splitter.split_file_contents(uri, contents)
             }
         }
     }
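
---

A few reviewer notes with sketches follow; none of this is part of the commit.

The serde renames on ValidSplitter define the user-facing config shape: the enum is externally tagged, so the new splitter is selected with a {"text_splitter": {"chunk_size": ...}} object, with chunk_size falling back to chunk_size_default when omitted. A minimal deserialization sketch of that shape (the types below are simplified stand-ins for the config.rs definitions, and the chunk_size value is illustrative):

use serde::Deserialize;

// Simplified stand-in for config::TextSplitter.
#[derive(Debug, Deserialize)]
struct TextSplitterConfig {
    chunk_size: usize,
}

// Externally tagged like config::ValidSplitter: the variant name is the
// single top-level key in the config object.
#[derive(Debug, Deserialize)]
enum ValidSplitter {
    #[serde(rename = "text_splitter")]
    TextSplitter(TextSplitterConfig),
}

fn main() {
    let splitter: ValidSplitter =
        serde_json::from_str(r#"{ "text_splitter": { "chunk_size": 1500 } }"#).unwrap();
    println!("{splitter:?}");
}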
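In maybe_do_crawl, chunks are now mapped into pgml documents as files are crawled: a partial batch is upserted once roughly 100 MB has accumulated, and a trailing upsert after the crawl picks up whatever remains. One thing worth double-checking: current_bytes is never reset after a partial upsert, so once it crosses the threshold every subsequent file triggers its own upsert. A sketch of the accumulate-and-flush shape with the reset included (crawl_and_flush and flush are hypothetical stand-ins for the crawl closure and collection.upsert_documents, not the patch's types):

// Hypothetical stand-in for collection.upsert_documents(batch, None).
fn flush(batch: Vec<String>) {
    println!("upserting {} documents", batch.len());
}

fn crawl_and_flush(files: impl IntoIterator<Item = String>, threshold_bytes: usize) {
    let mut documents = Vec::new();
    let mut current_bytes = 0;
    for contents in files {
        current_bytes += contents.len();
        documents.push(contents);
        // Flush a partial batch when the running byte count crosses the
        // threshold (the patch uses 100_000_000), then reset the counter.
        if current_bytes >= threshold_bytes {
            flush(std::mem::take(&mut documents));
            current_bytes = 0;
        }
    }
    // Trailing flush for the remainder, mirroring the patch's final upsert.
    if !documents.is_empty() {
        flush(documents);
    }
}

fn main() {
    crawl_and_flush(vec!["a".repeat(40), "b".repeat(40), "c".repeat(10)], 64);
}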
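The new TextSplitter turns each (byte offset, chunk text) pair from text-splitter's chunk_indices into a Chunk with an explicit ByteRange. Because chunk_indices reports byte offsets into the original string, start_byte + text.len() is the correct exclusive end byte even for multi-byte UTF-8 input. A standalone sketch of that pattern, assuming the text-splitter 0.13 API the patch itself calls (TextSplitter::new with a character capacity, chunk_indices yielding (usize, &str) pairs); the sample input and chunk size are illustrative:

fn main() {
    let splitter = text_splitter::TextSplitter::new(16);
    let contents = "fn add(a: u32, b: u32) -> u32 {\n    a + b\n}\n";
    for (start_byte, text) in splitter.chunk_indices(contents) {
        // Recover the exclusive end byte from the chunk's length, exactly
        // as the new split_file_contents fold does.
        let end_byte = start_byte + text.len();
        println!("bytes {start_byte}..{end_byte}: {text:?}");
    }
}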