From cbe487ca3a646694c8613f0d35934dd9cbc7c342 Mon Sep 17 00:00:00 2001
From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com>
Date: Sun, 16 Jun 2024 19:33:09 -0700
Subject: [PATCH] Almost working RAG

---
 Cargo.lock                                    |  1 -
 crates/lsp-ai/Cargo.toml                      |  2 +-
 crates/lsp-ai/src/config.rs                   | 10 ++++-
 crates/lsp-ai/src/crawl.rs                    | 13 +++++-
 .../src/memory_backends/postgresml/mod.rs     | 42 +++++++++++--------
 crates/lsp-ai/src/splitters/mod.rs            |  4 ++
 crates/lsp-ai/src/splitters/text_splitter.rs  | 40 ++++++++++++++++++
 crates/lsp-ai/src/splitters/tree_sitter.rs    | 15 +++----
 8 files changed, 97 insertions(+), 30 deletions(-)
 create mode 100644 crates/lsp-ai/src/splitters/text_splitter.rs

diff --git a/Cargo.lock b/Cargo.lock
index 20cdfff..9d6149a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3182,7 +3182,6 @@ dependencies = [
  "regex",
  "strum",
  "thiserror",
- "tree-sitter",
  "unicode-segmentation",
 ]
 
diff --git a/crates/lsp-ai/Cargo.toml b/crates/lsp-ai/Cargo.toml
index 1379ac0..bdb6223 100644
--- a/crates/lsp-ai/Cargo.toml
+++ b/crates/lsp-ai/Cargo.toml
@@ -35,7 +35,7 @@ async-trait = "0.1.78"
 tree-sitter = "0.22"
 utils-tree-sitter = { workspace = true, features = ["all"] }
 splitter-tree-sitter = { workspace = true }
-text-splitter = { version = "0.13.3", features = ["code"] }
+text-splitter = { version = "0.13.3" }
 
 [build-dependencies]
 cc="*"
diff --git a/crates/lsp-ai/src/config.rs b/crates/lsp-ai/src/config.rs
index 49f8e54..0c2e764 100644
--- a/crates/lsp-ai/src/config.rs
+++ b/crates/lsp-ai/src/config.rs
@@ -28,6 +28,8 @@ impl Default for PostProcess {
 pub enum ValidSplitter {
     #[serde(rename = "tree_sitter")]
     TreeSitter(TreeSitter),
+    #[serde(rename = "text_splitter")]
+    TextSplitter(TextSplitter),
 }
 
 impl Default for ValidSplitter {
@@ -61,6 +63,12 @@ impl Default for TreeSitter {
     }
 }
 
+#[derive(Debug, Clone, Deserialize)]
+pub struct TextSplitter {
+    #[serde(default = "chunk_size_default")]
+    pub chunk_size: usize,
+}
+
 #[derive(Debug, Clone, Deserialize)]
 pub enum ValidMemoryBackend {
     #[serde(rename = "file_store")]
@@ -123,7 +131,7 @@ pub struct FIM {
 }
 
 const fn max_crawl_memory_default() -> u64 {
-    42
+    100_000_000
 }
 
 const fn max_crawl_file_size_default() -> u64 {
diff --git a/crates/lsp-ai/src/crawl.rs b/crates/lsp-ai/src/crawl.rs
index beade33..edbd56c 100644
--- a/crates/lsp-ai/src/crawl.rs
+++ b/crates/lsp-ai/src/crawl.rs
@@ -8,6 +8,7 @@ pub struct Crawl {
     crawl_config: config::Crawl,
     config: Config,
     crawled_file_types: HashSet<String>,
+    crawled_all: bool,
 }
 
 impl Crawl {
@@ -16,6 +17,7 @@ impl Crawl {
             crawl_config,
             config,
             crawled_file_types: HashSet::new(),
+            crawled_all: false,
         }
     }
 
@@ -25,6 +27,10 @@ impl Crawl {
         triggered_file: Option<String>,
        mut f: impl FnMut(&config::Crawl, &str) -> anyhow::Result<bool>,
     ) -> anyhow::Result<()> {
+        if self.crawled_all {
+            return Ok(());
+        }
+
         if let Some(root_uri) = &self.config.client_params.root_uri {
             if !root_uri.starts_with("file://") {
                 anyhow::bail!("Skipping crawling as root_uri does not begin with file://")
             }
@@ -51,13 +57,14 @@
             for result in WalkBuilder::new(&root_uri[7..]).build() {
                 let result = result?;
                 let path = result.path();
+                eprintln!("CRAWLING: {}", path.display());
                 if !path.is_dir() {
                     if let Some(path_str) = path.to_str() {
                         if self.crawl_config.all_files {
                             match f(&self.crawl_config, path_str) {
                                 Ok(c) => {
                                     if !c {
-                                        return Ok(());
+                                        break;
                                     }
                                 }
                                 Err(e) => error!("{e:?}"),
@@ -72,7 +79,7 @@
                             match f(&self.crawl_config, path_str) {
                                 Ok(c) => {
                                     if !c {
-                                        return Ok(());
+                                        break;
                                     }
                                 }
                                 Err(e) => error!("{e:?}"),
@@ -88,6 +95,8 @@ impl Crawl {
             if let Some(extension_to_match) = extension_to_match {
                 self.crawled_file_types.insert(extension_to_match);
+            } else {
+                self.crawled_all = true
             }
         }
         Ok(())
diff --git a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs
index 2281efc..a849aa8 100644
--- a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs
+++ b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs
@@ -245,7 +245,7 @@ impl PostgresML {
 
     fn maybe_do_crawl(&self, triggered_file: Option<String>) -> anyhow::Result<()> {
         if let Some(crawl) = &self.crawl {
-            let mut documents: Vec<(String, Vec<Chunk>)> = vec![];
+            let mut documents = vec![];
             let mut total_bytes = 0;
             let mut current_bytes = 0;
             crawl
@@ -253,7 +253,7 @@
                 .maybe_do_crawl(triggered_file, |config, path| {
                     // Break if total bytes is over the max crawl memory
                     if total_bytes as u64 >= config.max_crawl_memory {
-                        warn!("Ending crawl early due to `max_crawl_memory` resetraint");
+                        warn!("Ending crawl early due to `max_crawl_memory` restraint");
                         return Ok(false);
                     }
                     // This means it has been opened before
@@ -274,26 +274,19 @@
                     let contents = String::from_utf8(contents)?;
                     current_bytes += contents.len();
                     total_bytes += contents.len();
-                    let chunks = self.splitter.split_file_contents(&uri, &contents);
-                    documents.push((uri, chunks));
+                    let chunks: Vec<pgml::types::Json> = self
+                        .splitter
+                        .split_file_contents(&uri, &contents)
+                        .into_iter()
+                        .map(|chunk| chunk_to_document(&uri, chunk).into())
+                        .collect();
+                    documents.extend(chunks);
                     // If we have over 100 mega bytes of data do the upsert
                     if current_bytes >= 100_000_000
                         || total_bytes as u64 >= config.max_crawl_memory {
-                        // Prepare our chunks
-                        let to_upsert_documents: Vec<pgml::types::Json> =
-                            std::mem::take(&mut documents)
-                                .into_iter()
-                                .map(|(uri, chunks)| {
-                                    chunks
-                                        .into_iter()
-                                        .map(|chunk| chunk_to_document(&uri, chunk))
-                                        .collect::<Vec<Value>>()
-                                })
-                                .flatten()
-                                .map(|f: Value| f.into())
-                                .collect();
-                        // Do the upsert
+                        // Upsert the documents
                         let mut collection = self.collection.clone();
+                        let to_upsert_documents = std::mem::take(&mut documents);
                         TOKIO_RUNTIME.spawn(async move {
                             if let Err(e) = collection
                                 .upsert_documents(to_upsert_documents, None)
@@ -309,6 +302,19 @@
                     }
                     Ok(true)
                 })?;
+            // Upsert any remaining documents
+            if documents.len() > 0 {
+                let mut collection = self.collection.clone();
+                TOKIO_RUNTIME.spawn(async move {
+                    if let Err(e) = collection
+                        .upsert_documents(documents, None)
+                        .await
+                        .context("PGML - Error upserting changed files")
+                    {
+                        error!("{e}");
+                    }
+                });
+            }
         }
         Ok(())
     }
diff --git a/crates/lsp-ai/src/splitters/mod.rs b/crates/lsp-ai/src/splitters/mod.rs
index ed5c15a..8e310f0 100644
--- a/crates/lsp-ai/src/splitters/mod.rs
+++ b/crates/lsp-ai/src/splitters/mod.rs
@@ -2,6 +2,7 @@ use serde::Serialize;
 
 use crate::{config::ValidSplitter, memory_backends::file_store::File};
 
+mod text_splitter;
 mod tree_sitter;
 
 #[derive(Serialize)]
@@ -48,6 +49,9 @@ impl TryFrom<ValidSplitter> for Box<dyn Splitter> {
             ValidSplitter::TreeSitter(config) => {
                 Ok(Box::new(tree_sitter::TreeSitter::new(config)?))
             }
+            ValidSplitter::TextSplitter(config) => {
+                Ok(Box::new(text_splitter::TextSplitter::new(config)))
+            }
         }
     }
 }
diff --git a/crates/lsp-ai/src/splitters/text_splitter.rs b/crates/lsp-ai/src/splitters/text_splitter.rs
new file mode 100644
index 0000000..d4ad404
--- /dev/null
+++ b/crates/lsp-ai/src/splitters/text_splitter.rs
@@ -0,0 +1,40 @@
+use crate::{config, memory_backends::file_store::File};
+
+use super::{ByteRange, Chunk, Splitter};
+
+pub struct TextSplitter {
+    splitter: text_splitter::TextSplitter<text_splitter::Characters>,
+}
+
+impl TextSplitter {
+    pub fn new(config: config::TextSplitter) -> Self {
+        Self {
+            splitter: text_splitter::TextSplitter::new(config.chunk_size),
+        }
+    }
+
+    pub fn new_with_chunk_size(chunk_size: usize) -> Self {
+        Self {
+            splitter: text_splitter::TextSplitter::new(chunk_size),
+        }
+    }
+}
+
+impl Splitter for TextSplitter {
+    fn split(&self, file: &File) -> Vec<Chunk> {
+        self.split_file_contents("", &file.rope().to_string())
+    }
+
+    fn split_file_contents(&self, _uri: &str, contents: &str) -> Vec<Chunk> {
+        self.splitter
+            .chunk_indices(contents)
+            .fold(vec![], |mut acc, (start_byte, text)| {
+                let end_byte = start_byte + text.len();
+                acc.push(Chunk::new(
+                    text.to_string(),
+                    ByteRange::new(start_byte, end_byte),
+                ));
+                acc
+            })
+    }
+}
diff --git a/crates/lsp-ai/src/splitters/tree_sitter.rs b/crates/lsp-ai/src/splitters/tree_sitter.rs
index ce44185..a804d86 100644
--- a/crates/lsp-ai/src/splitters/tree_sitter.rs
+++ b/crates/lsp-ai/src/splitters/tree_sitter.rs
@@ -4,18 +4,19 @@ use tree_sitter::Tree;
 
 use crate::{config, memory_backends::file_store::File, utils::parse_tree};
 
-use super::{ByteRange, Chunk, Splitter};
+use super::{text_splitter::TextSplitter, ByteRange, Chunk, Splitter};
 
 pub struct TreeSitter {
-    _config: config::TreeSitter,
     splitter: TreeSitterCodeSplitter,
+    text_splitter: TextSplitter,
 }
 
 impl TreeSitter {
     pub fn new(config: config::TreeSitter) -> anyhow::Result<Self> {
+        let text_splitter = TextSplitter::new_with_chunk_size(config.chunk_size);
         Ok(Self {
             splitter: TreeSitterCodeSplitter::new(config.chunk_size, config.chunk_overlap)?,
-            _config: config,
+            text_splitter,
         })
     }
 
@@ -43,11 +44,11 @@
                     error!(
                         "Failed to parse tree for file with error: {e:?}. Falling back to default splitter.",
                     );
-                    todo!()
+                    self.text_splitter.split(file)
                 }
             }
         } else {
-            panic!("TreeSitter splitter requires a tree to split")
+            self.text_splitter.split(file)
         }
     }
 
@@ -59,14 +60,14 @@
                     error!(
                         "Failed to parse tree for file: {uri} with error: {e:?}. Falling back to default splitter.",
                     );
-                    todo!()
+                    self.text_splitter.split_file_contents(uri, contents)
                 }
             },
             Err(e) => {
                 error!(
                     "Failed to parse tree for file {uri} with error: {e:?}. Falling back to default splitter.",
                 );
-                todo!()
+                self.text_splitter.split_file_contents(uri, contents)
             }
         }
     }
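
---

A few reviewer notes with sketches follow; none of this is part of the commit.

The serde renames on ValidSplitter define the user-facing config shape: the enum is externally tagged, so the new splitter is selected with a {"text_splitter": {"chunk_size": ...}} object, with chunk_size falling back to chunk_size_default when omitted. A minimal deserialization sketch of that shape (the types below are simplified stand-ins for the config.rs definitions, and the chunk_size value is illustrative):

use serde::Deserialize;

// Simplified stand-in for config::TextSplitter.
#[derive(Debug, Deserialize)]
struct TextSplitterConfig {
    chunk_size: usize,
}

// Externally tagged like config::ValidSplitter: the variant name is the
// single top-level key in the config object.
#[derive(Debug, Deserialize)]
enum ValidSplitter {
    #[serde(rename = "text_splitter")]
    TextSplitter(TextSplitterConfig),
}

fn main() {
    let splitter: ValidSplitter =
        serde_json::from_str(r#"{ "text_splitter": { "chunk_size": 1500 } }"#).unwrap();
    println!("{splitter:?}");
}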
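In maybe_do_crawl, chunks are now mapped into pgml documents as files are crawled: a partial batch is upserted once roughly 100 MB has accumulated, and a trailing upsert after the crawl picks up whatever remains. One thing worth double-checking: current_bytes is never reset after a partial upsert, so once it crosses the threshold every subsequent file triggers its own upsert. A sketch of the accumulate-and-flush shape with the reset included (crawl_and_flush and flush are hypothetical stand-ins for the crawl closure and collection.upsert_documents, not the patch's types):

// Hypothetical stand-in for collection.upsert_documents(batch, None).
fn flush(batch: Vec<String>) {
    println!("upserting {} documents", batch.len());
}

fn crawl_and_flush(files: impl IntoIterator<Item = String>, threshold_bytes: usize) {
    let mut documents = Vec::new();
    let mut current_bytes = 0;
    for contents in files {
        current_bytes += contents.len();
        documents.push(contents);
        // Flush a partial batch when the running byte count crosses the
        // threshold (the patch uses 100_000_000), then reset the counter.
        if current_bytes >= threshold_bytes {
            flush(std::mem::take(&mut documents));
            current_bytes = 0;
        }
    }
    // Trailing flush for the remainder, mirroring the patch's final upsert.
    if !documents.is_empty() {
        flush(documents);
    }
}

fn main() {
    crawl_and_flush(vec!["a".repeat(40), "b".repeat(40), "c".repeat(10)], 64);
}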
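The new TextSplitter turns each (byte offset, chunk text) pair from text-splitter's chunk_indices into a Chunk with an explicit ByteRange. Because chunk_indices reports byte offsets into the original string, start_byte + text.len() is the correct exclusive end byte even for multi-byte UTF-8 input. A standalone sketch of that pattern, assuming the text-splitter 0.13 API the patch itself calls (TextSplitter::new with a character capacity, chunk_indices yielding (usize, &str) pairs); the sample input and chunk size are illustrative:

fn main() {
    let splitter = text_splitter::TextSplitter::new(16);
    let contents = "fn add(a: u32, b: u32) -> u32 {\n    a + b\n}\n";
    for (start_byte, text) in splitter.chunk_indices(contents) {
        // Recover the exclusive end byte from the chunk's length, exactly
        // as the new split_file_contents fold does.
        let end_byte = start_byte + text.len();
        println!("bytes {start_byte}..{end_byte}: {text:?}");
    }
}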