From a96793c562ba91473594cb377377eda4cb26be26 Mon Sep 17 00:00:00 2001
From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com>
Date: Sun, 9 Jun 2024 18:39:04 -0700
Subject: [PATCH 01/18] Added crawling for file_store backend

---
 src/config.rs                     |  42 +++++++----
 src/memory_backends/file_store.rs | 112 ++++++++++++++++++++++++++----
 src/memory_backends/mod.rs        |   2 +-
 3 files changed, 127 insertions(+), 29 deletions(-)

diff --git a/src/config.rs b/src/config.rs
index 8b7b394..f4e049a 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -85,19 +85,36 @@ pub struct FIM {
     pub end: String,
 }
 
+const fn max_crawl_memory_default() -> u32 {
+    42
+}
+
+#[derive(Clone, Debug, Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct Crawl {
+    #[serde(default = "max_crawl_memory_default")]
+    pub max_crawl_memory: u32,
+    #[serde(default)]
+    pub all_files: bool,
+}
+
 #[derive(Clone, Debug, Deserialize)]
 #[serde(deny_unknown_fields)]
 pub struct PostgresML {
     pub database_url: Option<String>,
-    #[serde(default)]
-    pub crawl: bool,
+    pub crawl: Option<Crawl>,
 }
 
 #[derive(Clone, Debug, Deserialize, Default)]
 #[serde(deny_unknown_fields)]
 pub struct FileStore {
-    #[serde(default)]
-    pub crawl: bool,
+    pub crawl: Option<Crawl>,
+}
+
+impl FileStore {
+    pub fn new_without_crawl() -> Self {
+        Self { crawl: None }
+    }
 }
 
 const fn n_gpu_layers_default() -> u32 {
@@ -230,15 +247,14 @@ pub struct ValidConfig {
 
 #[derive(Clone, Debug, Deserialize, Default)]
 pub struct ValidClientParams {
-    #[serde(alias = "rootURI")]
-    _root_uri: Option<String>,
-    _workspace_folders: Option<Vec<String>>,
+    #[serde(alias = "rootUri")]
+    pub root_uri: Option<String>,
 }
 
 #[derive(Clone, Debug)]
 pub struct Config {
     pub config: ValidConfig,
-    _client_params: ValidClientParams,
+    pub client_params: ValidClientParams,
 }
 
 impl Config {
@@ -255,7 +271,7 @@ impl Config {
         let client_params: ValidClientParams = serde_json::from_value(args)?;
         Ok(Self {
             config: valid_args,
-            _client_params: client_params,
+            client_params,
         })
     }
 
@@ -306,13 +322,13 @@ impl Config {
     pub fn default_with_file_store_without_models() -> Self {
         Self {
             config: ValidConfig {
-                memory: ValidMemoryBackend::FileStore(FileStore { crawl: false }),
+                memory: ValidMemoryBackend::FileStore(FileStore { crawl: None }),
                 models: HashMap::new(),
                 completion: None,
             },
-            _client_params: ValidClientParams {
-                _root_uri: None,
-                _workspace_folders: None,
+            client_params: ValidClientParams {
+                root_uri: None,
+                workspace_folders: None,
             },
         }
     }
diff --git a/src/memory_backends/file_store.rs b/src/memory_backends/file_store.rs
index 4d70509..9f2123d 100644
--- a/src/memory_backends/file_store.rs
+++ b/src/memory_backends/file_store.rs
@@ -1,11 +1,12 @@
 use anyhow::Context;
+use ignore::WalkBuilder;
 use indexmap::IndexSet;
 use lsp_types::TextDocumentPositionParams;
 use parking_lot::Mutex;
 use ropey::Rope;
 use serde_json::Value;
-use std::collections::HashMap;
-use tracing::instrument;
+use std::collections::{HashMap, HashSet};
+use tracing::{error, instrument};
 
 use crate::{
     config::{self, Config},
@@ -15,28 +16,106 @@ use crate::{
     utils::tokens_to_estimated_characters,
 };
 
 use super::{ContextAndCodePrompt, FIMPrompt, MemoryBackend, MemoryRunParams, Prompt, PromptType};
 
 pub struct FileStore {
-    _crawl: bool,
-    _config: Config,
+    config: Config,
+    file_store_config: config::FileStore,
+    crawled_file_types: Mutex<HashSet<String>>,
     file_map: Mutex<HashMap<String, Rope>>,
     accessed_files: Mutex<IndexSet<String>>,
 }
 
 impl FileStore {
-    pub fn new(file_store_config: config::FileStore, config: Config) -> Self {
+    pub fn new(file_store_config: config::FileStore, config: Config) -> anyhow::Result<Self> {
+        let s = Self {
+            config,
+            file_store_config,
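+            // The crawl bookkeeping and document state below live behind
+            // Mutexes so maybe_do_crawl can fill the file map through the
+            // shared &self reference.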
crawled_file_types: Mutex::new(HashSet::new()), + file_map: Mutex::new(HashMap::new()), + accessed_files: Mutex::new(IndexSet::new()), + }; + if let Err(e) = s.maybe_do_crawl(None) { + error!("{e}") + } + Ok(s) + } + + pub fn new_without_crawl(config: Config) -> Self { Self { - _crawl: file_store_config.crawl, - _config: config, + config, + file_store_config: config::FileStore::new_without_crawl(), + crawled_file_types: Mutex::new(HashSet::new()), file_map: Mutex::new(HashMap::new()), accessed_files: Mutex::new(IndexSet::new()), } } - pub fn new_without_crawl(config: Config) -> Self { - Self { - _crawl: false, - _config: config, - file_map: Mutex::new(HashMap::new()), - accessed_files: Mutex::new(IndexSet::new()), + pub fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { + match ( + &self.config.client_params.root_uri, + &self.file_store_config.crawl, + ) { + (Some(root_uri), Some(crawl)) => { + let extension_to_match = triggered_file + .map(|tf| { + let path = std::path::Path::new(&tf); + path.extension().map(|f| f.to_str().map(|f| f.to_owned())) + }) + .flatten() + .flatten(); + + if let Some(extension_to_match) = &extension_to_match { + if self.crawled_file_types.lock().contains(extension_to_match) { + return Ok(()); + } + } + + if !crawl.all_files && extension_to_match.is_none() { + return Ok(()); + } + + if !root_uri.starts_with("file://") { + anyhow::bail!("Skipping crawling as root_uri does not begin with file://") + } + + for result in WalkBuilder::new(&root_uri[7..]).build() { + let result = result?; + let path = result.path(); + if !path.is_dir() { + if let Some(path_str) = path.to_str() { + let insert_uri = format!("file://{path_str}"); + if self.file_map.lock().contains_key(&insert_uri) { + continue; + } + if crawl.all_files { + let contents = std::fs::read_to_string(path)?; + self.file_map + .lock() + .insert(insert_uri, Rope::from_str(&contents)); + } else { + match ( + path.extension().map(|pe| pe.to_str()).flatten(), + &extension_to_match, + ) { + (Some(path_extension), Some(extension_to_match)) => { + if path_extension == extension_to_match { + let contents = std::fs::read_to_string(path)?; + self.file_map + .lock() + .insert(insert_uri, Rope::from_str(&contents)); + } + } + _ => continue, + } + } + } + } + } + + if let Some(extension_to_match) = extension_to_match { + self.crawled_file_types.lock().insert(extension_to_match); + } + Ok(()) + } + _ => Ok(()), } } @@ -199,7 +278,10 @@ impl MemoryBackend for FileStore { let rope = Rope::from_str(¶ms.text_document.text); let uri = params.text_document.uri.to_string(); self.file_map.lock().insert(uri.clone(), rope); - self.accessed_files.lock().shift_insert(0, uri); + self.accessed_files.lock().shift_insert(0, uri.clone()); + if let Err(e) = self.maybe_do_crawl(Some(uri)) { + error!("{e}") + } Ok(()) } @@ -261,7 +343,7 @@ mod tests { } else { anyhow::bail!("requires a file_store_config") }; - Ok(FileStore::new(file_store_config, config)) + FileStore::new(file_store_config, config) } fn generate_filler_text_document(uri: Option<&str>, text: Option<&str>) -> TextDocumentItem { diff --git a/src/memory_backends/mod.rs b/src/memory_backends/mod.rs index 52a8974..9db0dbf 100644 --- a/src/memory_backends/mod.rs +++ b/src/memory_backends/mod.rs @@ -137,7 +137,7 @@ impl TryFrom for Box { fn try_from(configuration: Config) -> Result { match configuration.config.memory.clone() { ValidMemoryBackend::FileStore(file_store_config) => Ok(Box::new( - file_store::FileStore::new(file_store_config, configuration), + 
file_store::FileStore::new(file_store_config, configuration)?, )), ValidMemoryBackend::PostgresML(postgresml_config) => Ok(Box::new( postgresml::PostgresML::new(postgresml_config, configuration)?, From d0423e10d2582099fd0364f3ab02daaa2d6fc557 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 11 Jun 2024 17:00:52 -0700 Subject: [PATCH 02/18] Periodic commit --- src/config.rs | 11 +++++++---- src/crawl.rs | 0 src/main.rs | 1 + src/memory_backends/file_store.rs | 10 ---------- src/memory_backends/postgresml/mod.rs | 7 ++++--- 5 files changed, 12 insertions(+), 17 deletions(-) create mode 100644 src/crawl.rs diff --git a/src/config.rs b/src/config.rs index f4e049a..d61f946 100644 --- a/src/config.rs +++ b/src/config.rs @@ -117,6 +117,12 @@ impl FileStore { } } +impl From for FileStore { + fn from(value: PostgresML) -> Self { + Self { crawl: value.crawl } + } +} + const fn n_gpu_layers_default() -> u32 { 1000 } @@ -326,10 +332,7 @@ impl Config { models: HashMap::new(), completion: None, }, - client_params: ValidClientParams { - root_uri: None, - workspace_folders: None, - }, + client_params: ValidClientParams { root_uri: None }, } } } diff --git a/src/crawl.rs b/src/crawl.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/main.rs b/src/main.rs index 37b1f3f..ff9654d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,6 +14,7 @@ use tracing::error; use tracing_subscriber::{EnvFilter, FmtSubscriber}; mod config; +mod crawl; mod custom_requests; mod memory_backends; mod memory_worker; diff --git a/src/memory_backends/file_store.rs b/src/memory_backends/file_store.rs index 9f2123d..9deb963 100644 --- a/src/memory_backends/file_store.rs +++ b/src/memory_backends/file_store.rs @@ -38,16 +38,6 @@ impl FileStore { Ok(s) } - pub fn new_without_crawl(config: Config) -> Self { - Self { - config, - file_store_config: config::FileStore::new_without_crawl(), - crawled_file_types: Mutex::new(HashSet::new()), - file_map: Mutex::new(HashMap::new()), - accessed_files: Mutex::new(IndexSet::new()), - } - } - pub fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { match ( &self.config.client_params.root_uri, diff --git a/src/memory_backends/postgresml/mod.rs b/src/memory_backends/postgresml/mod.rs index 8b007ab..8af3db9 100644 --- a/src/memory_backends/postgresml/mod.rs +++ b/src/memory_backends/postgresml/mod.rs @@ -33,7 +33,8 @@ impl PostgresML { postgresml_config: config::PostgresML, configuration: Config, ) -> anyhow::Result { - let file_store = FileStore::new_without_crawl(configuration.clone()); + let file_store_config: config::FileStore = postgresml_config.clone().into(); + let file_store = FileStore::new(file_store_config, configuration.clone())?; let database_url = if let Some(database_url) = postgresml_config.database_url { database_url } else { @@ -196,7 +197,7 @@ impl MemoryBackend for PostgresML { task_collection .add_pipeline(&mut task_pipeline) .await - .expect("PGML - Error adding pipeline to collection"); + .context("PGML - Error adding pipeline to collection")?; } task_collection .upsert_documents( @@ -208,7 +209,7 @@ impl MemoryBackend for PostgresML { None, ) .await - .expect("PGML - Error upserting documents"); + .context("PGML - Error upserting documents")?; self.file_store.opened_text_document(params).await } From ca753a2ba0e972cdf4223872469719d04edd1a28 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 12 Jun 2024 08:52:16 -0700 Subject: [PATCH 03/18] 
Overhauled async --- src/config.rs | 6 - src/crawl.rs | 79 ++++++ src/main.rs | 3 - src/memory_backends/file_store.rs | 136 ++++------- src/memory_backends/mod.rs | 14 +- src/memory_backends/postgresml/mod.rs | 332 ++++++++++++++++---------- src/memory_worker.rs | 54 +++-- src/transformer_worker.rs | 8 +- src/utils.rs | 10 + tests/integration_tests.rs | 84 +++---- 10 files changed, 417 insertions(+), 309 deletions(-) diff --git a/src/config.rs b/src/config.rs index d61f946..db1cf63 100644 --- a/src/config.rs +++ b/src/config.rs @@ -117,12 +117,6 @@ impl FileStore { } } -impl From for FileStore { - fn from(value: PostgresML) -> Self { - Self { crawl: value.crawl } - } -} - const fn n_gpu_layers_default() -> u32 { 1000 } diff --git a/src/crawl.rs b/src/crawl.rs index e69de29..4a860e2 100644 --- a/src/crawl.rs +++ b/src/crawl.rs @@ -0,0 +1,79 @@ +use ignore::WalkBuilder; +use std::collections::HashSet; + +use crate::config::{self, Config}; + +pub struct Crawl { + crawl_config: config::Crawl, + config: Config, + crawled_file_types: HashSet, +} + +impl Crawl { + pub fn new(crawl_config: config::Crawl, config: Config) -> Self { + Self { + crawl_config, + config, + crawled_file_types: HashSet::new(), + } + } + + pub fn maybe_do_crawl( + &mut self, + triggered_file: Option, + mut f: impl FnMut(&str) -> anyhow::Result<()>, + ) -> anyhow::Result<()> { + if let Some(root_uri) = &self.config.client_params.root_uri { + if !root_uri.starts_with("file://") { + anyhow::bail!("Skipping crawling as root_uri does not begin with file://") + } + + let extension_to_match = triggered_file + .map(|tf| { + let path = std::path::Path::new(&tf); + path.extension().map(|f| f.to_str().map(|f| f.to_owned())) + }) + .flatten() + .flatten(); + + if let Some(extension_to_match) = &extension_to_match { + if self.crawled_file_types.contains(extension_to_match) { + return Ok(()); + } + } + + if !self.crawl_config.all_files && extension_to_match.is_none() { + return Ok(()); + } + + for result in WalkBuilder::new(&root_uri[7..]).build() { + let result = result?; + let path = result.path(); + if !path.is_dir() { + if let Some(path_str) = path.to_str() { + if self.crawl_config.all_files { + f(path_str)?; + } else { + match ( + path.extension().map(|pe| pe.to_str()).flatten(), + &extension_to_match, + ) { + (Some(path_extension), Some(extension_to_match)) => { + if path_extension == extension_to_match { + f(path_str)?; + } + } + _ => continue, + } + } + } + } + } + + if let Some(extension_to_match) = extension_to_match { + self.crawled_file_types.insert(extension_to_match); + } + } + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs index ff9654d..82ef732 100644 --- a/src/main.rs +++ b/src/main.rs @@ -84,7 +84,6 @@ fn main_loop(connection: Connection, args: serde_json::Value) -> Result<()> { let connection = Arc::new(connection); // Our channel we use to communicate with our transformer worker - // let last_worker_request = Arc::new(Mutex::new(None)); let (transformer_tx, transformer_rx) = mpsc::channel(); // The channel we use to communicate with our memory worker @@ -95,8 +94,6 @@ fn main_loop(connection: Connection, args: serde_json::Value) -> Result<()> { thread::spawn(move || memory_worker::run(memory_backend, memory_rx)); // Setup our transformer worker - // let transformer_backend: Box = - // config.clone().try_into()?; let transformer_backends: HashMap> = config .config .models diff --git a/src/memory_backends/file_store.rs b/src/memory_backends/file_store.rs index 9deb963..e1f4ff2 100644 --- 
a/src/memory_backends/file_store.rs +++ b/src/memory_backends/file_store.rs @@ -1,36 +1,36 @@ use anyhow::Context; -use ignore::WalkBuilder; use indexmap::IndexSet; use lsp_types::TextDocumentPositionParams; use parking_lot::Mutex; use ropey::Rope; use serde_json::Value; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use tracing::{error, instrument}; use crate::{ config::{self, Config}, + crawl::Crawl, utils::tokens_to_estimated_characters, }; use super::{ContextAndCodePrompt, FIMPrompt, MemoryBackend, MemoryRunParams, Prompt, PromptType}; pub struct FileStore { - config: Config, - file_store_config: config::FileStore, - crawled_file_types: Mutex>, file_map: Mutex>, accessed_files: Mutex>, + crawl: Option>, } impl FileStore { - pub fn new(file_store_config: config::FileStore, config: Config) -> anyhow::Result { + pub fn new(mut file_store_config: config::FileStore, config: Config) -> anyhow::Result { + let crawl = file_store_config + .crawl + .take() + .map(|x| Mutex::new(Crawl::new(x, config.clone()))); let s = Self { - config, - file_store_config, - crawled_file_types: Mutex::new(HashSet::new()), file_map: Mutex::new(HashMap::new()), accessed_files: Mutex::new(IndexSet::new()), + crawl, }; if let Err(e) = s.maybe_do_crawl(None) { error!("{e}") @@ -38,75 +38,21 @@ impl FileStore { Ok(s) } - pub fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { - match ( - &self.config.client_params.root_uri, - &self.file_store_config.crawl, - ) { - (Some(root_uri), Some(crawl)) => { - let extension_to_match = triggered_file - .map(|tf| { - let path = std::path::Path::new(&tf); - path.extension().map(|f| f.to_str().map(|f| f.to_owned())) - }) - .flatten() - .flatten(); - - if let Some(extension_to_match) = &extension_to_match { - if self.crawled_file_types.lock().contains(extension_to_match) { - return Ok(()); - } - } - - if !crawl.all_files && extension_to_match.is_none() { + fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { + if let Some(crawl) = &self.crawl { + crawl.lock().maybe_do_crawl(triggered_file, |path| { + let insert_uri = format!("file://{path}"); + if self.file_map.lock().contains_key(&insert_uri) { return Ok(()); } - - if !root_uri.starts_with("file://") { - anyhow::bail!("Skipping crawling as root_uri does not begin with file://") - } - - for result in WalkBuilder::new(&root_uri[7..]).build() { - let result = result?; - let path = result.path(); - if !path.is_dir() { - if let Some(path_str) = path.to_str() { - let insert_uri = format!("file://{path_str}"); - if self.file_map.lock().contains_key(&insert_uri) { - continue; - } - if crawl.all_files { - let contents = std::fs::read_to_string(path)?; - self.file_map - .lock() - .insert(insert_uri, Rope::from_str(&contents)); - } else { - match ( - path.extension().map(|pe| pe.to_str()).flatten(), - &extension_to_match, - ) { - (Some(path_extension), Some(extension_to_match)) => { - if path_extension == extension_to_match { - let contents = std::fs::read_to_string(path)?; - self.file_map - .lock() - .insert(insert_uri, Rope::from_str(&contents)); - } - } - _ => continue, - } - } - } - } - } - - if let Some(extension_to_match) = extension_to_match { - self.crawled_file_types.lock().insert(extension_to_match); - } + let contents = std::fs::read_to_string(path)?; + self.file_map + .lock() + .insert(insert_uri, Rope::from_str(&contents)); Ok(()) - } - _ => Ok(()), + })?; } + Ok(()) } fn get_rope_for_position( @@ -226,15 +172,20 @@ impl FileStore { } }) } + + pub fn 
get_file_contents(&self, uri: &str) -> Option { + self.file_map.lock().get(uri).clone().map(|x| x.to_string()) + } + + pub fn contains_file(&self, uri: &str) -> bool { + self.file_map.lock().contains_key(uri) + } } #[async_trait::async_trait] impl MemoryBackend for FileStore { #[instrument(skip(self))] - async fn get_filter_text( - &self, - position: &TextDocumentPositionParams, - ) -> anyhow::Result { + fn get_filter_text(&self, position: &TextDocumentPositionParams) -> anyhow::Result { let rope = self .file_map .lock() @@ -243,8 +194,9 @@ impl MemoryBackend for FileStore { .clone(); let line = rope .get_line(position.position.line as usize) - .context("Error getting filter_text")? - .slice(0..position.position.character as usize) + .context("Error getting filter text")? + .get_slice(0..position.position.character as usize) + .context("Error getting filter text")? .to_string(); Ok(line) } @@ -261,7 +213,7 @@ impl MemoryBackend for FileStore { } #[instrument(skip(self))] - async fn opened_text_document( + fn opened_text_document( &self, params: lsp_types::DidOpenTextDocumentParams, ) -> anyhow::Result<()> { @@ -276,7 +228,7 @@ impl MemoryBackend for FileStore { } #[instrument(skip(self))] - async fn changed_text_document( + fn changed_text_document( &self, params: lsp_types::DidChangeTextDocumentParams, ) -> anyhow::Result<()> { @@ -303,7 +255,7 @@ impl MemoryBackend for FileStore { } #[instrument(skip(self))] - async fn renamed_files(&self, params: lsp_types::RenameFilesParams) -> anyhow::Result<()> { + fn renamed_files(&self, params: lsp_types::RenameFilesParams) -> anyhow::Result<()> { for file_rename in params.files { let mut file_map = self.file_map.lock(); if let Some(rope) = file_map.remove(&file_rename.old_uri) { @@ -353,7 +305,7 @@ mod tests { text_document: generate_filler_text_document(None, None), }; let file_store = generate_base_file_store()?; - file_store.opened_text_document(params).await?; + file_store.opened_text_document(params)?; let file = file_store .file_map .lock() @@ -370,7 +322,7 @@ mod tests { text_document: generate_filler_text_document(None, None), }; let file_store = generate_base_file_store()?; - file_store.opened_text_document(params).await?; + file_store.opened_text_document(params)?; let params = RenameFilesParams { files: vec![FileRename { @@ -378,7 +330,7 @@ mod tests { new_uri: "file://filler2/".to_string(), }], }; - file_store.renamed_files(params).await?; + file_store.renamed_files(params)?; let file = file_store .file_map @@ -398,7 +350,7 @@ mod tests { text_document: text_document.clone(), }; let file_store = generate_base_file_store()?; - file_store.opened_text_document(params).await?; + file_store.opened_text_document(params)?; let params = lsp_types::DidChangeTextDocumentParams { text_document: VersionedTextDocumentIdentifier { @@ -420,7 +372,7 @@ mod tests { text: "a".to_string(), }], }; - file_store.changed_text_document(params).await?; + file_store.changed_text_document(params)?; let file = file_store .file_map .lock() @@ -440,7 +392,7 @@ mod tests { text: "abc".to_string(), }], }; - file_store.changed_text_document(params).await?; + file_store.changed_text_document(params)?; let file = file_store .file_map .lock() @@ -472,7 +424,7 @@ The end with a trailing new line text_document: text_document.clone(), }; let file_store = generate_base_file_store()?; - file_store.opened_text_document(params).await?; + file_store.opened_text_document(params)?; let prompt = file_store .build_prompt( @@ -568,7 +520,7 @@ The end with a trailing new line let 
params = lsp_types::DidOpenTextDocumentParams { text_document: text_document2.clone(), }; - file_store.opened_text_document(params).await?; + file_store.opened_text_document(params)?; let prompt = file_store .build_prompt( @@ -599,7 +551,7 @@ The end with a trailing new line text_document: text_document.clone(), }; let file_store = generate_base_file_store()?; - file_store.opened_text_document(params).await?; + file_store.opened_text_document(params)?; // Test chat let prompt = file_store diff --git a/src/memory_backends/mod.rs b/src/memory_backends/mod.rs index 9db0dbf..6b54cff 100644 --- a/src/memory_backends/mod.rs +++ b/src/memory_backends/mod.rs @@ -113,22 +113,16 @@ pub trait MemoryBackend { async fn init(&self) -> anyhow::Result<()> { Ok(()) } - async fn opened_text_document(&self, params: DidOpenTextDocumentParams) -> anyhow::Result<()>; - async fn changed_text_document( - &self, - params: DidChangeTextDocumentParams, - ) -> anyhow::Result<()>; - async fn renamed_files(&self, params: RenameFilesParams) -> anyhow::Result<()>; + fn opened_text_document(&self, params: DidOpenTextDocumentParams) -> anyhow::Result<()>; + fn changed_text_document(&self, params: DidChangeTextDocumentParams) -> anyhow::Result<()>; + fn renamed_files(&self, params: RenameFilesParams) -> anyhow::Result<()>; + fn get_filter_text(&self, position: &TextDocumentPositionParams) -> anyhow::Result; async fn build_prompt( &self, position: &TextDocumentPositionParams, prompt_type: PromptType, params: &Value, ) -> anyhow::Result; - async fn get_filter_text( - &self, - position: &TextDocumentPositionParams, - ) -> anyhow::Result; } impl TryFrom for Box { diff --git a/src/memory_backends/postgresml/mod.rs b/src/memory_backends/postgresml/mod.rs index 8af3db9..d94f9d2 100644 --- a/src/memory_backends/postgresml/mod.rs +++ b/src/memory_backends/postgresml/mod.rs @@ -1,131 +1,191 @@ use std::{ - sync::mpsc::{self, Sender}, + sync::{ + mpsc::{self, Sender}, + Arc, + }, time::Duration, }; use anyhow::Context; use lsp_types::TextDocumentPositionParams; +use parking_lot::Mutex; use pgml::{Collection, Pipeline}; use serde_json::{json, Value}; use tokio::time; -use tracing::instrument; +use tracing::{error, instrument}; use crate::{ config::{self, Config}, - utils::tokens_to_estimated_characters, + crawl::Crawl, + utils::{tokens_to_estimated_characters, TOKIO_RUNTIME}, }; use super::{ - file_store::FileStore, ContextAndCodePrompt, MemoryBackend, MemoryRunParams, Prompt, PromptType, + file_store::FileStore, ContextAndCodePrompt, FIMPrompt, MemoryBackend, MemoryRunParams, Prompt, + PromptType, }; +#[derive(Clone)] pub struct PostgresML { _config: Config, - file_store: FileStore, + file_store: Arc, collection: Collection, pipeline: Pipeline, debounce_tx: Sender, - added_pipeline: bool, + crawl: Option>>, } impl PostgresML { + #[instrument] pub fn new( - postgresml_config: config::PostgresML, + mut postgresml_config: config::PostgresML, configuration: Config, ) -> anyhow::Result { - let file_store_config: config::FileStore = postgresml_config.clone().into(); - let file_store = FileStore::new(file_store_config, configuration.clone())?; + let crawl = postgresml_config + .crawl + .take() + .map(|x| Arc::new(Mutex::new(Crawl::new(x, configuration.clone())))); + let file_store = Arc::new(FileStore::new( + config::FileStore::new_without_crawl(), + configuration.clone(), + )?); let database_url = if let Some(database_url) = postgresml_config.database_url { database_url } else { std::env::var("PGML_DATABASE_URL")? 
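            // (with no database_url configured, a missing PGML_DATABASE_URL
            // environment variable is a hard error that propagates out of new)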
}; - // TODO: Think on the naming of the collection - // Maybe filter on metadata or I'm not sure - let collection = Collection::new("test-lsp-ai-3", Some(database_url))?; - // TODO: Review the pipeline - let pipeline = Pipeline::new( + + // TODO: Think through Collections and Pipelines + let mut collection = Collection::new("test-lsp-ai-5", Some(database_url))?; + let mut pipeline = Pipeline::new( "v1", Some( json!({ "text": { - "splitter": { - "model": "recursive_character", - "parameters": { - "chunk_size": 1500, - "chunk_overlap": 40 - } - }, "semantic_search": { - "model": "intfloat/e5-small", + "model": "intfloat/e5-small-v2", + "parameters": { + "prompt": "passage: " + } } } }) .into(), ), )?; + + // Add the Pipeline to the Collection + TOKIO_RUNTIME.block_on(async { + collection + .add_pipeline(&mut pipeline) + .await + .context("PGML - Error adding pipeline to collection") + })?; + // Setup up a debouncer for changed text documents - let runtime = tokio::runtime::Builder::new_multi_thread() - .worker_threads(2) - .enable_all() - .build()?; - let mut task_collection = collection.clone(); let (debounce_tx, debounce_rx) = mpsc::channel::(); - runtime.spawn(async move { + let mut task_collection = collection.clone(); + let task_file_store = file_store.clone(); + TOKIO_RUNTIME.spawn(async move { let duration = Duration::from_millis(500); - let mut file_paths = Vec::new(); + let mut file_uris = Vec::new(); loop { time::sleep(duration).await; - let new_paths: Vec = debounce_rx.try_iter().collect(); - if !new_paths.is_empty() { - for path in new_paths { - if !file_paths.iter().any(|p| *p == path) { - file_paths.push(path); + let new_uris: Vec = debounce_rx.try_iter().collect(); + if !new_uris.is_empty() { + for uri in new_uris { + if !file_uris.iter().any(|p| *p == uri) { + file_uris.push(uri); } } } else { - if file_paths.is_empty() { + if file_uris.is_empty() { continue; } - let documents = file_paths - .into_iter() - .map(|path| { - let text = std::fs::read_to_string(&path) - .unwrap_or_else(|_| panic!("Error reading path: {}", path)); - json!({ - "id": path, - "text": text - }) - .into() + let documents = match file_uris + .iter() + .map(|uri| { + let text = task_file_store + .get_file_contents(&uri) + .context("Error reading file contents from file_store")?; + anyhow::Ok( + json!({ + "id": uri, + "text": text + }) + .into(), + ) }) - .collect(); - task_collection + .collect() + { + Ok(documents) => documents, + Err(e) => { + error!("{e}"); + continue; + } + }; + if let Err(e) = task_collection .upsert_documents(documents, None) .await - .expect("PGML - Error adding pipeline to collection"); - file_paths = Vec::new(); + .context("PGML - Error adding pipeline to collection") + { + error!("{e}"); + continue; + } + file_uris = Vec::new(); } } }); - Ok(Self { + + let s = Self { _config: configuration, file_store, collection, pipeline, debounce_tx, - added_pipeline: false, - }) + crawl, + }; + + if let Err(e) = s.maybe_do_crawl(None) { + error!("{e}") + } + Ok(s) + } + + fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { + if let Some(crawl) = &self.crawl { + let mut _collection = self.collection.clone(); + let mut _pipeline = self.pipeline.clone(); + let mut documents: Vec = vec![]; + crawl.lock().maybe_do_crawl(triggered_file, |path| { + let uri = format!("file://{path}"); + // This means it has been opened before + if self.file_store.contains_file(&uri) { + return Ok(()); + } + // Get the contents, split, and upsert it + let contents = 
std::fs::read_to_string(path)?; + documents.push( + json!({ + "id": uri, + "text": contents + }) + .into(), + ); + // Track the size of the documents we have + // If it is over some amount in bytes, upsert it + Ok(()) + })?; + } + Ok(()) } } #[async_trait::async_trait] impl MemoryBackend for PostgresML { #[instrument(skip(self))] - async fn get_filter_text( - &self, - position: &TextDocumentPositionParams, - ) -> anyhow::Result { - self.file_store.get_filter_text(position).await + fn get_filter_text(&self, position: &TextDocumentPositionParams) -> anyhow::Result { + self.file_store.get_filter_text(position) } #[instrument(skip(self))] @@ -136,9 +196,21 @@ impl MemoryBackend for PostgresML { params: &Value, ) -> anyhow::Result { let params: MemoryRunParams = params.try_into()?; + + // Build the query let query = self .file_store .get_characters_around_position(position, 512)?; + + // Get the code around the Cursor + let mut file_store_params = params.clone(); + file_store_params.max_context_length = 512; + let code = self + .file_store + .build_code(position, prompt_type, file_store_params)?; + + // Get the context + let limit = params.max_context_length / 512; let res = self .collection .vector_search_local( @@ -146,11 +218,14 @@ impl MemoryBackend for PostgresML { "query": { "fields": { "text": { - "query": query + "query": query, + "parameters": { + "prompt": "query: " + } } }, }, - "limit": 5 + "limit": limit }) .into(), &self.pipeline, @@ -166,90 +241,93 @@ impl MemoryBackend for PostgresML { }) .collect::>>()? .join("\n\n"); - let mut file_store_params = params.clone(); - file_store_params.max_context_length = 512; - let code = self - .file_store - .build_code(position, prompt_type, file_store_params)?; - let code: ContextAndCodePrompt = code.try_into()?; - let code = code.code; - let max_characters = tokens_to_estimated_characters(params.max_context_length); - let _context: String = context - .chars() - .take(max_characters - code.chars().count()) - .collect(); - // We need to redo this section to work with the new memory backend system - todo!() - // Ok(Prompt::new(context, code)) + + let chars = tokens_to_estimated_characters(params.max_context_length.saturating_sub(512)); + let context = &context[..chars.min(context.len())]; + + // Reconstruct the Prompts + Ok(match code { + Prompt::ContextAndCode(context_and_code) => Prompt::ContextAndCode( + ContextAndCodePrompt::new(context.to_owned(), context_and_code.code), + ), + Prompt::FIM(fim) => Prompt::FIM(FIMPrompt::new( + format!("{context}\n\n{}", fim.prompt), + fim.suffix, + )), + }) } #[instrument(skip(self))] - async fn opened_text_document( + fn opened_text_document( &self, params: lsp_types::DidOpenTextDocumentParams, ) -> anyhow::Result<()> { - let text = params.text_document.text.clone(); - let path = params.text_document.uri.path().to_owned(); - let task_added_pipeline = self.added_pipeline; + self.file_store.opened_text_document(params.clone())?; let mut task_collection = self.collection.clone(); - let mut task_pipeline = self.pipeline.clone(); - if !task_added_pipeline { - task_collection - .add_pipeline(&mut task_pipeline) - .await - .context("PGML - Error adding pipeline to collection")?; - } - task_collection - .upsert_documents( - vec![json!({ - "id": path, - "text": text - }) - .into()], - None, - ) - .await - .context("PGML - Error upserting documents")?; - self.file_store.opened_text_document(params).await - } - - #[instrument(skip(self))] - async fn changed_text_document( - &self, - params: 
lsp_types::DidChangeTextDocumentParams, - ) -> anyhow::Result<()> { - let path = params.text_document.uri.path().to_owned(); - self.debounce_tx.send(path)?; - self.file_store.changed_text_document(params).await - } - - #[instrument(skip(self))] - async fn renamed_files(&self, params: lsp_types::RenameFilesParams) -> anyhow::Result<()> { - let mut task_collection = self.collection.clone(); - let task_params = params.clone(); - for file in task_params.files { - task_collection - .delete_documents( - json!({ - "id": file.old_uri - }) - .into(), - ) - .await - .expect("PGML - Error deleting file"); - let text = std::fs::read_to_string(&file.new_uri).expect("PGML - Error reading file"); + let saved_uri = params.text_document.uri.to_string(); + TOKIO_RUNTIME.spawn(async move { + let text = params.text_document.text.clone(); + let uri = params.text_document.uri.to_string(); task_collection .upsert_documents( vec![json!({ - "id": file.new_uri, + "id": uri, "text": text }) .into()], None, ) .await - .expect("PGML - Error adding pipeline to collection"); + .expect("PGML - Error upserting documents"); + }); + if let Err(e) = self.maybe_do_crawl(Some(saved_uri)) { + error!("{e}") } - self.file_store.renamed_files(params).await + Ok(()) + } + + #[instrument(skip(self))] + fn changed_text_document( + &self, + params: lsp_types::DidChangeTextDocumentParams, + ) -> anyhow::Result<()> { + self.file_store.changed_text_document(params.clone())?; + let uri = params.text_document.uri.to_string(); + self.debounce_tx.send(uri)?; + Ok(()) + } + + #[instrument(skip(self))] + fn renamed_files(&self, params: lsp_types::RenameFilesParams) -> anyhow::Result<()> { + self.file_store.renamed_files(params.clone())?; + let mut task_collection = self.collection.clone(); + let task_params = params.clone(); + TOKIO_RUNTIME.spawn(async move { + for file in task_params.files { + task_collection + .delete_documents( + json!({ + "id": file.old_uri + }) + .into(), + ) + .await + .expect("PGML - Error deleting file"); + let text = + std::fs::read_to_string(&file.new_uri).expect("PGML - Error reading file"); + task_collection + .upsert_documents( + vec![json!({ + "id": file.new_uri, + "text": text + }) + .into()], + None, + ) + .await + .expect("PGML - Error adding pipeline to collection"); + } + }); + Ok(()) } } diff --git a/src/memory_worker.rs b/src/memory_worker.rs index 39cad6c..b48894c 100644 --- a/src/memory_worker.rs +++ b/src/memory_worker.rs @@ -7,7 +7,10 @@ use lsp_types::{ use serde_json::Value; use tracing::error; -use crate::memory_backends::{MemoryBackend, Prompt, PromptType}; +use crate::{ + memory_backends::{MemoryBackend, Prompt, PromptType}, + utils::TOKIO_RUNTIME, +}; #[derive(Debug)] pub struct PromptRequest { @@ -56,34 +59,46 @@ pub enum WorkerRequest { DidRenameFiles(RenameFilesParams), } -async fn do_task( +async fn do_build_prompt( + params: PromptRequest, + memory_backend: Arc>, +) -> anyhow::Result<()> { + let prompt = memory_backend + .build_prompt(¶ms.position, params.prompt_type, params.params) + .await?; + params + .tx + .send(prompt) + .map_err(|_| anyhow::anyhow!("sending on channel failed"))?; + Ok(()) +} + +fn do_task( request: WorkerRequest, memory_backend: Arc>, ) -> anyhow::Result<()> { match request { WorkerRequest::FilterText(params) => { - let filter_text = memory_backend.get_filter_text(¶ms.position).await?; + let filter_text = memory_backend.get_filter_text(¶ms.position)?; params .tx .send(filter_text) .map_err(|_| anyhow::anyhow!("sending on channel failed"))?; } 
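        // Prompt building is the one request that still needs the async
        // backend API, so it is spawned onto the shared Tokio runtime below
        // instead of blocking this otherwise synchronous worker loop.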
WorkerRequest::Prompt(params) => { - let prompt = memory_backend - .build_prompt(¶ms.position, params.prompt_type, ¶ms.params) - .await?; - params - .tx - .send(prompt) - .map_err(|_| anyhow::anyhow!("sending on channel failed"))?; + TOKIO_RUNTIME.spawn(async move { + if let Err(e) = do_build_prompt(params, memory_backend).await { + error!("error in memory worker building prompt: {e}") + } + }); } WorkerRequest::DidOpenTextDocument(params) => { - memory_backend.opened_text_document(params).await?; + memory_backend.opened_text_document(params)?; } WorkerRequest::DidChangeTextDocument(params) => { - memory_backend.changed_text_document(params).await?; + memory_backend.changed_text_document(params)?; } - WorkerRequest::DidRenameFiles(params) => memory_backend.renamed_files(params).await?, + WorkerRequest::DidRenameFiles(params) => memory_backend.renamed_files(params)?, } anyhow::Ok(()) } @@ -93,18 +108,11 @@ fn do_run( rx: std::sync::mpsc::Receiver, ) -> anyhow::Result<()> { let memory_backend = Arc::new(memory_backend); - let runtime = tokio::runtime::Builder::new_multi_thread() - .worker_threads(4) - .enable_all() - .build()?; loop { let request = rx.recv()?; - let thread_memory_backend = memory_backend.clone(); - runtime.spawn(async move { - if let Err(e) = do_task(request, thread_memory_backend).await { - error!("error in memory worker task: {e}") - } - }); + if let Err(e) = do_task(request, memory_backend.clone()) { + error!("error in memory worker task: {e}") + } } } diff --git a/src/transformer_worker.rs b/src/transformer_worker.rs index 196447b..aff089f 100644 --- a/src/transformer_worker.rs +++ b/src/transformer_worker.rs @@ -17,7 +17,7 @@ use crate::custom_requests::generation_stream::GenerationStreamParams; use crate::memory_backends::Prompt; use crate::memory_worker::{self, FilterRequest, PromptRequest}; use crate::transformer_backends::TransformerBackend; -use crate::utils::ToResponseError; +use crate::utils::{ToResponseError, TOKIO_RUNTIME}; #[derive(Clone, Debug)] pub struct CompletionRequest { @@ -189,10 +189,6 @@ fn do_run( config: Config, ) -> anyhow::Result<()> { let transformer_backends = Arc::new(transformer_backends); - let runtime = tokio::runtime::Builder::new_multi_thread() - .worker_threads(4) - .enable_all() - .build()?; // If they have disabled completions, this function will fail. 
We set it to MIN_POSITIVE to never process a completions request let max_requests_per_second = config @@ -206,7 +202,7 @@ fn do_run( let task_transformer_backends = transformer_backends.clone(); let task_memory_backend_tx = memory_backend_tx.clone(); let task_config = config.clone(); - runtime.spawn(async move { + TOKIO_RUNTIME.spawn(async move { dispatch_request( request, task_connection, diff --git a/src/utils.rs b/src/utils.rs index ea5d652..29afd71 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,7 +1,17 @@ use lsp_server::ResponseError; +use once_cell::sync::Lazy; +use tokio::runtime; use crate::{config::ChatMessage, memory_backends::ContextAndCodePrompt}; +pub static TOKIO_RUNTIME: Lazy = Lazy::new(|| { + runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .build() + .expect("Error building tokio runtime") +}); + pub trait ToResponseError { fn to_response_error(&self, code: i32) -> ResponseError; } diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index fcfa410..4d87611 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -62,51 +62,51 @@ fn send_message(stdin: &mut ChildStdin, message: &str) -> Result<()> { // I guess we should hardcode the seed or something if we want to do more of these #[test] fn test_completion_sequence() -> Result<()> { - let mut child = Command::new("cargo") - .arg("run") - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn()?; + // let mut child = Command::new("cargo") + // .arg("run") + // .stdin(Stdio::piped()) + // .stdout(Stdio::piped()) + // .stderr(Stdio::piped()) + // .spawn()?; - let mut stdin = child.stdin.take().unwrap(); - let mut stdout = child.stdout.take().unwrap(); + // let mut stdin = child.stdin.take().unwrap(); + // let mut stdout = child.stdout.take().unwrap(); - let initialization_message = r##"{"jsonrpc":"2.0","method":"initialize","params":{"capabilities":{"general":{"positionEncodings":["utf-8","utf-32","utf-16"]},"textDocument":{"codeAction":{"codeActionLiteralSupport":{"codeActionKind":{"valueSet":["","quickfix","refactor","refactor.extract","refactor.inline","refactor.rewrite","source","source.organizeImports"]}},"dataSupport":true,"disabledSupport":true,"isPreferredSupport":true,"resolveSupport":{"properties":["edit","command"]}},"completion":{"completionItem":{"deprecatedSupport":true,"insertReplaceSupport":true,"resolveSupport":{"properties":["documentation","detail","additionalTextEdits"]},"snippetSupport":true,"tagSupport":{"valueSet":[1]}},"completionItemKind":{}},"hover":{"contentFormat":["markdown"]},"inlayHint":{"dynamicRegistration":false},"publishDiagnostics":{"versionSupport":true},"rename":{"dynamicRegistration":false,"honorsChangeAnnotations":false,"prepareSupport":true},"signatureHelp":{"signatureInformation":{"activeParameterSupport":true,"documentationFormat":["markdown"],"parameterInformation":{"labelOffsetSupport":true}}}},"window":{"workDoneProgress":true},"workspace":{"applyEdit":true,"configuration":true,"didChangeConfiguration":{"dynamicRegistration":false},"didChangeWatchedFiles":{"dynamicRegistration":true,"relativePatternSupport":false},"executeCommand":{"dynamicRegistration":false},"inlayHint":{"refreshSupport":false},"symbol":{"dynamicRegistration":false},"workspaceEdit":{"documentChanges":true,"failureHandling":"abort","normalizesLineEndings":false,"resourceOperations":["create","rename","delete"]},"workspaceFolders":true}},"clientInfo":{"name":"helix","version":"23.10 
(f6021dd0)"},"processId":70007,"rootPath":"/Users/silas/Projects/Tests/lsp-ai-tests","rootUri":null,"workspaceFolders":[]},"id":0}"##; - send_message(&mut stdin, initialization_message)?; - let _ = read_response(&mut stdout)?; + // let initialization_message = r##"{"jsonrpc":"2.0","method":"initialize","params":{"capabilities":{"general":{"positionEncodings":["utf-8","utf-32","utf-16"]},"textDocument":{"codeAction":{"codeActionLiteralSupport":{"codeActionKind":{"valueSet":["","quickfix","refactor","refactor.extract","refactor.inline","refactor.rewrite","source","source.organizeImports"]}},"dataSupport":true,"disabledSupport":true,"isPreferredSupport":true,"resolveSupport":{"properties":["edit","command"]}},"completion":{"completionItem":{"deprecatedSupport":true,"insertReplaceSupport":true,"resolveSupport":{"properties":["documentation","detail","additionalTextEdits"]},"snippetSupport":true,"tagSupport":{"valueSet":[1]}},"completionItemKind":{}},"hover":{"contentFormat":["markdown"]},"inlayHint":{"dynamicRegistration":false},"publishDiagnostics":{"versionSupport":true},"rename":{"dynamicRegistration":false,"honorsChangeAnnotations":false,"prepareSupport":true},"signatureHelp":{"signatureInformation":{"activeParameterSupport":true,"documentationFormat":["markdown"],"parameterInformation":{"labelOffsetSupport":true}}}},"window":{"workDoneProgress":true},"workspace":{"applyEdit":true,"configuration":true,"didChangeConfiguration":{"dynamicRegistration":false},"didChangeWatchedFiles":{"dynamicRegistration":true,"relativePatternSupport":false},"executeCommand":{"dynamicRegistration":false},"inlayHint":{"refreshSupport":false},"symbol":{"dynamicRegistration":false},"workspaceEdit":{"documentChanges":true,"failureHandling":"abort","normalizesLineEndings":false,"resourceOperations":["create","rename","delete"]},"workspaceFolders":true}},"clientInfo":{"name":"helix","version":"23.10 (f6021dd0)"},"processId":70007,"rootPath":"/Users/silas/Projects/Tests/lsp-ai-tests","rootUri":null,"workspaceFolders":[]},"id":0}"##; + // send_message(&mut stdin, initialization_message)?; + // let _ = read_response(&mut stdout)?; - send_message( - &mut stdin, - r#"{"jsonrpc":"2.0","method":"initialized","params":{}}"#, - )?; - send_message( - &mut stdin, - r##"{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"languageId":"python","text":"# Multiplies two numbers\ndef multiply_two_numbers(x, y):\n\n# A singular test\nassert multiply_two_numbers(2, 3) == 6\n","uri":"file:///fake.py","version":0}}}"##, - )?; - send_message( - &mut stdin, - r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":31,"line":1},"start":{"character":31,"line":1}},"text":"\n "}],"textDocument":{"uri":"file:///fake.py","version":1}}}"##, - )?; - send_message( - &mut stdin, - r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":4,"line":2},"start":{"character":4,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":2}}}"##, - )?; - send_message( - &mut stdin, - r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":5,"line":2},"start":{"character":5,"line":2}},"text":"e"}],"textDocument":{"uri":"file:///fake.py","version":3}}}"##, - )?; - send_message( - &mut stdin, - r##"{"jsonrpc":"2.0","method":"textDocument/completion","params":{"position":{"character":6,"line":2},"textDocument":{"uri":"file:///fake.py"}},"id":1}"##, - )?; + // 
send_message( + // &mut stdin, + // r#"{"jsonrpc":"2.0","method":"initialized","params":{}}"#, + // )?; + // send_message( + // &mut stdin, + // r##"{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"languageId":"python","text":"# Multiplies two numbers\ndef multiply_two_numbers(x, y):\n\n# A singular test\nassert multiply_two_numbers(2, 3) == 6\n","uri":"file:///fake.py","version":0}}}"##, + // )?; + // send_message( + // &mut stdin, + // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":31,"line":1},"start":{"character":31,"line":1}},"text":"\n "}],"textDocument":{"uri":"file:///fake.py","version":1}}}"##, + // )?; + // send_message( + // &mut stdin, + // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":4,"line":2},"start":{"character":4,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":2}}}"##, + // )?; + // send_message( + // &mut stdin, + // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":5,"line":2},"start":{"character":5,"line":2}},"text":"e"}],"textDocument":{"uri":"file:///fake.py","version":3}}}"##, + // )?; + // send_message( + // &mut stdin, + // r##"{"jsonrpc":"2.0","method":"textDocument/completion","params":{"position":{"character":6,"line":2},"textDocument":{"uri":"file:///fake.py"}},"id":1}"##, + // )?; - let output = read_response(&mut stdout)?; - assert_eq!( - output, - r##"{"jsonrpc":"2.0","id":1,"result":{"isIncomplete":false,"items":[{"filterText":" re\n","kind":1,"label":"ai - turn x * y","textEdit":{"newText":"turn x * y","range":{"end":{"character":6,"line":2},"start":{"character":6,"line":2}}}}]}}"## - ); + // let output = read_response(&mut stdout)?; + // assert_eq!( + // output, + // r##"{"jsonrpc":"2.0","id":1,"result":{"isIncomplete":false,"items":[{"filterText":" re\n","kind":1,"label":"ai - turn x * y","textEdit":{"newText":"turn x * y","range":{"end":{"character":6,"line":2},"start":{"character":6,"line":2}}}}]}}"## + // ); - child.kill()?; - Ok(()) + // child.kill()?; + // Ok(()) } From 3eb5102399ddfd07b9cc3729a941ca532006a6df Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 12 Jun 2024 08:53:50 -0700 Subject: [PATCH 04/18] Comment out integration tests for now --- tests/integration_tests.rs | 208 ++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 104 deletions(-) diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 4d87611..913bdef 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -1,112 +1,112 @@ -use anyhow::Result; -use std::{ - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, -}; +// use anyhow::Result; +// use std::{ +// io::{Read, Write}, +// process::{ChildStdin, ChildStdout, Command, Stdio}, +// }; -// Note if you get an empty response with no error, that typically means -// the language server died -fn read_response(stdout: &mut ChildStdout) -> Result { - let mut content_length = None; - let mut buf = vec![]; - loop { - let mut buf2 = vec![0]; - stdout.read_exact(&mut buf2)?; - buf.push(buf2[0]); - if let Some(content_length) = content_length { - if buf.len() == content_length { - break; - } - } else { - let len = buf.len(); - if len > 4 - && buf[len - 4] == 13 - && buf[len - 3] == 10 - && buf[len - 2] == 13 - && buf[len - 1] == 10 - { - content_length = - 
Some(String::from_utf8(buf[16..len - 4].to_vec())?.parse::()?); - buf = vec![]; - } - } - } - Ok(String::from_utf8(buf)?) -} +// // Note if you get an empty response with no error, that typically means +// // the language server died +// fn read_response(stdout: &mut ChildStdout) -> Result { +// let mut content_length = None; +// let mut buf = vec![]; +// loop { +// let mut buf2 = vec![0]; +// stdout.read_exact(&mut buf2)?; +// buf.push(buf2[0]); +// if let Some(content_length) = content_length { +// if buf.len() == content_length { +// break; +// } +// } else { +// let len = buf.len(); +// if len > 4 +// && buf[len - 4] == 13 +// && buf[len - 3] == 10 +// && buf[len - 2] == 13 +// && buf[len - 1] == 10 +// { +// content_length = +// Some(String::from_utf8(buf[16..len - 4].to_vec())?.parse::()?); +// buf = vec![]; +// } +// } +// } +// Ok(String::from_utf8(buf)?) +// } -fn send_message(stdin: &mut ChildStdin, message: &str) -> Result<()> { - stdin.write_all(format!("Content-Length: {}\r\n", message.as_bytes().len(),).as_bytes())?; - stdin.write_all("\r\n".as_bytes())?; - stdin.write_all(message.as_bytes())?; - Ok(()) -} +// fn send_message(stdin: &mut ChildStdin, message: &str) -> Result<()> { +// stdin.write_all(format!("Content-Length: {}\r\n", message.as_bytes().len(),).as_bytes())?; +// stdin.write_all("\r\n".as_bytes())?; +// stdin.write_all(message.as_bytes())?; +// Ok(()) +// } -// This completion sequence was created using helix with the lsp-ai analyzer and reading the logs -// It starts with a Python file: -// ``` -// # Multiplies two numbers -// def multiply_two_numbers(x, y): -// -// # A singular test -// assert multiply_two_numbers(2, 3) == 6 -// ``` -// And has the following sequence of key strokes: -// o on line 2 (this creates an indented new line and enters insert mode) -// r -// e -// The sequence has: -// - 1 textDocument/DidOpen notification -// - 3 textDocument/didChange notifications -// - 1 textDocument/completion requests -// This test can fail if the model gives a different response than normal, but that seems reasonably unlikely -// I guess we should hardcode the seed or something if we want to do more of these -#[test] -fn test_completion_sequence() -> Result<()> { - // let mut child = Command::new("cargo") - // .arg("run") - // .stdin(Stdio::piped()) - // .stdout(Stdio::piped()) - // .stderr(Stdio::piped()) - // .spawn()?; +// // This completion sequence was created using helix with the lsp-ai analyzer and reading the logs +// // It starts with a Python file: +// // ``` +// // # Multiplies two numbers +// // def multiply_two_numbers(x, y): +// // +// // # A singular test +// // assert multiply_two_numbers(2, 3) == 6 +// // ``` +// // And has the following sequence of key strokes: +// // o on line 2 (this creates an indented new line and enters insert mode) +// // r +// // e +// // The sequence has: +// // - 1 textDocument/DidOpen notification +// // - 3 textDocument/didChange notifications +// // - 1 textDocument/completion requests +// // This test can fail if the model gives a different response than normal, but that seems reasonably unlikely +// // I guess we should hardcode the seed or something if we want to do more of these +// #[test] +// fn test_completion_sequence() -> Result<()> { +// // let mut child = Command::new("cargo") +// // .arg("run") +// // .stdin(Stdio::piped()) +// // .stdout(Stdio::piped()) +// // .stderr(Stdio::piped()) +// // .spawn()?; - // let mut stdin = child.stdin.take().unwrap(); - // let mut stdout = child.stdout.take().unwrap(); 
+// // let mut stdin = child.stdin.take().unwrap(); +// // let mut stdout = child.stdout.take().unwrap(); - // let initialization_message = r##"{"jsonrpc":"2.0","method":"initialize","params":{"capabilities":{"general":{"positionEncodings":["utf-8","utf-32","utf-16"]},"textDocument":{"codeAction":{"codeActionLiteralSupport":{"codeActionKind":{"valueSet":["","quickfix","refactor","refactor.extract","refactor.inline","refactor.rewrite","source","source.organizeImports"]}},"dataSupport":true,"disabledSupport":true,"isPreferredSupport":true,"resolveSupport":{"properties":["edit","command"]}},"completion":{"completionItem":{"deprecatedSupport":true,"insertReplaceSupport":true,"resolveSupport":{"properties":["documentation","detail","additionalTextEdits"]},"snippetSupport":true,"tagSupport":{"valueSet":[1]}},"completionItemKind":{}},"hover":{"contentFormat":["markdown"]},"inlayHint":{"dynamicRegistration":false},"publishDiagnostics":{"versionSupport":true},"rename":{"dynamicRegistration":false,"honorsChangeAnnotations":false,"prepareSupport":true},"signatureHelp":{"signatureInformation":{"activeParameterSupport":true,"documentationFormat":["markdown"],"parameterInformation":{"labelOffsetSupport":true}}}},"window":{"workDoneProgress":true},"workspace":{"applyEdit":true,"configuration":true,"didChangeConfiguration":{"dynamicRegistration":false},"didChangeWatchedFiles":{"dynamicRegistration":true,"relativePatternSupport":false},"executeCommand":{"dynamicRegistration":false},"inlayHint":{"refreshSupport":false},"symbol":{"dynamicRegistration":false},"workspaceEdit":{"documentChanges":true,"failureHandling":"abort","normalizesLineEndings":false,"resourceOperations":["create","rename","delete"]},"workspaceFolders":true}},"clientInfo":{"name":"helix","version":"23.10 (f6021dd0)"},"processId":70007,"rootPath":"/Users/silas/Projects/Tests/lsp-ai-tests","rootUri":null,"workspaceFolders":[]},"id":0}"##; - // send_message(&mut stdin, initialization_message)?; - // let _ = read_response(&mut stdout)?; +// // let initialization_message = 
r##"{"jsonrpc":"2.0","method":"initialize","params":{"capabilities":{"general":{"positionEncodings":["utf-8","utf-32","utf-16"]},"textDocument":{"codeAction":{"codeActionLiteralSupport":{"codeActionKind":{"valueSet":["","quickfix","refactor","refactor.extract","refactor.inline","refactor.rewrite","source","source.organizeImports"]}},"dataSupport":true,"disabledSupport":true,"isPreferredSupport":true,"resolveSupport":{"properties":["edit","command"]}},"completion":{"completionItem":{"deprecatedSupport":true,"insertReplaceSupport":true,"resolveSupport":{"properties":["documentation","detail","additionalTextEdits"]},"snippetSupport":true,"tagSupport":{"valueSet":[1]}},"completionItemKind":{}},"hover":{"contentFormat":["markdown"]},"inlayHint":{"dynamicRegistration":false},"publishDiagnostics":{"versionSupport":true},"rename":{"dynamicRegistration":false,"honorsChangeAnnotations":false,"prepareSupport":true},"signatureHelp":{"signatureInformation":{"activeParameterSupport":true,"documentationFormat":["markdown"],"parameterInformation":{"labelOffsetSupport":true}}}},"window":{"workDoneProgress":true},"workspace":{"applyEdit":true,"configuration":true,"didChangeConfiguration":{"dynamicRegistration":false},"didChangeWatchedFiles":{"dynamicRegistration":true,"relativePatternSupport":false},"executeCommand":{"dynamicRegistration":false},"inlayHint":{"refreshSupport":false},"symbol":{"dynamicRegistration":false},"workspaceEdit":{"documentChanges":true,"failureHandling":"abort","normalizesLineEndings":false,"resourceOperations":["create","rename","delete"]},"workspaceFolders":true}},"clientInfo":{"name":"helix","version":"23.10 (f6021dd0)"},"processId":70007,"rootPath":"/Users/silas/Projects/Tests/lsp-ai-tests","rootUri":null,"workspaceFolders":[]},"id":0}"##; +// // send_message(&mut stdin, initialization_message)?; +// // let _ = read_response(&mut stdout)?; - // send_message( - // &mut stdin, - // r#"{"jsonrpc":"2.0","method":"initialized","params":{}}"#, - // )?; - // send_message( - // &mut stdin, - // r##"{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"languageId":"python","text":"# Multiplies two numbers\ndef multiply_two_numbers(x, y):\n\n# A singular test\nassert multiply_two_numbers(2, 3) == 6\n","uri":"file:///fake.py","version":0}}}"##, - // )?; - // send_message( - // &mut stdin, - // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":31,"line":1},"start":{"character":31,"line":1}},"text":"\n "}],"textDocument":{"uri":"file:///fake.py","version":1}}}"##, - // )?; - // send_message( - // &mut stdin, - // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":4,"line":2},"start":{"character":4,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":2}}}"##, - // )?; - // send_message( - // &mut stdin, - // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":5,"line":2},"start":{"character":5,"line":2}},"text":"e"}],"textDocument":{"uri":"file:///fake.py","version":3}}}"##, - // )?; - // send_message( - // &mut stdin, - // r##"{"jsonrpc":"2.0","method":"textDocument/completion","params":{"position":{"character":6,"line":2},"textDocument":{"uri":"file:///fake.py"}},"id":1}"##, - // )?; +// // send_message( +// // &mut stdin, +// // r#"{"jsonrpc":"2.0","method":"initialized","params":{}}"#, +// // )?; +// // send_message( +// // &mut stdin, +// // 
r##"{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"languageId":"python","text":"# Multiplies two numbers\ndef multiply_two_numbers(x, y):\n\n# A singular test\nassert multiply_two_numbers(2, 3) == 6\n","uri":"file:///fake.py","version":0}}}"##, +// // )?; +// // send_message( +// // &mut stdin, +// // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":31,"line":1},"start":{"character":31,"line":1}},"text":"\n "}],"textDocument":{"uri":"file:///fake.py","version":1}}}"##, +// // )?; +// // send_message( +// // &mut stdin, +// // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":4,"line":2},"start":{"character":4,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":2}}}"##, +// // )?; +// // send_message( +// // &mut stdin, +// // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":5,"line":2},"start":{"character":5,"line":2}},"text":"e"}],"textDocument":{"uri":"file:///fake.py","version":3}}}"##, +// // )?; +// // send_message( +// // &mut stdin, +// // r##"{"jsonrpc":"2.0","method":"textDocument/completion","params":{"position":{"character":6,"line":2},"textDocument":{"uri":"file:///fake.py"}},"id":1}"##, +// // )?; - // let output = read_response(&mut stdout)?; - // assert_eq!( - // output, - // r##"{"jsonrpc":"2.0","id":1,"result":{"isIncomplete":false,"items":[{"filterText":" re\n","kind":1,"label":"ai - turn x * y","textEdit":{"newText":"turn x * y","range":{"end":{"character":6,"line":2},"start":{"character":6,"line":2}}}}]}}"## - // ); +// // let output = read_response(&mut stdout)?; +// // assert_eq!( +// // output, +// // r##"{"jsonrpc":"2.0","id":1,"result":{"isIncomplete":false,"items":[{"filterText":" re\n","kind":1,"label":"ai - turn x * y","textEdit":{"newText":"turn x * y","range":{"end":{"character":6,"line":2},"start":{"character":6,"line":2}}}}]}}"## +// // ); - // child.kill()?; - // Ok(()) -} +// // child.kill()?; +// Ok(()) +// } From 58192c4182bddfe33d8e87591055de3bb7036c1b Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Sun, 16 Jun 2024 09:49:16 -0700 Subject: [PATCH 05/18] Periodic commit --- Cargo.lock | 76 ++++- Cargo.toml | 8 + src/config.rs | 49 +++- src/crawl.rs | 10 +- src/main.rs | 11 +- src/memory_backends/file_store.rs | 384 +++++++++++++++++++++----- src/memory_backends/postgresml/mod.rs | 294 ++++++++++++++------ src/splitters/mod.rs | 53 ++++ src/splitters/tree_sitter.rs | 77 ++++++ src/utils.rs | 18 +- 10 files changed, 810 insertions(+), 170 deletions(-) create mode 100644 src/splitters/mod.rs create mode 100644 src/splitters/tree_sitter.rs diff --git a/Cargo.lock b/Cargo.lock index 524e12f..cd6001c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1523,6 +1523,7 @@ dependencies = [ "anyhow", "assert_cmd", "async-trait", + "cc", "directories", "hf-hub", "ignore", @@ -1539,10 +1540,13 @@ dependencies = [ "ropey", "serde", "serde_json", + "splitter-tree-sitter", "tokenizers", "tokio", "tracing", "tracing-subscriber", + "tree-sitter", + "utils-tree-sitter", "xxhash-rust", ] @@ -2196,9 +2200,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.3" +version = "1.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = 
"b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" dependencies = [ "aho-corasick", "memchr", @@ -2756,6 +2760,15 @@ dependencies = [ "der", ] +[[package]] +name = "splitter-tree-sitter" +version = "0.1.0" +dependencies = [ + "cc", + "thiserror", + "tree-sitter", +] + [[package]] name = "spm_precompiled" version = "0.1.4" @@ -3088,18 +3101,18 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", @@ -3339,6 +3352,45 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tree-sitter" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df7cc499ceadd4dcdf7ec6d4cbc34ece92c3fa07821e287aedecd4416c516dca" +dependencies = [ + "cc", + "regex", +] + +[[package]] +name = "tree-sitter-python" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4066c6cf678f962f8c2c4561f205945c84834cce73d981e71392624fdc390a9" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-rust" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "277690f420bf90741dea984f3da038ace46c4fe6047cba57a66822226cde1c93" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-zig" +version = "0.0.1" +source = "git+https://github.com/SilasMarvin/tree-sitter-zig?branch=silas-update-tree-sitter-version#2eedab3ff6dda88aedddf0bb32a14f81bb709a73" +dependencies = [ + "cc", + "tree-sitter", +] + [[package]] name = "try-lock" version = "0.2.5" @@ -3450,6 +3502,18 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "utils-tree-sitter" +version = "0.1.0" +dependencies = [ + "cc", + "thiserror", + "tree-sitter", + "tree-sitter-python", + "tree-sitter-rust", + "tree-sitter-zig", +] + [[package]] name = "uuid" version = "1.7.0" diff --git a/Cargo.toml b/Cargo.toml index 18dfb33..657589a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,14 @@ pgml = "1.0.4" tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] } indexmap = "2.2.5" async-trait = "0.1.78" +tree-sitter = "0.22" +# splitter-tree-sitter = { git = "https://github.com/SilasMarvin/splitter-tree-sitter" } +splitter-tree-sitter = { path = "../../splitter-tree-sitter" } +# utils-tree-sitter = { git = "https://github.com/SilasMarvin/utils-tree-sitter" } +utils-tree-sitter = { path = "../../utils-tree-sitter", features = ["all"] } + +[build-dependencies] +cc="*" [features] default = [] diff --git a/src/config.rs b/src/config.rs index db1cf63..49f8e54 100644 --- a/src/config.rs +++ b/src/config.rs @@ -24,6 +24,43 @@ impl Default for PostProcess { } } +#[derive(Debug, Clone, Deserialize)] +pub enum ValidSplitter { + #[serde(rename = "tree_sitter")] + 
TreeSitter(TreeSitter), +} + +impl Default for ValidSplitter { + fn default() -> Self { + ValidSplitter::TreeSitter(TreeSitter::default()) + } +} + +const fn chunk_size_default() -> usize { + 1500 +} + +const fn chunk_overlap_default() -> usize { + 0 +} + +#[derive(Debug, Clone, Deserialize)] +pub struct TreeSitter { + #[serde(default = "chunk_size_default")] + pub chunk_size: usize, + #[serde(default = "chunk_overlap_default")] + pub chunk_overlap: usize, +} + +impl Default for TreeSitter { + fn default() -> Self { + Self { + chunk_size: 1500, + chunk_overlap: 0, + } + } +} + #[derive(Debug, Clone, Deserialize)] pub enum ValidMemoryBackend { #[serde(rename = "file_store")] @@ -85,15 +122,21 @@ pub struct FIM { pub end: String, } -const fn max_crawl_memory_default() -> u32 { +const fn max_crawl_memory_default() -> u64 { 42 } +const fn max_crawl_file_size_default() -> u64 { + 10_000_000 +} + #[derive(Clone, Debug, Deserialize)] #[serde(deny_unknown_fields)] pub struct Crawl { + #[serde(default = "max_crawl_file_size_default")] + pub max_file_size: u64, #[serde(default = "max_crawl_memory_default")] - pub max_crawl_memory: u32, + pub max_crawl_memory: u64, #[serde(default)] pub all_files: bool, } @@ -103,6 +146,8 @@ pub struct Crawl { pub struct PostgresML { pub database_url: Option, pub crawl: Option, + #[serde(default)] + pub splitter: ValidSplitter, } #[derive(Clone, Debug, Deserialize, Default)] diff --git a/src/crawl.rs b/src/crawl.rs index 4a860e2..191d869 100644 --- a/src/crawl.rs +++ b/src/crawl.rs @@ -18,10 +18,14 @@ impl Crawl { } } + pub fn crawl_config(&self) -> &config::Crawl { + &self.crawl_config + } + pub fn maybe_do_crawl( &mut self, triggered_file: Option, - mut f: impl FnMut(&str) -> anyhow::Result<()>, + mut f: impl FnMut(&config::Crawl, &str) -> anyhow::Result<()>, ) -> anyhow::Result<()> { if let Some(root_uri) = &self.config.client_params.root_uri { if !root_uri.starts_with("file://") { @@ -52,7 +56,7 @@ impl Crawl { if !path.is_dir() { if let Some(path_str) = path.to_str() { if self.crawl_config.all_files { - f(path_str)?; + f(&self.crawl_config, path_str)?; } else { match ( path.extension().map(|pe| pe.to_str()).flatten(), @@ -60,7 +64,7 @@ impl Crawl { ) { (Some(path_extension), Some(extension_to_match)) => { if path_extension == extension_to_match { - f(path_str)?; + f(&self.crawl_config, path_str)?; } } _ => continue, diff --git a/src/main.rs b/src/main.rs index 82ef732..106be0a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,6 +18,7 @@ mod crawl; mod custom_requests; mod memory_backends; mod memory_worker; +mod splitters; #[cfg(feature = "llama_cpp")] mod template; mod transformer_backends; @@ -51,15 +52,19 @@ where req.extract(R::METHOD) } -fn main() -> Result<()> { - // Builds a tracing subscriber from the `LSP_AI_LOG` environment variable - // If the variables value is malformed or missing, sets the default log level to ERROR +// Builds a tracing subscriber from the `LSP_AI_LOG` environment variable +// If the variable's value is malformed or missing, sets the default log level to ERROR +fn init_logger() { FmtSubscriber::builder() .with_writer(std::io::stderr) .with_ansi(false) .without_time() .with_env_filter(EnvFilter::from_env("LSP_AI_LOG")) .init(); +} + +fn main() -> Result<()> { + init_logger(); let (connection, io_threads) = Connection::stdio(); let server_capabilities = serde_json::to_value(ServerCapabilities { diff --git a/src/memory_backends/file_store.rs b/src/memory_backends/file_store.rs index e1f4ff2..e93cb06 100644 ---
a/src/memory_backends/file_store.rs +++ b/src/memory_backends/file_store.rs @@ -6,17 +6,50 @@ use ropey::Rope; use serde_json::Value; use std::collections::HashMap; use tracing::{error, instrument}; +use tree_sitter::{InputEdit, Point, Tree}; use crate::{ config::{self, Config}, crawl::Crawl, - utils::tokens_to_estimated_characters, + utils::{parse_tree, tokens_to_estimated_characters}, }; use super::{ContextAndCodePrompt, FIMPrompt, MemoryBackend, MemoryRunParams, Prompt, PromptType}; +#[derive(Default)] +pub struct AdditionalFileStoreParams { + build_tree: bool, +} + +impl AdditionalFileStoreParams { + pub fn new(build_tree: bool) -> Self { + Self { build_tree } + } +} + +#[derive(Clone)] +pub struct File { + rope: Rope, + tree: Option, +} + +impl File { + fn new(rope: Rope, tree: Option) -> Self { + Self { rope, tree } + } + + pub fn rope(&self) -> &Rope { + &self.rope + } + + pub fn tree(&self) -> Option<&Tree> { + self.tree.as_ref() + } +} + pub struct FileStore { - file_map: Mutex>, + params: AdditionalFileStoreParams, + file_map: Mutex>, accessed_files: Mutex>, crawl: Option>, } @@ -28,29 +61,72 @@ impl FileStore { .take() .map(|x| Mutex::new(Crawl::new(x, config.clone()))); let s = Self { + params: AdditionalFileStoreParams::default(), file_map: Mutex::new(HashMap::new()), accessed_files: Mutex::new(IndexSet::new()), crawl, }; if let Err(e) = s.maybe_do_crawl(None) { - error!("{e}") + error!("{e:?}") } Ok(s) } + pub fn new_with_params( + mut file_store_config: config::FileStore, + config: Config, + params: AdditionalFileStoreParams, + ) -> anyhow::Result { + let crawl = file_store_config + .crawl + .take() + .map(|x| Mutex::new(Crawl::new(x, config.clone()))); + let s = Self { + params, + file_map: Mutex::new(HashMap::new()), + accessed_files: Mutex::new(IndexSet::new()), + crawl, + }; + if let Err(e) = s.maybe_do_crawl(None) { + error!("{e:?}") + } + Ok(s) + } + + fn add_new_file(&self, uri: &str, contents: String) { + let tree = if self.params.build_tree { + match parse_tree(uri, &contents, None) { + Ok(tree) => Some(tree), + Err(e) => { + error!( + "Failed to parse tree for {uri} with error {e}, falling back to no tree" + ); + None + } + } + } else { + None + }; + self.file_map + .lock() + .insert(uri.to_string(), File::new(Rope::from_str(&contents), tree)); + self.accessed_files.lock().insert(uri.to_string()); + } + fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { if let Some(crawl) = &self.crawl { - crawl.lock().maybe_do_crawl(triggered_file, |path| { - let insert_uri = format!("file://{path}"); - if self.file_map.lock().contains_key(&insert_uri) { - return Ok(()); - } - let contents = std::fs::read_to_string(path)?; - self.file_map - .lock() - .insert(insert_uri, Rope::from_str(&contents)); - Ok(()) - })?; + crawl + .lock() + .maybe_do_crawl(triggered_file, |config, path| { + let insert_uri = format!("file://{path}"); + if self.file_map.lock().contains_key(&insert_uri) { + return Ok(()); + } + // TODO: actually limit files based on config + let contents = std::fs::read_to_string(path)?; + self.add_new_file(&insert_uri, contents); + Ok(()) + })?; } Ok(()) } @@ -67,6 +143,7 @@ impl FileStore { .lock() .get(¤t_document_uri) .context("Error file not found")? 
+ .rope .clone(); let mut cursor_index = rope.line_to_char(position.position.line as usize) + position.position.character as usize; @@ -82,7 +159,7 @@ impl FileStore { break; } let file_map = self.file_map.lock(); - let r = file_map.get(file).context("Error file not found")?; + let r = &file_map.get(file).context("Error file not found")?.rope; let slice_max = needed.min(r.len_chars() + 1); let rope_str_slice = r .get_slice(0..slice_max - 1) @@ -105,6 +182,7 @@ impl FileStore { .lock() .get(position.text_document.uri.as_str()) .context("Error file not found")? + .rope .clone(); let cursor_index = rope.line_to_char(position.position.line as usize) + position.position.character as usize; @@ -173,8 +251,8 @@ impl FileStore { }) } - pub fn get_file_contents(&self, uri: &str) -> Option { - self.file_map.lock().get(uri).clone().map(|x| x.to_string()) + pub fn file_map(&self) -> &Mutex> { + &self.file_map } pub fn contains_file(&self, uri: &str) -> bool { @@ -191,6 +269,7 @@ impl MemoryBackend for FileStore { .lock() .get(position.text_document.uri.as_str()) .context("Error file not found")? + .rope .clone(); let line = rope .get_line(position.position.line as usize) @@ -217,12 +296,10 @@ impl MemoryBackend for FileStore { &self, params: lsp_types::DidOpenTextDocumentParams, ) -> anyhow::Result<()> { - let rope = Rope::from_str(&params.text_document.text); let uri = params.text_document.uri.to_string(); - self.file_map.lock().insert(uri.clone(), rope); - self.accessed_files.lock().shift_insert(0, uri.clone()); + self.add_new_file(&uri, params.text_document.text); if let Err(e) = self.maybe_do_crawl(Some(uri)) { - error!("{e}") + error!("{e:?}") } Ok(()) } @@ -234,20 +311,95 @@ impl MemoryBackend for FileStore { ) -> anyhow::Result<()> { let uri = params.text_document.uri.to_string(); let mut file_map = self.file_map.lock(); - let rope = file_map + let file = file_map .get_mut(&uri) - .context("Error trying to get file that does not exist")?; + .with_context(|| format!("Trying to get file that does not exist {uri}"))?; for change in params.content_changes { // If range is omitted, text is the new text of the document if let Some(range) = change.range { - let start_index = - rope.line_to_char(range.start.line as usize) + range.start.character as usize; + // Record old positions + let (old_end_position, old_end_byte) = { + let last_line_index = file.rope.len_lines() - 1; + ( + file.rope + .get_line(last_line_index) + .context("getting last line for edit") + .map(|last_line| Point::new(last_line_index, last_line.len_chars())), + file.rope.bytes().count(), + ) + }; + // Update the document + let start_index = file.rope.line_to_char(range.start.line as usize) + + range.start.character as usize; let end_index = - rope.line_to_char(range.end.line as usize) + range.end.character as usize; - rope.remove(start_index..end_index); - rope.insert(start_index, &change.text); + file.rope.line_to_char(range.end.line as usize) + range.end.character as usize; + file.rope.remove(start_index..end_index); + file.rope.insert(start_index, &change.text); + // Set new end positions + let (new_end_position, new_end_byte) = { + let last_line_index = file.rope.len_lines() - 1; + ( + file.rope + .get_line(last_line_index) + .context("getting last line for edit") + .map(|last_line| Point::new(last_line_index, last_line.len_chars())), + file.rope.bytes().count(), + ) + }; + // Update the tree + if self.params.build_tree { + let mut old_tree = file.tree.take(); + let start_byte = file + .rope + .try_line_to_char(range.start.line as
usize) + .and_then(|start_char| { + file.rope + .try_char_to_byte(start_char + range.start.character as usize) + }) + .map_err(anyhow::Error::msg); + if let Some(old_tree) = &mut old_tree { + match (start_byte, old_end_position, new_end_position) { + (Ok(start_byte), Ok(old_end_position), Ok(new_end_position)) => { + old_tree.edit(&InputEdit { + start_byte, + old_end_byte, + new_end_byte, + start_position: Point::new( + range.start.line as usize, + range.start.character as usize, + ), + old_end_position, + new_end_position, + }); + file.tree = match parse_tree( + &uri, + &file.rope.to_string(), + Some(old_tree), + ) { + Ok(tree) => Some(tree), + Err(e) => { + error!("failed to edit tree: {e:?}"); + None + } + }; + } + (Err(e), _, _) | (_, Err(e), _) | (_, _, Err(e)) => { + error!("failed to build tree edit: {e:?}"); + } + } + } + } } else { - *rope = Rope::from_str(&change.text); + file.rope = Rope::from_str(&change.text); + if self.params.build_tree { + file.tree = match parse_tree(&uri, &change.text, None) { + Ok(tree) => Some(tree), + Err(e) => { + error!("failed to parse new tree: {e:?}"); + None + } + }; + } } } self.accessed_files.lock().shift_insert(0, uri); @@ -299,8 +451,8 @@ mod tests { } } - #[tokio::test] - async fn can_open_document() -> anyhow::Result<()> { + #[test] + fn can_open_document() -> anyhow::Result<()> { let params = lsp_types::DidOpenTextDocumentParams { text_document: generate_filler_text_document(None, None), }; @@ -312,12 +464,12 @@ mod tests { .get("file://filler/") .unwrap() .clone(); - assert_eq!(file.to_string(), "Here is the document body"); + assert_eq!(file.rope.to_string(), "Here is the document body"); Ok(()) } - #[tokio::test] - async fn can_rename_document() -> anyhow::Result<()> { + #[test] + fn can_rename_document() -> anyhow::Result<()> { let params = lsp_types::DidOpenTextDocumentParams { text_document: generate_filler_text_document(None, None), }; @@ -338,12 +490,12 @@ mod tests { .get("file://filler2/") .unwrap() .clone(); - assert_eq!(file.to_string(), "Here is the document body"); + assert_eq!(file.rope.to_string(), "Here is the document body"); Ok(()) } - #[tokio::test] - async fn can_change_document() -> anyhow::Result<()> { + #[test] + fn can_change_document() -> anyhow::Result<()> { let text_document = generate_filler_text_document(None, None); let params = DidOpenTextDocumentParams { @@ -379,7 +531,7 @@ mod tests { .get("file://filler/") .unwrap() .clone(); - assert_eq!(file.to_string(), "Hae is the document body"); + assert_eq!(file.rope.to_string(), "Hae is the document body"); let params = lsp_types::DidChangeTextDocumentParams { text_document: VersionedTextDocumentIdentifier { @@ -399,7 +551,7 @@ mod tests { .get("file://filler/") .unwrap() .clone(); - assert_eq!(file.to_string(), "abc"); + assert_eq!(file.rope.to_string(), "abc"); Ok(()) } @@ -579,43 +731,123 @@ The end with a trailing new line Ok(()) } - // #[tokio::test] - // async fn test_fim_placement_corner_cases() -> anyhow::Result<()> { - // let text_document = generate_filler_text_document(None, Some("test\n")); - // let params = lsp_types::DidOpenTextDocumentParams { - // text_document: text_document.clone(), - // }; - // let file_store = generate_base_file_store()?; - // file_store.opened_text_document(params).await?; + #[test] + fn test_file_store_tree_sitter() -> anyhow::Result<()> { + crate::init_logger(); - // // Test FIM - // let params = json!({ - // "fim": { - // "start": "SS", - // "middle": "MM", - // "end": "EE" - // } - // }); - // let prompt = file_store - // 
.build_prompt( - // &TextDocumentPositionParams { - // text_document: TextDocumentIdentifier { - // uri: text_document.uri.clone(), - // }, - // position: Position { - // line: 1, - // character: 0, - // }, - // }, - // params, - // ) - // .await?; - // assert_eq!(prompt.context, ""); - // let text = r#"test - // "# - // .to_string(); - // assert_eq!(text, prompt.code); + let config = Config::default_with_file_store_without_models(); + let file_store_config = if let config::ValidMemoryBackend::FileStore(file_store_config) = + config.config.memory.clone() + { + file_store_config + } else { + anyhow::bail!("requires a file_store_config") + }; + let params = AdditionalFileStoreParams { build_tree: true }; + let file_store = FileStore::new_with_params(file_store_config, config, params)?; - // Ok(()) - // } + let uri = "file://filler/test.rs"; + let text = r#"#[derive(Debug)] +struct Rectangle { + width: u32, + height: u32, +} + +impl Rectangle { + fn area(&self) -> u32 { + + } +} + +fn main() { + let rect1 = Rectangle { + width: 30, + height: 50, + }; + + println!( + "The area of the rectangle is {} square pixels.", + rect1.area() + ); +}"#; + let text_document = TextDocumentItem { + uri: reqwest::Url::parse(uri).unwrap(), + language_id: "".to_string(), + version: 0, + text: text.to_string(), + }; + let params = DidOpenTextDocumentParams { + text_document: text_document.clone(), + }; + + file_store.opened_text_document(params)?; + + // Test insert + let params = lsp_types::DidChangeTextDocumentParams { + text_document: VersionedTextDocumentIdentifier { + uri: text_document.uri.clone(), + version: 1, + }, + content_changes: vec![TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 8, + character: 0, + }, + end: Position { + line: 8, + character: 0, + }, + }), + range_length: None, + text: " self.width * self.height".to_string(), + }], + }; + file_store.changed_text_document(params)?; + let file = file_store.file_map.lock().get(uri).unwrap().clone(); + assert_eq!(file.tree.unwrap().root_node().to_sexp(), "(source_file (attribute_item (attribute (identifier) arguments: (token_tree (identifier)))) (struct_item name: (type_identifier) body: (field_declaration_list (field_declaration name: (field_identifier) type: (primitive_type)) (field_declaration name: (field_identifier) type: (primitive_type)))) (impl_item type: (type_identifier) body: (declaration_list (function_item name: (identifier) parameters: (parameters (self_parameter (self))) return_type: (primitive_type) body: (block (binary_expression left: (field_expression value: (self) field: (field_identifier)) right: (field_expression value: (self) field: (field_identifier))))))) (function_item name: (identifier) parameters: (parameters) body: (block (let_declaration pattern: (identifier) value: (struct_expression name: (type_identifier) body: (field_initializer_list (field_initializer field: (field_identifier) value: (integer_literal)) (field_initializer field: (field_identifier) value: (integer_literal))))) (expression_statement (macro_invocation macro: (identifier) (token_tree (string_literal (string_content)) (identifier) (identifier) (token_tree)))))))"); + + // Test delete + let params = lsp_types::DidChangeTextDocumentParams { + text_document: VersionedTextDocumentIdentifier { + uri: text_document.uri.clone(), + version: 1, + }, + content_changes: vec![TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 0, + character: 0, + }, + end: Position { + line: 12, + character: 0, + }, + 
}), + range_length: None, + text: "".to_string(), + }], + }; + file_store.changed_text_document(params)?; + let file = file_store.file_map.lock().get(uri).unwrap().clone(); + assert_eq!(file.tree.unwrap().root_node().to_sexp(), "(source_file (function_item name: (identifier) parameters: (parameters) body: (block (let_declaration pattern: (identifier) value: (struct_expression name: (type_identifier) body: (field_initializer_list (field_initializer field: (field_identifier) value: (integer_literal)) (field_initializer field: (field_identifier) value: (integer_literal))))) (expression_statement (macro_invocation macro: (identifier) (token_tree (string_literal (string_content)) (identifier) (identifier) (token_tree)))))))"); + + // Test replace + let params = lsp_types::DidChangeTextDocumentParams { + text_document: VersionedTextDocumentIdentifier { + uri: text_document.uri, + version: 1, + }, + content_changes: vec![TextDocumentContentChangeEvent { + range: None, + range_length: None, + text: "fn main() {}".to_string(), + }], + }; + file_store.changed_text_document(params)?; + let file = file_store.file_map.lock().get(uri).unwrap().clone(); + assert_eq!(file.tree.unwrap().root_node().to_sexp(), "(source_file (function_item name: (identifier) parameters: (parameters) body: (block)))"); + + Ok(()) + } } diff --git a/src/memory_backends/postgresml/mod.rs b/src/memory_backends/postgresml/mod.rs index d94f9d2..2c091e1 100644 --- a/src/memory_backends/postgresml/mod.rs +++ b/src/memory_backends/postgresml/mod.rs @@ -1,30 +1,65 @@ +use anyhow::Context; +use lsp_types::TextDocumentPositionParams; +use parking_lot::Mutex; +use pgml::{Collection, Pipeline}; +use serde_json::{json, Value}; use std::{ + io::Read, sync::{ mpsc::{self, Sender}, Arc, }, time::Duration, }; - -use anyhow::Context; -use lsp_types::TextDocumentPositionParams; -use parking_lot::Mutex; -use pgml::{Collection, Pipeline}; -use serde_json::{json, Value}; use tokio::time; -use tracing::{error, instrument}; +use tracing::{error, instrument, warn}; use crate::{ config::{self, Config}, crawl::Crawl, - utils::{tokens_to_estimated_characters, TOKIO_RUNTIME}, + splitters::{Chunk, Splitter}, + utils::{chunk_to_id, tokens_to_estimated_characters, TOKIO_RUNTIME}, }; use super::{ - file_store::FileStore, ContextAndCodePrompt, FIMPrompt, MemoryBackend, MemoryRunParams, Prompt, - PromptType, + file_store::{AdditionalFileStoreParams, FileStore}, + ContextAndCodePrompt, FIMPrompt, MemoryBackend, MemoryRunParams, Prompt, PromptType, }; +fn chunk_to_document(uri: &str, chunk: Chunk) -> Value { + json!({ + "id": chunk_to_id(uri, &chunk), + "uri": uri, + "text": chunk.text, + "range": chunk.range + }) +} + +async fn split_and_upsert_file( + uri: &str, + collection: &mut Collection, + file_store: Arc, + splitter: Arc>, +) -> anyhow::Result<()> { + // We need to make sure we don't hold the file_store lock while performing a network call + let chunks = { + file_store + .file_map() + .lock() + .get(uri) + .map(|f| splitter.split(f)) + }; + let chunks = chunks.with_context(|| format!("file not found for splitting: {uri}"))?; + let documents = chunks + .into_iter() + .map(|chunk| chunk_to_document(uri, chunk).into()) + .collect(); + collection + .upsert_documents(documents, None) + .await + .context("PGML - Error upserting documents") +} + #[derive(Clone)] pub struct PostgresML { _config: Config, @@ -33,6 +68,7 @@ pub struct PostgresML { pipeline: Pipeline, debounce_tx: Sender, crawl: Option>>, + splitter: Arc>, } impl PostgresML { @@ -45,10 +81,16 @@ impl 
PostgresML { .crawl .take() .map(|x| Arc::new(Mutex::new(Crawl::new(x, configuration.clone())))); - let file_store = Arc::new(FileStore::new( + + let splitter: Arc> = + Arc::new(postgresml_config.splitter.try_into()?); + + let file_store = Arc::new(FileStore::new_with_params( config::FileStore::new_without_crawl(), configuration.clone(), + AdditionalFileStoreParams::new(splitter.does_use_tree_sitter()), )?); + let database_url = if let Some(database_url) = postgresml_config.database_url { database_url } else { @@ -86,6 +128,7 @@ impl PostgresML { let (debounce_tx, debounce_rx) = mpsc::channel::(); let mut task_collection = collection.clone(); let task_file_store = file_store.clone(); + let task_splitter = splitter.clone(); TOKIO_RUNTIME.spawn(async move { let duration = Duration::from_millis(500); let mut file_uris = Vec::new(); @@ -102,36 +145,83 @@ impl PostgresML { if file_uris.is_empty() { continue; } - let documents = match file_uris + + // Build the chunks for our changed files + let chunks: Vec> = match file_uris .iter() .map(|uri| { - let text = task_file_store - .get_file_contents(&uri) - .context("Error reading file contents from file_store")?; - anyhow::Ok( - json!({ - "id": uri, - "text": text - }) - .into(), - ) + let file_store = task_file_store.file_map().lock(); + let file = file_store + .get(uri) + .with_context(|| format!("getting file for splitting: {uri}"))?; + anyhow::Ok(task_splitter.split(file)) }) .collect() { - Ok(documents) => documents, + Ok(chunks) => chunks, Err(e) => { error!("{e}"); continue; } }; + + // Delete old chunks that no longer exist after the latest file changes + let delete_or_statements: Vec = file_uris + .iter() + .zip(&chunks) + .map(|(uri, chunks)| { + let ids: Vec = + chunks.iter().map(|c| chunk_to_id(uri, c)).collect(); + json!({ + "$and": [ + { + "uri": { + "$eq": uri + } + }, + { + "id": { + "$nin": ids + } + } + ] + }) + }) + .collect(); + if let Err(e) = task_collection + .delete_documents( + json!({ + "$or": delete_or_statements + }) + .into(), + ) + .await + { + error!("PGML - Error deleting file: {e:?}"); + } + + // Prepare and upsert our new chunks + let documents: Vec = chunks + .into_iter() + .zip(&file_uris) + .map(|(chunks, uri)| { + chunks + .into_iter() + .map(|chunk| chunk_to_document(&uri, chunk)) + .collect::>() + }) + .flatten() + .map(|f: Value| f.into()) + .collect(); if let Err(e) = task_collection .upsert_documents(documents, None) .await - .context("PGML - Error adding pipeline to collection") + .context("PGML - Error upserting changed files") { error!("{e}"); continue; } + file_uris = Vec::new(); } } @@ -144,6 +234,7 @@ impl PostgresML { pipeline, debounce_tx, crawl, + splitter, }; if let Err(e) = s.maybe_do_crawl(None) { @@ -154,28 +245,73 @@ impl PostgresML { fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { if let Some(crawl) = &self.crawl { - let mut _collection = self.collection.clone(); - let mut _pipeline = self.pipeline.clone(); - let mut documents: Vec = vec![]; - crawl.lock().maybe_do_crawl(triggered_file, |path| { - let uri = format!("file://{path}"); - // This means it has been opened before - if self.file_store.contains_file(&uri) { - return Ok(()); - } - // Get the contents, split, and upsert it - let contents = std::fs::read_to_string(path)?; - documents.push( - json!({ - "id": uri, - "text": contents - }) - .into(), - ); - // Track the size of the documents we have - // If it is over some amount in bytes, upsert it - Ok(()) - })?; + let mut documents: Vec<(String, Vec)> = vec![]; 
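+ // Buffer (uri, chunks) pairs while crawling and flush them in batches: + // an upsert fires once roughly 100MB of contents accumulate since the last flush + // (or the memory cap is hit), and the crawl ends once `max_crawl_memory` total bytes have been read.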
+ let mut total_bytes = 0; + let mut current_bytes = 0; crawl + .lock() + .maybe_do_crawl(triggered_file, |config, path| { + let uri = format!("file://{path}"); + // This means it has been opened before + if self.file_store.contains_file(&uri) { + return Ok(()); + } + // Open the file and see if it is small enough to read + let mut f = std::fs::File::open(path)?; + if f.metadata() + .map(|m| m.len() > config.max_file_size) + .unwrap_or(true) + { + warn!("Skipping file because it is too large: {path}"); + return Ok(()); + } + // Read the file contents + let mut contents = vec![]; + f.read_to_end(&mut contents); + if let Ok(contents) = String::from_utf8(contents) { + current_bytes += contents.len(); + total_bytes += contents.len(); + let chunks = self.splitter.split_file_contents(&uri, &contents); + documents.push((uri, chunks)); + } + // If we have over 100 megabytes of data do the upsert + if current_bytes >= 100_000_000 || total_bytes as u64 >= config.max_crawl_memory + { + // Prepare our chunks + let to_upsert_documents: Vec = + std::mem::take(&mut documents) + .into_iter() + .map(|(uri, chunks)| { + chunks + .into_iter() + .map(|chunk| chunk_to_document(&uri, chunk)) + .collect::>() + }) + .flatten() + .map(|f: Value| f.into()) + .collect(); + // Do the upsert + let mut collection = self.collection.clone(); + TOKIO_RUNTIME.spawn(async move { + if let Err(e) = collection + .upsert_documents(to_upsert_documents, None) + .await + .context("PGML - Error upserting changed files") + { + error!("{e}"); + } + }); + // Reset everything + current_bytes = 0; + documents = vec![]; + } + // Break if total bytes is over the max crawl memory + if total_bytes as u64 >= config.max_crawl_memory { + warn!("Ending crawl early due to max_crawl_memory"); + return Ok(()); + } + Ok(()) + })?; } Ok(()) } @@ -263,25 +399,22 @@ impl MemoryBackend for PostgresML { params: lsp_types::DidOpenTextDocumentParams, ) -> anyhow::Result<()> { self.file_store.opened_text_document(params.clone())?; - let mut task_collection = self.collection.clone(); + let saved_uri = params.text_document.uri.to_string(); + + let mut collection = self.collection.clone(); + let file_store = self.file_store.clone(); + let splitter = self.splitter.clone(); TOKIO_RUNTIME.spawn(async move { - let text = params.text_document.text.clone(); let uri = params.text_document.uri.to_string(); - task_collection - .upsert_documents( - vec![json!({ - "id": uri, - "text": text - }) - .into()], - None, - ) - .await - .expect("PGML - Error upserting documents"); + if let Err(e) = split_and_upsert_file(&uri, &mut collection, file_store, splitter).await + { + error!("{e:?}") + } }); + if let Err(e) = self.maybe_do_crawl(Some(saved_uri)) { - error!("{e}") + error!("{e:?}") } Ok(()) } @@ -300,32 +433,35 @@ impl MemoryBackend for PostgresML { #[instrument(skip(self))] fn renamed_files(&self, params: lsp_types::RenameFilesParams) -> anyhow::Result<()> { self.file_store.renamed_files(params.clone())?; - let mut task_collection = self.collection.clone(); - let task_params = params.clone(); + + let mut collection = self.collection.clone(); + let file_store = self.file_store.clone(); + let splitter = self.splitter.clone(); TOKIO_RUNTIME.spawn(async move { - for file in task_params.files { - task_collection + for file in params.files { + if let Err(e) = collection .delete_documents( json!({ - "id": file.old_uri + "uri": { + "$eq": file.old_uri + } }) .into(), ) .await - .expect("PGML - Error deleting file"); - let text = -
std::fs::read_to_string(&file.new_uri).expect("PGML - Error reading file"); - task_collection - .upsert_documents( - vec![json!({ - "id": file.new_uri, - "text": text - }) - .into()], - None, - ) - .await - .expect("PGML - Error adding pipeline to collection"); + { + error!("PGML - Error deleting file: {e:?}"); + } + if let Err(e) = split_and_upsert_file( + &file.new_uri, + &mut collection, + file_store.clone(), + splitter.clone(), + ) + .await + { + error!("{e:?}") + } } }); Ok(()) diff --git a/src/splitters/mod.rs b/src/splitters/mod.rs new file mode 100644 index 0000000..ed5c15a --- /dev/null +++ b/src/splitters/mod.rs @@ -0,0 +1,53 @@ +use serde::Serialize; + +use crate::{config::ValidSplitter, memory_backends::file_store::File}; + +mod tree_sitter; + +#[derive(Serialize)] +pub struct ByteRange { + pub start_byte: usize, + pub end_byte: usize, +} + +impl ByteRange { + pub fn new(start_byte: usize, end_byte: usize) -> Self { + Self { + start_byte, + end_byte, + } + } +} + +#[derive(Serialize)] +pub struct Chunk { + pub text: String, + pub range: ByteRange, +} + +impl Chunk { + fn new(text: String, range: ByteRange) -> Self { + Self { text, range } + } +} + +pub trait Splitter { + fn split(&self, file: &File) -> Vec; + fn split_file_contents(&self, uri: &str, contents: &str) -> Vec; + + fn does_use_tree_sitter(&self) -> bool { + false + } +} + +impl TryFrom for Box { + type Error = anyhow::Error; + + fn try_from(value: ValidSplitter) -> Result { + match value { + ValidSplitter::TreeSitter(config) => { + Ok(Box::new(tree_sitter::TreeSitter::new(config)?)) + } + } + } +} diff --git a/src/splitters/tree_sitter.rs b/src/splitters/tree_sitter.rs new file mode 100644 index 0000000..e8fb309 --- /dev/null +++ b/src/splitters/tree_sitter.rs @@ -0,0 +1,77 @@ +use splitter_tree_sitter::TreeSitterCodeSplitter; +use tracing::error; +use tree_sitter::Tree; + +use crate::{config, memory_backends::file_store::File, utils::parse_tree}; + +use super::{ByteRange, Chunk, Splitter}; + +pub struct TreeSitter { + _config: config::TreeSitter, + splitter: TreeSitterCodeSplitter, +} + +impl TreeSitter { + pub fn new(config: config::TreeSitter) -> anyhow::Result { + Ok(Self { + splitter: TreeSitterCodeSplitter::new(config.chunk_size, config.chunk_overlap)?, + _config: config, + }) + } + + fn split_tree(&self, tree: &Tree, contents: &[u8]) -> anyhow::Result> { + Ok(self + .splitter + .split(tree, contents)? + .into_iter() + .map(|c| { + Chunk::new( + c.text.to_owned(), + ByteRange::new(c.range.start_byte, c.range.end_byte), + ) + }) + .collect()) + } +} + +impl Splitter for TreeSitter { + fn split(&self, file: &File) -> Vec { + if let Some(tree) = file.tree() { + match self.split_tree(tree, file.rope().to_string().as_bytes()) { + Ok(chunks) => chunks, + Err(e) => { + error!( + "Failed to parse tree for file with error {e:?}. Falling back to default splitter.", + ); + todo!() + } + } + } else { + panic!("TreeSitter splitter requires a tree to split") + } + } + + fn split_file_contents(&self, uri: &str, contents: &str) -> Vec { + match parse_tree(uri, contents, None) { + Ok(tree) => match self.split_tree(&tree, contents.as_bytes()) { + Ok(chunks) => chunks, + Err(e) => { + error!( + "Failed to parse tree for file: {uri} with error {e:?}. Falling back to default splitter.", + ); + todo!() + } + }, + Err(e) => { + error!( + "Failed to parse tree for file {uri} with error {e:?}. 
Falling back to default splitter.", + ); + todo!() + } + } + } + + fn does_use_tree_sitter(&self) -> bool { + true + } +} diff --git a/src/utils.rs b/src/utils.rs index 29afd71..8b5b8b4 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,8 +1,10 @@ +use anyhow::Context; use lsp_server::ResponseError; use once_cell::sync::Lazy; use tokio::runtime; +use tree_sitter::Tree; -use crate::{config::ChatMessage, memory_backends::ContextAndCodePrompt}; +use crate::{config::ChatMessage, memory_backends::ContextAndCodePrompt, splitters::Chunk}; pub static TOKIO_RUNTIME: Lazy = Lazy::new(|| { runtime::Builder::new_multi_thread() @@ -52,3 +54,17 @@ pub fn format_context_code_in_str(s: &str, context: &str, code: &str) -> String pub fn format_context_code(context: &str, code: &str) -> String { format!("{context}\n\n{code}") } + +pub fn chunk_to_id(uri: &str, chunk: &Chunk) -> String { + format!("{uri}#{}-{}", chunk.range.start_byte, chunk.range.end_byte) +} + +pub fn parse_tree(uri: &str, contents: &str, old_tree: Option<&Tree>) -> anyhow::Result { + let path = std::path::Path::new(uri); + let extension = path.extension().map(|x| x.to_string_lossy()); + let extension = extension.as_deref().unwrap_or(""); + let mut parser = utils_tree_sitter::get_parser_for_extension(extension)?; + parser + .parse(&contents, old_tree) + .with_context(|| format!("parsing tree failed for {uri}")) +} From f2b8c1eda30b5ce7b53d14705380169623161777 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Sun, 16 Jun 2024 16:25:44 -0700 Subject: [PATCH 06/18] Made into a workspace --- Cargo.lock | 103 ++++++++++++++++-- Cargo.toml | 56 ++-------- crates/lsp-ai/Cargo.toml | 50 +++++++++ {src => crates/lsp-ai/src}/config.rs | 0 {src => crates/lsp-ai/src}/crawl.rs | 26 +++-- .../lsp-ai/src}/custom_requests/generation.rs | 0 .../src}/custom_requests/generation_stream.rs | 0 .../lsp-ai/src}/custom_requests/mod.rs | 0 {src => crates/lsp-ai/src}/main.rs | 0 .../lsp-ai/src}/memory_backends/file_store.rs | 31 +++++- .../lsp-ai/src}/memory_backends/mod.rs | 0 .../src}/memory_backends/postgresml/mod.rs | 39 +++---- {src => crates/lsp-ai/src}/memory_worker.rs | 0 {src => crates/lsp-ai/src}/splitters/mod.rs | 0 .../lsp-ai/src}/splitters/tree_sitter.rs | 6 +- {src => crates/lsp-ai/src}/template.rs | 0 .../src}/transformer_backends/anthropic.rs | 0 .../src}/transformer_backends/gemini.rs | 0 .../transformer_backends/llama_cpp/mod.rs | 0 .../transformer_backends/llama_cpp/model.rs | 0 .../src}/transformer_backends/mistral_fim.rs | 0 .../lsp-ai/src}/transformer_backends/mod.rs | 0 .../src}/transformer_backends/ollama.rs | 0 .../src}/transformer_backends/open_ai/mod.rs | 0 .../lsp-ai/src}/transformer_worker.rs | 0 {src => crates/lsp-ai/src}/utils.rs | 0 crates/splitter-tree-sitter | 1 + crates/utils-tree-sitter | 1 + 28 files changed, 222 insertions(+), 91 deletions(-) create mode 100644 crates/lsp-ai/Cargo.toml rename {src => crates/lsp-ai/src}/config.rs (100%) rename {src => crates/lsp-ai/src}/crawl.rs (73%) rename {src => crates/lsp-ai/src}/custom_requests/generation.rs (100%) rename {src => crates/lsp-ai/src}/custom_requests/generation_stream.rs (100%) rename {src => crates/lsp-ai/src}/custom_requests/mod.rs (100%) rename {src => crates/lsp-ai/src}/main.rs (100%) rename {src => crates/lsp-ai/src}/memory_backends/file_store.rs (95%) rename {src => crates/lsp-ai/src}/memory_backends/mod.rs (100%) rename {src => crates/lsp-ai/src}/memory_backends/postgresml/mod.rs (94%) rename {src => 
crates/lsp-ai/src}/memory_worker.rs (100%) rename {src => crates/lsp-ai/src}/splitters/mod.rs (100%) rename {src => crates/lsp-ai/src}/splitters/tree_sitter.rs (93%) rename {src => crates/lsp-ai/src}/template.rs (100%) rename {src => crates/lsp-ai/src}/transformer_backends/anthropic.rs (100%) rename {src => crates/lsp-ai/src}/transformer_backends/gemini.rs (100%) rename {src => crates/lsp-ai/src}/transformer_backends/llama_cpp/mod.rs (100%) rename {src => crates/lsp-ai/src}/transformer_backends/llama_cpp/model.rs (100%) rename {src => crates/lsp-ai/src}/transformer_backends/mistral_fim.rs (100%) rename {src => crates/lsp-ai/src}/transformer_backends/mod.rs (100%) rename {src => crates/lsp-ai/src}/transformer_backends/ollama.rs (100%) rename {src => crates/lsp-ai/src}/transformer_backends/open_ai/mod.rs (100%) rename {src => crates/lsp-ai/src}/transformer_worker.rs (100%) rename {src => crates/lsp-ai/src}/utils.rs (100%) create mode 160000 crates/splitter-tree-sitter create mode 160000 crates/utils-tree-sitter diff --git a/Cargo.lock b/Cargo.lock index cd6001c..20cdfff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -149,6 +149,18 @@ dependencies = [ "num-traits", ] +[[package]] +name = "auto_enums" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1899bfcfd9340ceea3533ea157360ba8fa864354eccbceab58e1006ecab35393" +dependencies = [ + "derive_utils", + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -356,7 +368,7 @@ version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -662,6 +674,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_utils" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61bb5a1014ce6dfc2a378578509abe775a5aa06bff584a547555d9efdb81b926" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "difflib" version = "0.4.0" @@ -730,9 +753,9 @@ checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" [[package]] name = "either" -version = "1.10.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" dependencies = [ "serde", ] @@ -1056,6 +1079,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.9" @@ -1364,6 +1393,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.10" @@ -1518,7 +1556,7 @@ dependencies = [ [[package]] name = "lsp-ai" -version = "0.3.0" +version = "0.2.0" dependencies = [ "anyhow", "assert_cmd", @@ -1541,6 +1579,7 @@ dependencies = [ "serde", "serde_json", "splitter-tree-sitter", + "text-splitter", "tokenizers", "tokio", "tracing", @@ -2419,6 +2458,12 @@ dependencies = [ "untrusted", ] +[[package]] 
+name = "rustversion" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" + [[package]] name = "ryu" version = "1.0.17" @@ -2479,7 +2524,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "878cf3d57f0e5bfacd425cdaccc58b4c06d68a7b71c63fc28710a20c88676808" dependencies = [ "darling 0.14.4", - "heck", + "heck 0.4.1", "quote", "syn 1.0.109", ] @@ -2502,7 +2547,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25a82fcb49253abcb45cdcb2adf92956060ec0928635eb21b4f7a6d8f25ab0bc" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -2767,6 +2812,8 @@ dependencies = [ "cc", "thiserror", "tree-sitter", + "tree-sitter-rust", + "tree-sitter-zig", ] [[package]] @@ -2870,7 +2917,7 @@ checksum = "5833ef53aaa16d860e92123292f1f6a3d53c34ba8b1969f152ef1a7bb803f3c8" dependencies = [ "dotenvy", "either", - "heck", + "heck 0.4.1", "hex", "once_cell", "proc-macro2", @@ -3026,6 +3073,28 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.52", +] + [[package]] name = "subtle" version = "2.5.0" @@ -3099,6 +3168,24 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" +[[package]] +name = "text-splitter" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab9dc04b7cf08eb01c07c272bf699fa55679a326ddf7dd075e14094efc80fb9" +dependencies = [ + "ahash", + "auto_enums", + "either", + "itertools 0.13.0", + "once_cell", + "regex", + "strum", + "thiserror", + "tree-sitter", + "unicode-segmentation", +] + [[package]] name = "thiserror" version = "1.0.61" @@ -3385,7 +3472,7 @@ dependencies = [ [[package]] name = "tree-sitter-zig" version = "0.0.1" -source = "git+https://github.com/SilasMarvin/tree-sitter-zig?branch=silas-update-tree-sitter-version#2eedab3ff6dda88aedddf0bb32a14f81bb709a73" +source = "git+https://github.com/maxxnino/tree-sitter-zig#7c5a29b721d409be8842017351bf007d7e384401" dependencies = [ "cc", "tree-sitter", diff --git a/Cargo.toml b/Cargo.toml index 657589a..afb3496 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,52 +1,16 @@ -[package] -name = "lsp-ai" -version = "0.3.0" +[workspace] +members = [ + "crates/*", +] +resolver = "2" + +[workspace.package] edition = "2021" license = "MIT" description = "LSP-AI is an open-source language server that serves as a backend for AI-powered functionality, designed to assist and empower software engineers, not replace them." 
repository = "https://github.com/SilasMarvin/lsp-ai" readme = "README.md" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -anyhow = "1.0.75" -lsp-server = "0.7.6" -lsp-types = "0.95.0" -ropey = "1.6.1" -serde = "1.0.190" -serde_json = "1.0.108" -hf-hub = { git = "https://github.com/huggingface/hf-hub", version = "0.3.2" } -rand = "0.8.5" -tokenizers = "0.14.1" -parking_lot = "0.12.1" -once_cell = "1.19.0" -directories = "5.0.1" -llama-cpp-2 = { version = "0.1.55", optional = true } -minijinja = { version = "1.0.12", features = ["loader"] } -tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } -tracing = "0.1.40" -xxhash-rust = { version = "0.8.5", features = ["xxh3"] } -reqwest = { version = "0.11.25", features = ["blocking", "json"] } -ignore = "0.4.22" -pgml = "1.0.4" -tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] } -indexmap = "2.2.5" -async-trait = "0.1.78" -tree-sitter = "0.22" -# splitter-tree-sitter = { git = "https://github.com/SilasMarvin/splitter-tree-sitter" } -splitter-tree-sitter = { path = "../../splitter-tree-sitter" } -# utils-tree-sitter = { git = "https://github.com/SilasMarvin/utils-tree-sitter" } -utils-tree-sitter = { path = "../../utils-tree-sitter", features = ["all"] } - -[build-dependencies] -cc="*" - -[features] -default = [] -llama_cpp = ["dep:llama-cpp-2"] -metal = ["llama-cpp-2/metal"] -cuda = ["llama-cpp-2/cuda"] - -[dev-dependencies] -assert_cmd = "2.0.14" +[workspace.dependencies] +utils-tree-sitter = { path = "./crates/utils-tree-sitter" } +splitter-tree-sitter = { path = "./crates/splitter-tree-sitter" } diff --git a/crates/lsp-ai/Cargo.toml b/crates/lsp-ai/Cargo.toml new file mode 100644 index 0000000..1379ac0 --- /dev/null +++ b/crates/lsp-ai/Cargo.toml @@ -0,0 +1,50 @@ +[package] +name = "lsp-ai" +version = "0.2.0" + +description.workspace = true +repository.workspace = true +readme.workspace = true +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow = "1.0.75" +lsp-server = "0.7.6" +lsp-types = "0.95.0" +ropey = "1.6.1" +serde = "1.0.190" +serde_json = "1.0.108" +hf-hub = { git = "https://github.com/huggingface/hf-hub", version = "0.3.2" } +rand = "0.8.5" +tokenizers = "0.14.1" +parking_lot = "0.12.1" +once_cell = "1.19.0" +directories = "5.0.1" +llama-cpp-2 = { version = "0.1.55", optional = true } +minijinja = { version = "1.0.12", features = ["loader"] } +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +tracing = "0.1.40" +xxhash-rust = { version = "0.8.5", features = ["xxh3"] } +reqwest = { version = "0.11.25", features = ["blocking", "json"] } +ignore = "0.4.22" +pgml = "1.0.4" +tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] } +indexmap = "2.2.5" +async-trait = "0.1.78" +tree-sitter = "0.22" +utils-tree-sitter = { workspace = true, features = ["all"] } +splitter-tree-sitter = { workspace = true } +text-splitter = { version = "0.13.3", features = ["code"] } + +[build-dependencies] +cc="*" + +[features] +default = [] +llama_cpp = ["dep:llama-cpp-2"] +metal = ["llama-cpp-2/metal"] +cuda = ["llama-cpp-2/cuda"] + +[dev-dependencies] +assert_cmd = "2.0.14" diff --git a/src/config.rs b/crates/lsp-ai/src/config.rs similarity index 100% rename from src/config.rs rename to crates/lsp-ai/src/config.rs diff --git a/src/crawl.rs b/crates/lsp-ai/src/crawl.rs similarity index 73% rename from src/crawl.rs rename to crates/lsp-ai/src/crawl.rs index 191d869..beade33 100644 --- 
a/src/crawl.rs +++ b/crates/lsp-ai/src/crawl.rs @@ -1,5 +1,6 @@ use ignore::WalkBuilder; use std::collections::HashSet; +use tracing::{error, instrument}; use crate::config::{self, Config}; @@ -18,14 +19,11 @@ impl Crawl { } } - pub fn crawl_config(&self) -> &config::Crawl { - &self.crawl_config - } - + #[instrument(skip(self, f))] pub fn maybe_do_crawl( &mut self, triggered_file: Option, - mut f: impl FnMut(&config::Crawl, &str) -> anyhow::Result<()>, + mut f: impl FnMut(&config::Crawl, &str) -> anyhow::Result, ) -> anyhow::Result<()> { if let Some(root_uri) = &self.config.client_params.root_uri { if !root_uri.starts_with("file://") { @@ -56,7 +54,14 @@ impl Crawl { if !path.is_dir() { if let Some(path_str) = path.to_str() { if self.crawl_config.all_files { - f(&self.crawl_config, path_str)?; + match f(&self.crawl_config, path_str) { + Ok(c) => { + if !c { + return Ok(()); + } + } + Err(e) => error!("{e:?}"), + } } else { match ( path.extension().map(|pe| pe.to_str()).flatten(), @@ -64,7 +69,14 @@ impl Crawl { ) { (Some(path_extension), Some(extension_to_match)) => { if path_extension == extension_to_match { - f(&self.crawl_config, path_str)?; + match f(&self.crawl_config, path_str) { + Ok(c) => { + if !c { + return Ok(()); + } + } + Err(e) => error!("{e:?}"), + } } } _ => continue, diff --git a/src/custom_requests/generation.rs b/crates/lsp-ai/src/custom_requests/generation.rs similarity index 100% rename from src/custom_requests/generation.rs rename to crates/lsp-ai/src/custom_requests/generation.rs diff --git a/src/custom_requests/generation_stream.rs b/crates/lsp-ai/src/custom_requests/generation_stream.rs similarity index 100% rename from src/custom_requests/generation_stream.rs rename to crates/lsp-ai/src/custom_requests/generation_stream.rs diff --git a/src/custom_requests/mod.rs b/crates/lsp-ai/src/custom_requests/mod.rs similarity index 100% rename from src/custom_requests/mod.rs rename to crates/lsp-ai/src/custom_requests/mod.rs diff --git a/src/main.rs b/crates/lsp-ai/src/main.rs similarity index 100% rename from src/main.rs rename to crates/lsp-ai/src/main.rs diff --git a/src/memory_backends/file_store.rs b/crates/lsp-ai/src/memory_backends/file_store.rs similarity index 95% rename from src/memory_backends/file_store.rs rename to crates/lsp-ai/src/memory_backends/file_store.rs index e93cb06..f02e3a2 100644 --- a/src/memory_backends/file_store.rs +++ b/crates/lsp-ai/src/memory_backends/file_store.rs @@ -4,8 +4,8 @@ use lsp_types::TextDocumentPositionParams; use parking_lot::Mutex; use ropey::Rope; use serde_json::Value; -use std::collections::HashMap; -use tracing::{error, instrument}; +use std::{collections::HashMap, io::Read}; +use tracing::{error, instrument, warn}; use tree_sitter::{InputEdit, Point, Tree}; use crate::{ @@ -114,18 +114,37 @@ impl FileStore { } fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { + let mut total_bytes = 0; + let mut current_bytes = 0; if let Some(crawl) = &self.crawl { crawl .lock() .maybe_do_crawl(triggered_file, |config, path| { + // Break if total bytes is over the max crawl memory + if total_bytes as u64 >= config.max_crawl_memory { + warn!("Ending crawl early due to `max_crawl_memory` restraint"); + return Ok(false); + } + // This means it has been opened before let insert_uri = format!("file://{path}"); if self.file_map.lock().contains_key(&insert_uri) { - return Ok(()); + return Ok(true); } - // TODO: actually limit files based on config - let contents = std::fs::read_to_string(path)?; + // Open the file and see
if it is small enough to read + let mut f = std::fs::File::open(path)?; + let metadata = f.metadata()?; + if metadata.len() > config.max_file_size { + warn!("Skipping file: {path} because it is too large"); + return Ok(true); + } + // Read the file contents + let mut contents = vec![]; + f.read_to_end(&mut contents)?; + let contents = String::from_utf8(contents)?; + current_bytes += contents.len(); + total_bytes += contents.len(); self.add_new_file(&insert_uri, contents); - Ok(()) + Ok(true) })?; } Ok(()) diff --git a/src/memory_backends/mod.rs b/crates/lsp-ai/src/memory_backends/mod.rs similarity index 100% rename from src/memory_backends/mod.rs rename to crates/lsp-ai/src/memory_backends/mod.rs diff --git a/src/memory_backends/postgresml/mod.rs b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs similarity index 94% rename from src/memory_backends/postgresml/mod.rs rename to crates/lsp-ai/src/memory_backends/postgresml/mod.rs index 2c091e1..2281efc 100644 --- a/src/memory_backends/postgresml/mod.rs +++ b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs @@ -251,29 +251,31 @@ impl PostgresML { crawl .lock() .maybe_do_crawl(triggered_file, |config, path| { - let uri = format!("file://{path}"); + // Break if total bytes is over the max crawl memory + if total_bytes as u64 >= config.max_crawl_memory { + warn!("Ending crawl early due to `max_crawl_memory` restraint"); + return Ok(false); + } // This means it has been opened before + let uri = format!("file://{path}"); if self.file_store.contains_file(&uri) { - return Ok(()); + return Ok(true); } // Open the file and see if it is small enough to read let mut f = std::fs::File::open(path)?; - if f.metadata() - .map(|m| m.len() > config.max_file_size) - .unwrap_or(true) - { - warn!("Skipping file because it is too large: {path}"); - return Ok(()); + let metadata = f.metadata()?; + if metadata.len() > config.max_file_size { + warn!("Skipping file: {path} because it is too large"); + return Ok(true); } // Read the file contents let mut contents = vec![]; - f.read_to_end(&mut contents); - if let Ok(contents) = String::from_utf8(contents) { - current_bytes += contents.len(); - total_bytes += contents.len(); - let chunks = self.splitter.split_file_contents(&uri, &contents); - documents.push((uri, chunks)); - } + f.read_to_end(&mut contents)?; + let contents = String::from_utf8(contents)?; + current_bytes += contents.len(); + total_bytes += contents.len(); + let chunks = self.splitter.split_file_contents(&uri, &contents); + documents.push((uri, chunks)); // If we have over 100 megabytes of data do the upsert if current_bytes >= 100_000_000 || total_bytes as u64 >= config.max_crawl_memory { @@ -305,12 +307,7 @@ impl PostgresML { current_bytes = 0; documents = vec![]; } - // Break if total bytes is over the max crawl memory - if total_bytes as u64 >= config.max_crawl_memory { - warn!("Ending crawl early due to max_crawl_memory"); - return Ok(()); - } - Ok(()) + Ok(true) })?; } Ok(()) diff --git a/src/memory_worker.rs b/crates/lsp-ai/src/memory_worker.rs similarity index 100% rename from src/memory_worker.rs rename to crates/lsp-ai/src/memory_worker.rs diff --git a/src/splitters/mod.rs b/crates/lsp-ai/src/splitters/mod.rs similarity index 100% rename from src/splitters/mod.rs rename to crates/lsp-ai/src/splitters/mod.rs diff --git a/src/splitters/tree_sitter.rs b/crates/lsp-ai/src/splitters/tree_sitter.rs similarity index 93% rename from src/splitters/tree_sitter.rs rename to crates/lsp-ai/src/splitters/tree_sitter.rs index e8fb309..ce44185 100644
--- a/src/splitters/tree_sitter.rs +++ b/crates/lsp-ai/src/splitters/tree_sitter.rs @@ -41,7 +41,7 @@ impl Splitter for TreeSitter { Ok(chunks) => chunks, Err(e) => { error!( - "Failed to parse tree for file with error {e:?}. Falling back to default splitter.", + "Failed to parse tree for file with error: {e:?}. Falling back to default splitter.", ); todo!() } @@ -57,14 +57,14 @@ impl Splitter for TreeSitter { Ok(chunks) => chunks, Err(e) => { error!( - "Failed to parse tree for file: {uri} with error {e:?}. Falling back to default splitter.", + "Failed to parse tree for file: {uri} with error: {e:?}. Falling back to default splitter.", ); todo!() } }, Err(e) => { error!( - "Failed to parse tree for file {uri} with error {e:?}. Falling back to default splitter.", + "Failed to parse tree for file {uri} with error: {e:?}. Falling back to default splitter.", ); todo!() } diff --git a/src/template.rs b/crates/lsp-ai/src/template.rs similarity index 100% rename from src/template.rs rename to crates/lsp-ai/src/template.rs diff --git a/src/transformer_backends/anthropic.rs b/crates/lsp-ai/src/transformer_backends/anthropic.rs similarity index 100% rename from src/transformer_backends/anthropic.rs rename to crates/lsp-ai/src/transformer_backends/anthropic.rs diff --git a/src/transformer_backends/gemini.rs b/crates/lsp-ai/src/transformer_backends/gemini.rs similarity index 100% rename from src/transformer_backends/gemini.rs rename to crates/lsp-ai/src/transformer_backends/gemini.rs diff --git a/src/transformer_backends/llama_cpp/mod.rs b/crates/lsp-ai/src/transformer_backends/llama_cpp/mod.rs similarity index 100% rename from src/transformer_backends/llama_cpp/mod.rs rename to crates/lsp-ai/src/transformer_backends/llama_cpp/mod.rs diff --git a/src/transformer_backends/llama_cpp/model.rs b/crates/lsp-ai/src/transformer_backends/llama_cpp/model.rs similarity index 100% rename from src/transformer_backends/llama_cpp/model.rs rename to crates/lsp-ai/src/transformer_backends/llama_cpp/model.rs diff --git a/src/transformer_backends/mistral_fim.rs b/crates/lsp-ai/src/transformer_backends/mistral_fim.rs similarity index 100% rename from src/transformer_backends/mistral_fim.rs rename to crates/lsp-ai/src/transformer_backends/mistral_fim.rs diff --git a/src/transformer_backends/mod.rs b/crates/lsp-ai/src/transformer_backends/mod.rs similarity index 100% rename from src/transformer_backends/mod.rs rename to crates/lsp-ai/src/transformer_backends/mod.rs diff --git a/src/transformer_backends/ollama.rs b/crates/lsp-ai/src/transformer_backends/ollama.rs similarity index 100% rename from src/transformer_backends/ollama.rs rename to crates/lsp-ai/src/transformer_backends/ollama.rs diff --git a/src/transformer_backends/open_ai/mod.rs b/crates/lsp-ai/src/transformer_backends/open_ai/mod.rs similarity index 100% rename from src/transformer_backends/open_ai/mod.rs rename to crates/lsp-ai/src/transformer_backends/open_ai/mod.rs diff --git a/src/transformer_worker.rs b/crates/lsp-ai/src/transformer_worker.rs similarity index 100% rename from src/transformer_worker.rs rename to crates/lsp-ai/src/transformer_worker.rs diff --git a/src/utils.rs b/crates/lsp-ai/src/utils.rs similarity index 100% rename from src/utils.rs rename to crates/lsp-ai/src/utils.rs diff --git a/crates/splitter-tree-sitter b/crates/splitter-tree-sitter new file mode 160000 index 0000000..37a2e98 --- /dev/null +++ b/crates/splitter-tree-sitter @@ -0,0 +1 @@ +Subproject commit 37a2e98cce5a1b39f07aec7e5b3bc75eebb41ac2 diff --git 
a/crates/utils-tree-sitter b/crates/utils-tree-sitter new file mode 160000 index 0000000..a38e714 --- /dev/null +++ b/crates/utils-tree-sitter @@ -0,0 +1 @@ +Subproject commit a38e7143bcab2412348fd92904cc5105117896a1 From cbe487ca3a646694c8613f0d35934dd9cbc7c342 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Sun, 16 Jun 2024 19:33:09 -0700 Subject: [PATCH 07/18] Almost working RAG --- Cargo.lock | 1 - crates/lsp-ai/Cargo.toml | 2 +- crates/lsp-ai/src/config.rs | 10 ++++- crates/lsp-ai/src/crawl.rs | 13 +++++- .../src/memory_backends/postgresml/mod.rs | 42 +++++++++++-------- crates/lsp-ai/src/splitters/mod.rs | 4 ++ crates/lsp-ai/src/splitters/text_splitter.rs | 40 ++++++++++++++++++ crates/lsp-ai/src/splitters/tree_sitter.rs | 15 +++---- 8 files changed, 97 insertions(+), 30 deletions(-) create mode 100644 crates/lsp-ai/src/splitters/text_splitter.rs diff --git a/Cargo.lock b/Cargo.lock index 20cdfff..9d6149a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3182,7 +3182,6 @@ dependencies = [ "regex", "strum", "thiserror", - "tree-sitter", "unicode-segmentation", ] diff --git a/crates/lsp-ai/Cargo.toml b/crates/lsp-ai/Cargo.toml index 1379ac0..bdb6223 100644 --- a/crates/lsp-ai/Cargo.toml +++ b/crates/lsp-ai/Cargo.toml @@ -35,7 +35,7 @@ async-trait = "0.1.78" tree-sitter = "0.22" utils-tree-sitter = { workspace = true, features = ["all"] } splitter-tree-sitter = { workspace = true } -text-splitter = { version = "0.13.3", features = ["code"] } +text-splitter = { version = "0.13.3" } [build-dependencies] cc="*" diff --git a/crates/lsp-ai/src/config.rs b/crates/lsp-ai/src/config.rs index 49f8e54..0c2e764 100644 --- a/crates/lsp-ai/src/config.rs +++ b/crates/lsp-ai/src/config.rs @@ -28,6 +28,8 @@ impl Default for PostProcess { pub enum ValidSplitter { #[serde(rename = "tree_sitter")] TreeSitter(TreeSitter), + #[serde(rename = "text_sitter")] + TextSplitter(TextSplitter), } impl Default for ValidSplitter { @@ -61,6 +63,12 @@ impl Default for TreeSitter { } } +#[derive(Debug, Clone, Deserialize)] +pub struct TextSplitter { + #[serde(default = "chunk_size_default")] + pub chunk_size: usize, +} + #[derive(Debug, Clone, Deserialize)] pub enum ValidMemoryBackend { #[serde(rename = "file_store")] @@ -123,7 +131,7 @@ pub struct FIM { } const fn max_crawl_memory_default() -> u64 { - 42 + 100_000_000 } const fn max_crawl_file_size_default() -> u64 { diff --git a/crates/lsp-ai/src/crawl.rs b/crates/lsp-ai/src/crawl.rs index beade33..edbd56c 100644 --- a/crates/lsp-ai/src/crawl.rs +++ b/crates/lsp-ai/src/crawl.rs @@ -8,6 +8,7 @@ pub struct Crawl { crawl_config: config::Crawl, config: Config, crawled_file_types: HashSet, + crawled_all: bool, } impl Crawl { @@ -16,6 +17,7 @@ impl Crawl { crawl_config, config, crawled_file_types: HashSet::new(), + crawled_all: false, } } @@ -25,6 +27,10 @@ impl Crawl { triggered_file: Option, mut f: impl FnMut(&config::Crawl, &str) -> anyhow::Result, ) -> anyhow::Result<()> { + if self.crawled_all { + return Ok(()); + } + if let Some(root_uri) = &self.config.client_params.root_uri { if !root_uri.starts_with("file://") { anyhow::bail!("Skipping crawling as root_uri does not begin with file://") @@ -51,13 +57,14 @@ impl Crawl { for result in WalkBuilder::new(&root_uri[7..]).build() { let result = result?; let path = result.path(); + eprintln!("CRAWLING: {}", path.display()); if !path.is_dir() { if let Some(path_str) = path.to_str() { if self.crawl_config.all_files { match f(&self.crawl_config, path_str) { Ok(c) => { if !c { - 
return Ok(()); + break; } } Err(e) => error!("{e:?}"), @@ -72,7 +79,7 @@ impl Crawl { match f(&self.crawl_config, path_str) { Ok(c) => { if !c { - return Ok(()); + break; } } Err(e) => error!("{e:?}"), @@ -88,6 +95,8 @@ impl Crawl { if let Some(extension_to_match) = extension_to_match { self.crawled_file_types.insert(extension_to_match); + } else { + self.crawled_all = true } } Ok(()) diff --git a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs index 2281efc..a849aa8 100644 --- a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs +++ b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs @@ -245,7 +245,7 @@ impl PostgresML { fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { if let Some(crawl) = &self.crawl { - let mut documents: Vec<(String, Vec)> = vec![]; + let mut documents = vec![]; let mut total_bytes = 0; let mut current_bytes = 0; crawl @@ -253,7 +253,7 @@ impl PostgresML { .maybe_do_crawl(triggered_file, |config, path| { // Break if total bytes is over the max crawl memory if total_bytes as u64 >= config.max_crawl_memory { - warn!("Ending crawl early due to `max_crawl_memory` resetraint"); + warn!("Ending crawl early due to `max_crawl_memory` restraint"); return Ok(false); } // This means it has been opened before @@ -274,26 +274,19 @@ impl PostgresML { let contents = String::from_utf8(contents)?; current_bytes += contents.len(); total_bytes += contents.len(); - let chunks = self.splitter.split_file_contents(&uri, &contents); - documents.push((uri, chunks)); + let chunks: Vec = self + .splitter + .split_file_contents(&uri, &contents) + .into_iter() + .map(|chunk| chunk_to_document(&uri, chunk).into()) + .collect(); + documents.extend(chunks); // If we have over 100 mega bytes of data do the upsert if current_bytes >= 100_000_000 || total_bytes as u64 >= config.max_crawl_memory { - // Prepare our chunks - let to_upsert_documents: Vec = - std::mem::take(&mut documents) - .into_iter() - .map(|(uri, chunks)| { - chunks - .into_iter() - .map(|chunk| chunk_to_document(&uri, chunk)) - .collect::>() - }) - .flatten() - .map(|f: Value| f.into()) - .collect(); - // Do the upsert + // Upsert the documents let mut collection = self.collection.clone(); + let to_upsert_documents = std::mem::take(&mut documents); TOKIO_RUNTIME.spawn(async move { if let Err(e) = collection .upsert_documents(to_upsert_documents, None) @@ -309,6 +302,19 @@ impl PostgresML { } Ok(true) })?; + // Upsert any remaining documents + if documents.len() > 0 { + let mut collection = self.collection.clone(); + TOKIO_RUNTIME.spawn(async move { + if let Err(e) = collection + .upsert_documents(documents, None) + .await + .context("PGML - Error upserting changed files") + { + error!("{e}"); + } + }); + } } Ok(()) } diff --git a/crates/lsp-ai/src/splitters/mod.rs b/crates/lsp-ai/src/splitters/mod.rs index ed5c15a..8e310f0 100644 --- a/crates/lsp-ai/src/splitters/mod.rs +++ b/crates/lsp-ai/src/splitters/mod.rs @@ -2,6 +2,7 @@ use serde::Serialize; use crate::{config::ValidSplitter, memory_backends::file_store::File}; +mod text_splitter; mod tree_sitter; #[derive(Serialize)] @@ -48,6 +49,9 @@ impl TryFrom for Box { ValidSplitter::TreeSitter(config) => { Ok(Box::new(tree_sitter::TreeSitter::new(config)?)) } + ValidSplitter::TextSplitter(config) => { + Ok(Box::new(text_splitter::TextSplitter::new(config))) + } } } } diff --git a/crates/lsp-ai/src/splitters/text_splitter.rs b/crates/lsp-ai/src/splitters/text_splitter.rs new file mode 100644 index 
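The `break` fixes above settle the callback contract for `maybe_do_crawl`: the closure returns Ok(true) to keep walking, Ok(false) to stop the crawl early, and per-file errors are logged rather than aborting the whole walk. A rough sketch of that protocol under a byte budget (all names here are illustrative):

// Walk `paths`, stopping cleanly once the crawl budget is exhausted.
fn crawl_with_budget(paths: &[&str], max_bytes: usize) -> anyhow::Result<()> {
    let mut total_bytes = 0usize;
    let mut visit = |path: &str| -> anyhow::Result<bool> {
        let contents = std::fs::read_to_string(path)?;
        total_bytes += contents.len();
        Ok(total_bytes < max_bytes) // false ends the crawl early
    };
    for path in paths {
        match visit(path) {
            Ok(true) => {}
            Ok(false) => break,                            // budget hit
            Err(e) => eprintln!("skipping {path}: {e:?}"), // log and continue
        }
    }
    Ok(())
}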
0000000..d4ad404 --- /dev/null +++ b/crates/lsp-ai/src/splitters/text_splitter.rs @@ -0,0 +1,40 @@ +use crate::{config, memory_backends::file_store::File}; + +use super::{ByteRange, Chunk, Splitter}; + +pub struct TextSplitter { + splitter: text_splitter::TextSplitter, +} + +impl TextSplitter { + pub fn new(config: config::TextSplitter) -> Self { + Self { + splitter: text_splitter::TextSplitter::new(config.chunk_size), + } + } + + pub fn new_with_chunk_size(chunk_size: usize) -> Self { + Self { + splitter: text_splitter::TextSplitter::new(chunk_size), + } + } +} + +impl Splitter for TextSplitter { + fn split(&self, file: &File) -> Vec { + self.split_file_contents("", &file.rope().to_string()) + } + + fn split_file_contents(&self, _uri: &str, contents: &str) -> Vec { + self.splitter + .chunk_indices(contents) + .fold(vec![], |mut acc, (start_byte, text)| { + let end_byte = start_byte + text.len(); + acc.push(Chunk::new( + text.to_string(), + ByteRange::new(start_byte, end_byte), + )); + acc + }) + } +} diff --git a/crates/lsp-ai/src/splitters/tree_sitter.rs b/crates/lsp-ai/src/splitters/tree_sitter.rs index ce44185..a804d86 100644 --- a/crates/lsp-ai/src/splitters/tree_sitter.rs +++ b/crates/lsp-ai/src/splitters/tree_sitter.rs @@ -4,18 +4,19 @@ use tree_sitter::Tree; use crate::{config, memory_backends::file_store::File, utils::parse_tree}; -use super::{ByteRange, Chunk, Splitter}; +use super::{text_splitter::TextSplitter, ByteRange, Chunk, Splitter}; pub struct TreeSitter { - _config: config::TreeSitter, splitter: TreeSitterCodeSplitter, + text_splitter: TextSplitter, } impl TreeSitter { pub fn new(config: config::TreeSitter) -> anyhow::Result { + let text_splitter = TextSplitter::new_with_chunk_size(config.chunk_size); Ok(Self { splitter: TreeSitterCodeSplitter::new(config.chunk_size, config.chunk_overlap)?, - _config: config, + text_splitter, }) } @@ -43,11 +44,11 @@ impl Splitter for TreeSitter { error!( "Failed to parse tree for file with error: {e:?}. Falling back to default splitter.", ); - todo!() + self.text_splitter.split(file) } } } else { - panic!("TreeSitter splitter requires a tree to split") + self.text_splitter.split(file) } } @@ -59,14 +60,14 @@ impl Splitter for TreeSitter { error!( "Failed to parse tree for file: {uri} with error: {e:?}. Falling back to default splitter.", ); - todo!() + self.text_splitter.split_file_contents(uri, contents) } }, Err(e) => { error!( "Failed to parse tree for file {uri} with error: {e:?}. 
Falling back to default splitter.", ); - todo!() + self.text_splitter.split_file_contents(uri, contents) } } } From 3e8c99b237bdd56bc92cfeaa9bb75edb03ec3e0c Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 18 Jun 2024 20:03:10 -0700 Subject: [PATCH 08/18] Working PostgresML backend with resyncing --- Cargo.lock | 1 + crates/lsp-ai/Cargo.toml | 1 + crates/lsp-ai/src/crawl.rs | 1 - .../src/memory_backends/postgresml/mod.rs | 128 ++++++++++++++++-- 4 files changed, 118 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9d6149a..3bcb32c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1569,6 +1569,7 @@ dependencies = [ "llama-cpp-2", "lsp-server", "lsp-types", + "md5", "minijinja", "once_cell", "parking_lot", diff --git a/crates/lsp-ai/Cargo.toml b/crates/lsp-ai/Cargo.toml index bdb6223..76e882b 100644 --- a/crates/lsp-ai/Cargo.toml +++ b/crates/lsp-ai/Cargo.toml @@ -36,6 +36,7 @@ tree-sitter = "0.22" utils-tree-sitter = { workspace = true, features = ["all"] } splitter-tree-sitter = { workspace = true } text-splitter = { version = "0.13.3" } +md5 = "0.7.0" [build-dependencies] cc="*" diff --git a/crates/lsp-ai/src/crawl.rs b/crates/lsp-ai/src/crawl.rs index edbd56c..2dc1721 100644 --- a/crates/lsp-ai/src/crawl.rs +++ b/crates/lsp-ai/src/crawl.rs @@ -57,7 +57,6 @@ impl Crawl { for result in WalkBuilder::new(&root_uri[7..]).build() { let result = result?; let path = result.path(); - eprintln!("CRAWLING: {}", path.display()); if !path.is_dir() { if let Some(path_str) = path.to_str() { if self.crawl_config.all_files { diff --git a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs index a849aa8..c2ea435 100644 --- a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs +++ b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs @@ -2,6 +2,7 @@ use anyhow::Context; use lsp_types::TextDocumentPositionParams; use parking_lot::Mutex; use pgml::{Collection, Pipeline}; +use rand::{distributions::Alphanumeric, Rng}; use serde_json::{json, Value}; use std::{ io::Read, @@ -26,6 +27,8 @@ use super::{ ContextAndCodePrompt, FIMPrompt, MemoryBackend, MemoryRunParams, Prompt, PromptType, }; +const RESYNC_MAX_FILE_SIZE: u64 = 10_000_000; + fn chunk_to_document(uri: &str, chunk: Chunk) -> Value { json!({ "id": chunk_to_id(uri, &chunk), @@ -94,11 +97,21 @@ impl PostgresML { let database_url = if let Some(database_url) = postgresml_config.database_url { database_url } else { - std::env::var("PGML_DATABASE_URL")? + std::env::var("PGML_DATABASE_URL").context("please provide either the `database_url` in the `postgresml` config, or set the `PGML_DATABASE_URL` environment variable")? 
}; - // TODO: Think through Collections and Pipelines - let mut collection = Collection::new("test-lsp-ai-5", Some(database_url))?; + let collection_name = match configuration.client_params.root_uri.clone() { + Some(root_uri) => format!("{:x}", md5::compute(root_uri.as_bytes())), + None => { + warn!("no root_uri provided in server configuration - generating random string for collection name"); + rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(21) + .map(char::from) + .collect() + } + }; + let mut collection = Collection::new(&collection_name, Some(database_url))?; let mut pipeline = Pipeline::new( "v1", Some( @@ -145,7 +158,6 @@ impl PostgresML { if file_uris.is_empty() { continue; } - // Build the chunks for our changed files let chunks: Vec> = match file_uris .iter() @@ -160,11 +172,10 @@ impl PostgresML { { Ok(chunks) => chunks, Err(e) => { - error!("{e}"); + error!("{e:?}"); continue; } }; - // Delete old chunks that no longer exist after the latest file changes let delete_or_statements: Vec = file_uris .iter() @@ -196,10 +207,10 @@ impl PostgresML { .into(), ) .await + .context("PGML - error deleting documents") { - error!("PGML - Error deleting file: {e:?}"); + error!("{e:?}"); } - // Prepare and upsert our new chunks let documents: Vec = chunks .into_iter() @@ -218,7 +229,7 @@ impl PostgresML { .await .context("PGML - Error upserting changed files") { - error!("{e}"); + error!("{e:?}"); continue; } @@ -237,12 +248,105 @@ impl PostgresML { splitter, }; + // Resync our Collection + let task_s = s.clone(); + TOKIO_RUNTIME.spawn(async move { + if let Err(e) = task_s.resync().await { + error!("{e:?}") + } + }); + if let Err(e) = s.maybe_do_crawl(None) { - error!("{e}") + error!("{e:?}") } Ok(s) } + async fn resync(&self) -> anyhow::Result<()> { + let mut collection = self.collection.clone(); + + let documents = collection + .get_documents(Some( + json!({ + "limit": 100_000_000, + "keys": ["uri"] + }) + .into(), + )) + .await?; + + let try_get_file_contents = |path: &std::path::Path| { + // Open the file and see if it is small enough to read + let mut f = std::fs::File::open(path)?; + let metadata = f.metadata()?; + if metadata.len() > RESYNC_MAX_FILE_SIZE { + anyhow::bail!("file size is greater than: {RESYNC_MAX_FILE_SIZE}") + } + // Read the file contents + let mut contents = vec![]; + f.read_to_end(&mut contents)?; + anyhow::Ok(String::from_utf8(contents)?) + }; + + let mut documents_to_delete = vec![]; + let mut chunks_to_upsert = vec![]; + let mut current_chunks_bytes = 0; + for document in documents.into_iter() { + let uri = match document["document"]["uri"].as_str() { + Some(uri) => uri, + None => continue, // This should never happen, but is really bad as we now have a document with essentially no way to delete it + }; + + let path = uri.replace("file://", ""); + let path = std::path::Path::new(&path); + if !path.exists() { + documents_to_delete.push(uri.to_string()); + } else { + // Try to read the file. 
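Hashing the workspace root into the collection name, as the hunk above does, gives each project a stable, reusable collection, while a random name is generated when the client sends no rootUri. A condensed sketch using the same `md5` and `rand` crates the patch adds:

use rand::{distributions::Alphanumeric, Rng};

// Deterministic per-workspace name, with a random fallback when the client
// sent no rootUri (those collections are effectively throwaway).
fn collection_name(root_uri: Option<&str>) -> String {
    match root_uri {
        Some(root_uri) => format!("{:x}", md5::compute(root_uri.as_bytes())),
        None => rand::thread_rng()
            .sample_iter(&Alphanumeric)
            .take(21)
            .map(char::from)
            .collect(),
    }
}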
If we fail delete it + let contents = match try_get_file_contents(path) { + Ok(contents) => contents, + Err(e) => { + error!("{e:?}"); + documents_to_delete.push(uri.to_string()); + continue; + } + }; + // Split the file into chunks + current_chunks_bytes += contents.len(); + let chunks: Vec = self + .splitter + .split_file_contents(&uri, &contents) + .into_iter() + .map(|chunk| chunk_to_document(&uri, chunk).into()) + .collect(); + chunks_to_upsert.extend(chunks); + // If we have over 10 mega bytes of chunks do the upsert + if current_chunks_bytes > 10_000_000 { + collection + .upsert_documents(chunks_to_upsert, None) + .await + .context("PGML - error upserting documents during resync")?; + } + chunks_to_upsert = vec![]; + } + } + // Delete documents + if !documents_to_delete.is_empty() { + collection + .delete_documents( + json!({ + "uri": { + "$in": documents_to_delete + } + }) + .into(), + ) + .await + .context("PGML - error deleting documents during resync")?; + } + Ok(()) + } + fn maybe_do_crawl(&self, triggered_file: Option) -> anyhow::Result<()> { if let Some(crawl) = &self.crawl { let mut documents = vec![]; @@ -281,8 +385,8 @@ impl PostgresML { .map(|chunk| chunk_to_document(&uri, chunk).into()) .collect(); documents.extend(chunks); - // If we have over 100 mega bytes of data do the upsert - if current_bytes >= 100_000_000 || total_bytes as u64 >= config.max_crawl_memory + // If we have over 10 mega bytes of data do the upsert + if current_bytes >= 10_000_000 || total_bytes as u64 >= config.max_crawl_memory { // Upsert the documents let mut collection = self.collection.clone(); From 9166aaf4b699b29b8fd67a9f4f30ad9b12573f49 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 19 Jun 2024 12:14:56 -0700 Subject: [PATCH 09/18] Getting closer --- crates/lsp-ai/src/config.rs | 2 + .../lsp-ai/src/memory_backends/file_store.rs | 29 ++- .../src/memory_backends/postgresml/mod.rs | 169 ++++++++++++++---- crates/lsp-ai/src/splitters/mod.rs | 2 + crates/lsp-ai/src/splitters/text_splitter.rs | 7 + crates/lsp-ai/src/splitters/tree_sitter.rs | 6 + .../src/transformer_backends/open_ai/mod.rs | 6 + 7 files changed, 186 insertions(+), 35 deletions(-) diff --git a/crates/lsp-ai/src/config.rs b/crates/lsp-ai/src/config.rs index 0c2e764..92ef755 100644 --- a/crates/lsp-ai/src/config.rs +++ b/crates/lsp-ai/src/config.rs @@ -156,6 +156,8 @@ pub struct PostgresML { pub crawl: Option, #[serde(default)] pub splitter: ValidSplitter, + pub embedding_model: Option, + pub embedding_model_parameters: Option, } #[derive(Clone, Debug, Deserialize, Default)] diff --git a/crates/lsp-ai/src/memory_backends/file_store.rs b/crates/lsp-ai/src/memory_backends/file_store.rs index f02e3a2..ec50151 100644 --- a/crates/lsp-ai/src/memory_backends/file_store.rs +++ b/crates/lsp-ai/src/memory_backends/file_store.rs @@ -1,6 +1,6 @@ use anyhow::Context; use indexmap::IndexSet; -use lsp_types::TextDocumentPositionParams; +use lsp_types::{Position, TextDocumentPositionParams}; use parking_lot::Mutex; use ropey::Rope; use serde_json::Value; @@ -154,6 +154,7 @@ impl FileStore { &self, position: &TextDocumentPositionParams, characters: usize, + pull_from_multiple_files: bool, ) -> anyhow::Result<(Rope, usize)> { // Get the rope and set our initial cursor index let current_document_uri = position.text_document.uri.to_string(); @@ -174,7 +175,7 @@ impl FileStore { .filter(|f| **f != current_document_uri) { let needed = characters.saturating_sub(rope.len_chars() + 1); - if needed == 0 { 
+ if needed == 0 || !pull_from_multiple_files { break; } let file_map = self.file_map.lock(); @@ -220,9 +221,13 @@ impl FileStore { position: &TextDocumentPositionParams, prompt_type: PromptType, params: MemoryRunParams, + pull_from_multiple_files: bool, ) -> anyhow::Result { - let (mut rope, cursor_index) = - self.get_rope_for_position(position, params.max_context_length)?; + let (mut rope, cursor_index) = self.get_rope_for_position( + position, + params.max_context_length, + pull_from_multiple_files, + )?; Ok(match prompt_type { PromptType::ContextAndCode => { @@ -277,6 +282,20 @@ impl FileStore { pub fn contains_file(&self, uri: &str) -> bool { self.file_map.lock().contains_key(uri) } + + pub fn position_to_byte(&self, position: &TextDocumentPositionParams) -> anyhow::Result { + let file_map = self.file_map.lock(); + let uri = position.text_document.uri.to_string(); + let file = file_map + .get(&uri) + .with_context(|| format!("trying to get file that does not exist {uri}"))?; + let line_char_index = file + .rope + .try_line_to_char(position.position.line as usize)?; + Ok(file + .rope + .try_char_to_byte(line_char_index + position.position.character as usize)?) + } } #[async_trait::async_trait] @@ -307,7 +326,7 @@ impl MemoryBackend for FileStore { params: &Value, ) -> anyhow::Result { let params: MemoryRunParams = params.try_into()?; - self.build_code(position, prompt_type, params) + self.build_code(position, prompt_type, params, true) } #[instrument(skip(self))] diff --git a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs index c2ea435..858e29e 100644 --- a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs +++ b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs @@ -29,11 +29,30 @@ use super::{ const RESYNC_MAX_FILE_SIZE: u64 = 10_000_000; -fn chunk_to_document(uri: &str, chunk: Chunk) -> Value { +fn format_chunk_chunk(uri: &str, chunk: &Chunk, root_uri: Option<&str>) -> String { + let path = match root_uri { + Some(root_uri) => { + if uri.starts_with(root_uri) { + &uri[root_uri.chars().count()..] + } else { + uri + } + } + None => uri, + }; + format!( + r#"--{path}-- +{} +"#, + chunk.text + ) +} + +fn chunk_to_document(uri: &str, chunk: Chunk, root_uri: Option<&str>) -> Value { json!({ "id": chunk_to_id(uri, &chunk), "uri": uri, - "text": chunk.text, + "text": format_chunk_chunk(uri, &chunk, root_uri), "range": chunk.range }) } @@ -43,6 +62,7 @@ async fn split_and_upsert_file( collection: &mut Collection, file_store: Arc, splitter: Arc>, + root_uri: Option<&str>, ) -> anyhow::Result<()> { // We need to make sure we don't hold the file_store lock while performing a network call let chunks = { @@ -55,7 +75,7 @@ async fn split_and_upsert_file( let chunks = chunks.with_context(|| format!("file not found for splitting: {uri}"))?; let documents = chunks .into_iter() - .map(|chunk| chunk_to_document(uri, chunk).into()) + .map(|chunk| chunk_to_document(uri, chunk, root_uri).into()) .collect(); collection .upsert_documents(documents, None) @@ -65,7 +85,7 @@ async fn split_and_upsert_file( #[derive(Clone)] pub struct PostgresML { - _config: Config, + config: Config, file_store: Arc, collection: Collection, pipeline: Pipeline, @@ -100,21 +120,19 @@ impl PostgresML { std::env::var("PGML_DATABASE_URL").context("please provide either the `database_url` in the `postgresml` config, or set the `PGML_DATABASE_URL` environment variable")? 
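`position_to_byte` above leans on ropey's checked conversions to turn an LSP line/character position into a byte offset. A standalone sketch of the same mapping (it assumes the client's character offsets line up with ropey's char indices):

use ropey::Rope;

// Map an LSP line/character position to a byte offset in the rope.
fn position_to_byte(rope: &Rope, line: usize, character: usize) -> anyhow::Result<usize> {
    let line_start_char = rope.try_line_to_char(line)?;
    Ok(rope.try_char_to_byte(line_start_char + character)?)
}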
}; - let collection_name = match configuration.client_params.root_uri.clone() { - Some(root_uri) => format!("{:x}", md5::compute(root_uri.as_bytes())), - None => { - warn!("no root_uri provided in server configuration - generating random string for collection name"); - rand::thread_rng() - .sample_iter(&Alphanumeric) - .take(21) - .map(char::from) - .collect() + // Build our pipeline schema + let pipeline = match postgresml_config.embedding_model { + Some(embedding_model) => { + json!({ + "text": { + "semantic_search": { + "model": embedding_model, + "parameters": postgresml_config.embedding_model_parameters + } + } + }) } - }; - let mut collection = Collection::new(&collection_name, Some(database_url))?; - let mut pipeline = Pipeline::new( - "v1", - Some( + None => { json!({ "text": { "semantic_search": { @@ -125,16 +143,36 @@ impl PostgresML { } } }) - .into(), + } + }; + + // When building the collection name we include the Pipeline schema + // If the user changes the Pipeline schema, it will take affect without them having to delete the old files + let collection_name = match configuration.client_params.root_uri.clone() { + Some(root_uri) => format!( + "{:x}", + md5::compute( + format!("{root_uri}_{}", serde_json::to_string(&pipeline)?).as_bytes() + ) ), - )?; + None => { + warn!("no root_uri provided in server configuration - generating random string for collection name"); + rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(21) + .map(char::from) + .collect() + } + }; + let mut collection = Collection::new(&collection_name, Some(database_url))?; + let mut pipeline = Pipeline::new("v1", Some(pipeline.into()))?; // Add the Pipeline to the Collection TOKIO_RUNTIME.block_on(async { collection .add_pipeline(&mut pipeline) .await - .context("PGML - Error adding pipeline to collection") + .context("PGML - error adding pipeline to collection") })?; // Setup up a debouncer for changed text documents @@ -142,6 +180,7 @@ impl PostgresML { let mut task_collection = collection.clone(); let task_file_store = file_store.clone(); let task_splitter = splitter.clone(); + let task_root_uri = configuration.client_params.root_uri.clone(); TOKIO_RUNTIME.spawn(async move { let duration = Duration::from_millis(500); let mut file_uris = Vec::new(); @@ -218,7 +257,9 @@ impl PostgresML { .map(|(chunks, uri)| { chunks .into_iter() - .map(|chunk| chunk_to_document(&uri, chunk)) + .map(|chunk| { + chunk_to_document(&uri, chunk, task_root_uri.as_deref()) + }) .collect::>() }) .flatten() @@ -227,7 +268,7 @@ impl PostgresML { if let Err(e) = task_collection .upsert_documents(documents, None) .await - .context("PGML - Error upserting changed files") + .context("PGML - error upserting changed files") { error!("{e:?}"); continue; @@ -239,7 +280,7 @@ impl PostgresML { }); let s = Self { - _config: configuration, + config: configuration, file_store, collection, pipeline, @@ -317,7 +358,14 @@ impl PostgresML { .splitter .split_file_contents(&uri, &contents) .into_iter() - .map(|chunk| chunk_to_document(&uri, chunk).into()) + .map(|chunk| { + chunk_to_document( + &uri, + chunk, + self.config.client_params.root_uri.as_deref(), + ) + .into() + }) .collect(); chunks_to_upsert.extend(chunks); // If we have over 10 mega bytes of chunks do the upsert @@ -326,10 +374,18 @@ impl PostgresML { .upsert_documents(chunks_to_upsert, None) .await .context("PGML - error upserting documents during resync")?; + chunks_to_upsert = vec![]; + current_chunks_bytes = 0; } - chunks_to_upsert = vec![]; } } + // Upsert any remaining chunks + 
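The resync fix above moves the buffer and byte-counter reset inside the flush branch, then flushes whatever remains once the loop ends. The general shape of that pattern, sketched synchronously with an injected flush function (names are illustrative):

// Accumulate items until `budget` bytes are pending, then flush; always
// flush the remainder so the last partial batch is not dropped.
fn flush_in_batches(items: Vec<String>, budget: usize, mut flush: impl FnMut(Vec<String>)) {
    let mut pending = Vec::new();
    let mut pending_bytes = 0;
    for item in items {
        pending_bytes += item.len();
        pending.push(item);
        if pending_bytes > budget {
            flush(std::mem::take(&mut pending));
            pending_bytes = 0; // reset inside the branch, mirroring the fix above
        }
    }
    if !pending.is_empty() {
        flush(pending);
    }
}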
if chunks_to_upsert.len() > 0 { + collection + .upsert_documents(chunks_to_upsert, None) + .await + .context("PGML - error upserting documents during resync")?; + } // Delete documents if !documents_to_delete.is_empty() { collection @@ -382,7 +438,14 @@ impl PostgresML { .splitter .split_file_contents(&uri, &contents) .into_iter() - .map(|chunk| chunk_to_document(&uri, chunk).into()) + .map(|chunk| { + chunk_to_document( + &uri, + chunk, + self.config.client_params.root_uri.as_deref(), + ) + .into() + }) .collect(); documents.extend(chunks); // If we have over 10 mega bytes of data do the upsert @@ -440,17 +503,28 @@ impl MemoryBackend for PostgresML { ) -> anyhow::Result { let params: MemoryRunParams = params.try_into()?; + // TOOD: FIGURE THIS OUT + // let prompt_size = params.max_context_length + // Build the query let query = self .file_store .get_characters_around_position(position, 512)?; - // Get the code around the Cursor + // Build the prompt let mut file_store_params = params.clone(); file_store_params.max_context_length = 512; let code = self .file_store - .build_code(position, prompt_type, file_store_params)?; + .build_code(position, prompt_type, file_store_params, false)?; + + // Get the byte of the cursor + let cursor_byte = self.file_store.position_to_byte(position)?; + eprintln!( + "CURSOR BYTE: {} IN DOCUMENT: {}", + cursor_byte, + position.text_document.uri.to_string() + ); // Get the context let limit = params.max_context_length / 512; @@ -467,6 +541,29 @@ impl MemoryBackend for PostgresML { } } }, + "filter": { + "$or": [ + { + "uri": { + "$ne": position.text_document.uri.to_string() + } + }, + { + "range": { + "start": { + "$gt": cursor_byte + }, + }, + }, + { + "range": { + "end": { + "$lt": cursor_byte + }, + } + } + ] + } }, "limit": limit }) @@ -485,6 +582,8 @@ impl MemoryBackend for PostgresML { .collect::>>()? 
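The filter above keeps the model from being handed the chunk it is currently completing: a result must either come from another file or lie strictly before or after the cursor byte. A sketch of that document filter built with `serde_json`, following the operator syntax the patch itself uses:

use serde_json::{json, Value};

// Allow chunks from other files, or chunks of the current file that do not
// contain the cursor byte.
fn build_search_filter(current_uri: &str, cursor_byte: usize) -> Value {
    json!({
        "$or": [
            { "uri":   { "$ne": current_uri } },
            { "range": { "start": { "$gt": cursor_byte } } },
            { "range": { "end":   { "$lt": cursor_byte } } }
        ]
    })
}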
.join("\n\n"); + eprintln!("THE CONTEXT:\n\n{context}\n\n"); + let chars = tokens_to_estimated_characters(params.max_context_length.saturating_sub(512)); let context = &context[..chars.min(context.len())]; @@ -512,9 +611,17 @@ impl MemoryBackend for PostgresML { let mut collection = self.collection.clone(); let file_store = self.file_store.clone(); let splitter = self.splitter.clone(); + let root_uri = self.config.client_params.root_uri.clone(); TOKIO_RUNTIME.spawn(async move { let uri = params.text_document.uri.to_string(); - if let Err(e) = split_and_upsert_file(&uri, &mut collection, file_store, splitter).await + if let Err(e) = split_and_upsert_file( + &uri, + &mut collection, + file_store, + splitter, + root_uri.as_deref(), + ) + .await { error!("{e:?}") } @@ -544,6 +651,7 @@ impl MemoryBackend for PostgresML { let mut collection = self.collection.clone(); let file_store = self.file_store.clone(); let splitter = self.splitter.clone(); + let root_uri = self.config.client_params.root_uri.clone(); TOKIO_RUNTIME.spawn(async move { for file in params.files { if let Err(e) = collection @@ -564,6 +672,7 @@ impl MemoryBackend for PostgresML { &mut collection, file_store.clone(), splitter.clone(), + root_uri.as_deref(), ) .await { diff --git a/crates/lsp-ai/src/splitters/mod.rs b/crates/lsp-ai/src/splitters/mod.rs index 8e310f0..72db6b7 100644 --- a/crates/lsp-ai/src/splitters/mod.rs +++ b/crates/lsp-ai/src/splitters/mod.rs @@ -39,6 +39,8 @@ pub trait Splitter { fn does_use_tree_sitter(&self) -> bool { false } + + fn chunk_size(&self) -> usize; } impl TryFrom for Box { diff --git a/crates/lsp-ai/src/splitters/text_splitter.rs b/crates/lsp-ai/src/splitters/text_splitter.rs index d4ad404..9b280a1 100644 --- a/crates/lsp-ai/src/splitters/text_splitter.rs +++ b/crates/lsp-ai/src/splitters/text_splitter.rs @@ -3,18 +3,21 @@ use crate::{config, memory_backends::file_store::File}; use super::{ByteRange, Chunk, Splitter}; pub struct TextSplitter { + chunk_size: usize, splitter: text_splitter::TextSplitter, } impl TextSplitter { pub fn new(config: config::TextSplitter) -> Self { Self { + chunk_size: config.chunk_size, splitter: text_splitter::TextSplitter::new(config.chunk_size), } } pub fn new_with_chunk_size(chunk_size: usize) -> Self { Self { + chunk_size, splitter: text_splitter::TextSplitter::new(chunk_size), } } @@ -37,4 +40,8 @@ impl Splitter for TextSplitter { acc }) } + + fn chunk_size(&self) -> usize { + self.chunk_size + } } diff --git a/crates/lsp-ai/src/splitters/tree_sitter.rs b/crates/lsp-ai/src/splitters/tree_sitter.rs index a804d86..dbbb9ce 100644 --- a/crates/lsp-ai/src/splitters/tree_sitter.rs +++ b/crates/lsp-ai/src/splitters/tree_sitter.rs @@ -7,6 +7,7 @@ use crate::{config, memory_backends::file_store::File, utils::parse_tree}; use super::{text_splitter::TextSplitter, ByteRange, Chunk, Splitter}; pub struct TreeSitter { + chunk_size: usize, splitter: TreeSitterCodeSplitter, text_splitter: TextSplitter, } @@ -15,6 +16,7 @@ impl TreeSitter { pub fn new(config: config::TreeSitter) -> anyhow::Result { let text_splitter = TextSplitter::new_with_chunk_size(config.chunk_size); Ok(Self { + chunk_size: config.chunk_size, splitter: TreeSitterCodeSplitter::new(config.chunk_size, config.chunk_overlap)?, text_splitter, }) @@ -75,4 +77,8 @@ impl Splitter for TreeSitter { fn does_use_tree_sitter(&self) -> bool { true } + + fn chunk_size(&self) -> usize { + self.chunk_size + } } diff --git a/crates/lsp-ai/src/transformer_backends/open_ai/mod.rs 
b/crates/lsp-ai/src/transformer_backends/open_ai/mod.rs index d516adf..dd9c98e 100644 --- a/crates/lsp-ai/src/transformer_backends/open_ai/mod.rs +++ b/crates/lsp-ai/src/transformer_backends/open_ai/mod.rs @@ -156,6 +156,12 @@ impl OpenAI { messages: Vec, params: OpenAIRunParams, ) -> anyhow::Result { + eprintln!("\n\n\n\n"); + for message in &messages { + eprintln!("{}:\n{}\n", message.role.to_string(), message.content); + } + eprintln!("\n\n\n\n"); + let client = reqwest::Client::new(); let token = self.get_token()?; let res: OpenAIChatResponse = client From 09f602ee1218b3046726eae06f443ca7b03f6af2 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 21 Jun 2024 08:30:22 -0700 Subject: [PATCH 10/18] Working RAG --- crates/lsp-ai/src/config.rs | 10 ++- .../lsp-ai/src/memory_backends/file_store.rs | 13 ++- crates/lsp-ai/src/memory_backends/mod.rs | 4 +- .../src/memory_backends/postgresml/mod.rs | 86 +++++++++++-------- 4 files changed, 66 insertions(+), 47 deletions(-) diff --git a/crates/lsp-ai/src/config.rs b/crates/lsp-ai/src/config.rs index 92ef755..5d29b66 100644 --- a/crates/lsp-ai/src/config.rs +++ b/crates/lsp-ai/src/config.rs @@ -149,6 +149,13 @@ pub struct Crawl { pub all_files: bool, } +#[derive(Clone, Debug, Deserialize)] +pub struct PostgresMLEmbeddingModel { + pub model: String, + pub embed_parameters: Option, + pub query_parameters: Option, +} + #[derive(Clone, Debug, Deserialize)] #[serde(deny_unknown_fields)] pub struct PostgresML { @@ -156,8 +163,7 @@ pub struct PostgresML { pub crawl: Option, #[serde(default)] pub splitter: ValidSplitter, - pub embedding_model: Option, - pub embedding_model_parameters: Option, + pub embedding_model: Option, } #[derive(Clone, Debug, Deserialize, Default)] diff --git a/crates/lsp-ai/src/memory_backends/file_store.rs b/crates/lsp-ai/src/memory_backends/file_store.rs index ec50151..9f8b805 100644 --- a/crates/lsp-ai/src/memory_backends/file_store.rs +++ b/crates/lsp-ai/src/memory_backends/file_store.rs @@ -223,16 +223,13 @@ impl FileStore { params: MemoryRunParams, pull_from_multiple_files: bool, ) -> anyhow::Result { - let (mut rope, cursor_index) = self.get_rope_for_position( - position, - params.max_context_length, - pull_from_multiple_files, - )?; + let (mut rope, cursor_index) = + self.get_rope_for_position(position, params.max_context, pull_from_multiple_files)?; Ok(match prompt_type { PromptType::ContextAndCode => { if params.is_for_chat { - let max_length = tokens_to_estimated_characters(params.max_context_length); + let max_length = tokens_to_estimated_characters(params.max_context); let start = cursor_index.saturating_sub(max_length / 2); let end = rope .len_chars() @@ -248,7 +245,7 @@ impl FileStore { )) } else { let start = cursor_index - .saturating_sub(tokens_to_estimated_characters(params.max_context_length)); + .saturating_sub(tokens_to_estimated_characters(params.max_context)); let rope_slice = rope .get_slice(start..cursor_index) .context("Error getting rope slice")?; @@ -259,7 +256,7 @@ impl FileStore { } } PromptType::FIM => { - let max_length = tokens_to_estimated_characters(params.max_context_length); + let max_length = tokens_to_estimated_characters(params.max_context); let start = cursor_index.saturating_sub(max_length / 2); let end = rope .len_chars() diff --git a/crates/lsp-ai/src/memory_backends/mod.rs b/crates/lsp-ai/src/memory_backends/mod.rs index 6b54cff..9d6fcc5 100644 --- a/crates/lsp-ai/src/memory_backends/mod.rs +++ b/crates/lsp-ai/src/memory_backends/mod.rs 
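With `PostgresMLEmbeddingModel` above, the embedding model plus its embed-time and query-time parameters become user configuration. A hypothetical `initializationOptions` fragment exercising the new fields (the `postgres_ml` key, the model name, and the prompt strings are assumptions for illustration, not taken from the patch):

// Hypothetical client configuration for the postgresml memory backend.
fn example_memory_config() -> serde_json::Value {
    serde_json::json!({
        "memory": {
            "postgres_ml": {
                "database_url": "postgres://user:pass@localhost:5432/lsp_ai",
                "embedding_model": {
                    "model": "intfloat/e5-small-v2",
                    "embed_parameters": { "prompt": "passage: " },
                    "query_parameters": { "prompt": "query: " }
                }
            }
        }
    })
}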
@@ -18,13 +18,13 @@ pub enum PromptType { #[derive(Clone)] pub struct MemoryRunParams { pub is_for_chat: bool, - pub max_context_length: usize, + pub max_context: usize, } impl From<&Value> for MemoryRunParams { fn from(value: &Value) -> Self { Self { - max_context_length: value["max_context_length"].as_u64().unwrap_or(1024) as usize, + max_context: value["max_context"].as_u64().unwrap_or(1024) as usize, // messages are for most backends, contents are for Gemini is_for_chat: value["messages"].is_array() || value["contents"].is_array(), } diff --git a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs index 858e29e..2c08065 100644 --- a/crates/lsp-ai/src/memory_backends/postgresml/mod.rs +++ b/crates/lsp-ai/src/memory_backends/postgresml/mod.rs @@ -5,6 +5,7 @@ use pgml::{Collection, Pipeline}; use rand::{distributions::Alphanumeric, Rng}; use serde_json::{json, Value}; use std::{ + collections::HashSet, io::Read, sync::{ mpsc::{self, Sender}, @@ -29,7 +30,7 @@ use super::{ const RESYNC_MAX_FILE_SIZE: u64 = 10_000_000; -fn format_chunk_chunk(uri: &str, chunk: &Chunk, root_uri: Option<&str>) -> String { +fn format_file_excerpt(uri: &str, excerpt: &str, root_uri: Option<&str>) -> String { let path = match root_uri { Some(root_uri) => { if uri.starts_with(root_uri) { @@ -42,9 +43,8 @@ fn format_chunk_chunk(uri: &str, chunk: &Chunk, root_uri: Option<&str>) -> Strin }; format!( r#"--{path}-- -{} +{excerpt} "#, - chunk.text ) } @@ -52,7 +52,7 @@ fn chunk_to_document(uri: &str, chunk: Chunk, root_uri: Option<&str>) -> Value { json!({ "id": chunk_to_id(uri, &chunk), "uri": uri, - "text": format_chunk_chunk(uri, &chunk, root_uri), + "text": format_file_excerpt(uri, &chunk.text, root_uri), "range": chunk.range }) } @@ -86,6 +86,7 @@ async fn split_and_upsert_file( #[derive(Clone)] pub struct PostgresML { config: Config, + postgresml_config: config::PostgresML, file_store: Arc, collection: Collection, pipeline: Pipeline, @@ -106,7 +107,7 @@ impl PostgresML { .map(|x| Arc::new(Mutex::new(Crawl::new(x, configuration.clone())))); let splitter: Arc> = - Arc::new(postgresml_config.splitter.try_into()?); + Arc::new(postgresml_config.splitter.clone().try_into()?); let file_store = Arc::new(FileStore::new_with_params( config::FileStore::new_without_crawl(), @@ -114,20 +115,20 @@ impl PostgresML { AdditionalFileStoreParams::new(splitter.does_use_tree_sitter()), )?); - let database_url = if let Some(database_url) = postgresml_config.database_url { + let database_url = if let Some(database_url) = postgresml_config.database_url.clone() { database_url } else { std::env::var("PGML_DATABASE_URL").context("please provide either the `database_url` in the `postgresml` config, or set the `PGML_DATABASE_URL` environment variable")? 
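`format_file_excerpt` above labels every embedded excerpt with its workspace-relative path, so retrieved context carries provenance. A trimmed sketch of that formatting (this version strips the prefix by byte length, an assumption where the patch counts chars):

// Prefix an excerpt with its path, made relative to the workspace root
// when possible.
fn format_file_excerpt(uri: &str, excerpt: &str, root_uri: Option<&str>) -> String {
    let path = match root_uri {
        Some(root) if uri.starts_with(root) => &uri[root.len()..],
        _ => uri,
    };
    format!("--{path}--\n{excerpt}\n")
}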
}; // Build our pipeline schema - let pipeline = match postgresml_config.embedding_model { + let pipeline = match &postgresml_config.embedding_model { Some(embedding_model) => { json!({ "text": { "semantic_search": { - "model": embedding_model, - "parameters": postgresml_config.embedding_model_parameters + "model": embedding_model.model, + "parameters": embedding_model.embed_parameters } } }) @@ -281,6 +282,7 @@ impl PostgresML { let s = Self { config: configuration, + postgresml_config, file_store, collection, pipeline, @@ -332,12 +334,19 @@ impl PostgresML { let mut documents_to_delete = vec![]; let mut chunks_to_upsert = vec![]; let mut current_chunks_bytes = 0; + let mut checked_uris = HashSet::new(); for document in documents.into_iter() { let uri = match document["document"]["uri"].as_str() { Some(uri) => uri, None => continue, // This should never happen, but is really bad as we now have a document with essentially no way to delete it }; + // Check if we have already loaded in this file + if checked_uris.contains(uri) { + continue; + } + checked_uris.insert(uri.to_string()); + let path = uri.replace("file://", ""); let path = std::path::Path::new(&path); if !path.exists() { @@ -458,9 +467,9 @@ impl PostgresML { if let Err(e) = collection .upsert_documents(to_upsert_documents, None) .await - .context("PGML - Error upserting changed files") + .context("PGML - error upserting changed files") { - error!("{e}"); + error!("{e:?}"); } }); // Reset everything @@ -476,9 +485,9 @@ impl PostgresML { if let Err(e) = collection .upsert_documents(documents, None) .await - .context("PGML - Error upserting changed files") + .context("PGML - error upserting changed files") { - error!("{e}"); + error!("{e:?}"); } }); } @@ -502,32 +511,38 @@ impl MemoryBackend for PostgresML { params: &Value, ) -> anyhow::Result { let params: MemoryRunParams = params.try_into()?; - - // TOOD: FIGURE THIS OUT - // let prompt_size = params.max_context_length + let chunk_size = self.splitter.chunk_size(); + let total_allowed_characters = tokens_to_estimated_characters(params.max_context); // Build the query let query = self .file_store - .get_characters_around_position(position, 512)?; + .get_characters_around_position(position, chunk_size)?; // Build the prompt let mut file_store_params = params.clone(); - file_store_params.max_context_length = 512; + file_store_params.max_context = chunk_size; let code = self .file_store .build_code(position, prompt_type, file_store_params, false)?; // Get the byte of the cursor let cursor_byte = self.file_store.position_to_byte(position)?; - eprintln!( - "CURSOR BYTE: {} IN DOCUMENT: {}", - cursor_byte, - position.text_document.uri.to_string() - ); // Get the context - let limit = params.max_context_length / 512; + let limit = (total_allowed_characters / chunk_size).saturating_sub(1); + let parameters = match self + .postgresml_config + .embedding_model + .as_ref() + .map(|m| m.query_parameters.clone()) + .flatten() + { + Some(query_parameters) => query_parameters, + None => json!({ + "prompt": "query: " + }), + }; let res = self .collection .vector_search_local( @@ -536,9 +551,7 @@ impl MemoryBackend for PostgresML { "fields": { "text": { "query": query, - "parameters": { - "prompt": "query: " - } + "parameters": parameters } }, "filter": { @@ -581,17 +594,20 @@ impl MemoryBackend for PostgresML { }) .collect::>>()? 
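The budgeting above works in estimated characters: the whole window comes from `tokens_to_estimated_characters(params.max_context)`, one chunk-sized slot is reserved for the code around the cursor, and the remainder is divided into chunk-sized retrieval slots. In sketch form (the four-characters-per-token ratio is an assumption, not something the patch defines):

// Estimate how many retrieved chunks fit next to the cursor context.
fn retrieval_limit(max_context_tokens: usize, chunk_size: usize) -> usize {
    let total_chars = max_context_tokens * 4; // rough chars-per-token guess
    (total_chars / chunk_size).saturating_sub(1) // reserve one slot for the cursor window
}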
.join("\n\n"); - - eprintln!("THE CONTEXT:\n\n{context}\n\n"); - - let chars = tokens_to_estimated_characters(params.max_context_length.saturating_sub(512)); - let context = &context[..chars.min(context.len())]; + let context = &context[..(total_allowed_characters - chunk_size).min(context.len())]; // Reconstruct the Prompts Ok(match code { - Prompt::ContextAndCode(context_and_code) => Prompt::ContextAndCode( - ContextAndCodePrompt::new(context.to_owned(), context_and_code.code), - ), + Prompt::ContextAndCode(context_and_code) => { + Prompt::ContextAndCode(ContextAndCodePrompt::new( + context.to_owned(), + format_file_excerpt( + &position.text_document.uri.to_string(), + &context_and_code.code, + self.config.client_params.root_uri.as_deref(), + ), + )) + } Prompt::FIM(fim) => Prompt::FIM(FIMPrompt::new( format!("{context}\n\n{}", fim.prompt), fim.suffix, From a4ce96803467041ebb62d92ebdba28123c9b72e6 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 21 Jun 2024 08:42:57 -0700 Subject: [PATCH 11/18] Rebased and cleaned --- crates/lsp-ai/src/memory_backends/file_store.rs | 2 +- crates/lsp-ai/src/memory_worker.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/lsp-ai/src/memory_backends/file_store.rs b/crates/lsp-ai/src/memory_backends/file_store.rs index 9f8b805..a8828a6 100644 --- a/crates/lsp-ai/src/memory_backends/file_store.rs +++ b/crates/lsp-ai/src/memory_backends/file_store.rs @@ -1,6 +1,6 @@ use anyhow::Context; use indexmap::IndexSet; -use lsp_types::{Position, TextDocumentPositionParams}; +use lsp_types::TextDocumentPositionParams; use parking_lot::Mutex; use ropey::Rope; use serde_json::Value; diff --git a/crates/lsp-ai/src/memory_worker.rs b/crates/lsp-ai/src/memory_worker.rs index b48894c..bea5f85 100644 --- a/crates/lsp-ai/src/memory_worker.rs +++ b/crates/lsp-ai/src/memory_worker.rs @@ -64,7 +64,7 @@ async fn do_build_prompt( memory_backend: Arc>, ) -> anyhow::Result<()> { let prompt = memory_backend - .build_prompt(¶ms.position, params.prompt_type, params.params) + .build_prompt(¶ms.position, params.prompt_type, ¶ms.params) .await?; params .tx From 0dcc35d8019313e0ddd7762679a0c30c874cfa6a Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 21 Jun 2024 08:43:49 -0700 Subject: [PATCH 12/18] Removed eprintlns --- crates/lsp-ai/src/transformer_backends/open_ai/mod.rs | 6 ------ crates/lsp-ai/src/transformer_worker.rs | 1 - 2 files changed, 7 deletions(-) diff --git a/crates/lsp-ai/src/transformer_backends/open_ai/mod.rs b/crates/lsp-ai/src/transformer_backends/open_ai/mod.rs index dd9c98e..d516adf 100644 --- a/crates/lsp-ai/src/transformer_backends/open_ai/mod.rs +++ b/crates/lsp-ai/src/transformer_backends/open_ai/mod.rs @@ -156,12 +156,6 @@ impl OpenAI { messages: Vec, params: OpenAIRunParams, ) -> anyhow::Result { - eprintln!("\n\n\n\n"); - for message in &messages { - eprintln!("{}:\n{}\n", message.role.to_string(), message.content); - } - eprintln!("\n\n\n\n"); - let client = reqwest::Client::new(); let token = self.get_token()?; let res: OpenAIChatResponse = client diff --git a/crates/lsp-ai/src/transformer_worker.rs b/crates/lsp-ai/src/transformer_worker.rs index aff089f..3597aba 100644 --- a/crates/lsp-ai/src/transformer_worker.rs +++ b/crates/lsp-ai/src/transformer_worker.rs @@ -342,7 +342,6 @@ async fn do_completion( // Get the response let mut response = transformer_backend.do_completion(&prompt, params).await?; - eprintln!("\n\n\n\nGOT 
RESPONSE: {}\n\n\n\n", response.insert_text); if let Some(post_process) = config.get_completions_post_process() { response.insert_text = post_process_response(response.insert_text, &prompt, &post_process); From 0ba7571e0f2b123e816a3f0206b6cb15da5d969b Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 21 Jun 2024 08:55:50 -0700 Subject: [PATCH 13/18] Put versions --- Cargo.lock | 2 +- crates/lsp-ai/Cargo.toml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3bcb32c..2f1dd7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1556,7 +1556,7 @@ dependencies = [ [[package]] name = "lsp-ai" -version = "0.2.0" +version = "0.3.0" dependencies = [ "anyhow", "assert_cmd", diff --git a/crates/lsp-ai/Cargo.toml b/crates/lsp-ai/Cargo.toml index 76e882b..ee4a3ee 100644 --- a/crates/lsp-ai/Cargo.toml +++ b/crates/lsp-ai/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lsp-ai" -version = "0.2.0" +version = "0.3.0" description.workspace = true repository.workspace = true @@ -33,8 +33,8 @@ tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] } indexmap = "2.2.5" async-trait = "0.1.78" tree-sitter = "0.22" -utils-tree-sitter = { workspace = true, features = ["all"] } -splitter-tree-sitter = { workspace = true } +utils-tree-sitter = { workspace = true, features = ["all"], version = "0.1.0" } +splitter-tree-sitter = { workspace = true, version = "0.1.0" } text-splitter = { version = "0.13.3" } md5 = "0.7.0" From 444393281ab33de84678963c21b33adea67ad369 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Sat, 22 Jun 2024 09:19:43 -0700 Subject: [PATCH 14/18] Added integration tests --- .../lsp-ai/src/transformer_backends/ollama.rs | 20 +- crates/lsp-ai/tests/integration_tests.rs | 289 ++++++++++++++++++ tests/integration_tests.rs | 112 ------- 3 files changed, 299 insertions(+), 122 deletions(-) create mode 100644 crates/lsp-ai/tests/integration_tests.rs delete mode 100644 tests/integration_tests.rs diff --git a/crates/lsp-ai/src/transformer_backends/ollama.rs b/crates/lsp-ai/src/transformer_backends/ollama.rs index 6f1a6b1..16486bf 100644 --- a/crates/lsp-ai/src/transformer_backends/ollama.rs +++ b/crates/lsp-ai/src/transformer_backends/ollama.rs @@ -67,11 +67,11 @@ impl Ollama { ) -> anyhow::Result { let client = reqwest::Client::new(); let res: OllamaCompletionsResponse = client - .post(self - .configuration - .generate_endpoint - .as_deref() - .unwrap_or("http://localhost:11434/api/generate") + .post( + self.configuration + .generate_endpoint + .as_deref() + .unwrap_or("http://localhost:11434/api/generate"), ) .header("Content-Type", "application/json") .header("Accept", "application/json") @@ -106,11 +106,11 @@ impl Ollama { ) -> anyhow::Result { let client = reqwest::Client::new(); let res: OllamaChatResponse = client - .post(self - .configuration - .chat_endpoint - .as_deref() - .unwrap_or("http://localhost:11434/api/chat") + .post( + self.configuration + .chat_endpoint + .as_deref() + .unwrap_or("http://localhost:11434/api/chat"), ) .header("Content-Type", "application/json") .header("Accept", "application/json") diff --git a/crates/lsp-ai/tests/integration_tests.rs b/crates/lsp-ai/tests/integration_tests.rs new file mode 100644 index 0000000..b523e12 --- /dev/null +++ b/crates/lsp-ai/tests/integration_tests.rs @@ -0,0 +1,289 @@ +use anyhow::Result; +use std::{ + io::{Read, Write}, + process::{ChildStdin, ChildStdout, Command, Stdio}, +}; + +// 
Note if you get an empty response with no error, that typically means
+// the language server died
+fn read_response(stdout: &mut ChildStdout) -> Result<String> {
+    let mut content_length = None;
+    let mut buf = vec![];
+    loop {
+        let mut buf2 = vec![0];
+        stdout.read_exact(&mut buf2)?;
+        buf.push(buf2[0]);
+        if let Some(content_length) = content_length {
+            if buf.len() == content_length {
+                break;
+            }
+        } else {
+            let len = buf.len();
+            if len > 4
+                && buf[len - 4] == 13
+                && buf[len - 3] == 10
+                && buf[len - 2] == 13
+                && buf[len - 1] == 10
+            {
+                content_length =
+                    Some(String::from_utf8(buf[16..len - 4].to_vec())?.parse::<usize>()?);
+                buf = vec![];
+            }
+        }
+    }
+    Ok(String::from_utf8(buf)?)
+}
+
+fn send_message(stdin: &mut ChildStdin, message: &str) -> Result<()> {
+    stdin.write_all(format!("Content-Length: {}\r\n", message.as_bytes().len()).as_bytes())?;
+    stdin.write_all("\r\n".as_bytes())?;
+    stdin.write_all(message.as_bytes())?;
+    Ok(())
+}
+
+// This chat completion sequence was created using helix with lsp-ai and reading the logs
+// It utilizes Ollama with llama3:8b-instruct-q4_0 and a temperature of 0
+// It starts with a Python file:
+// ```
+// # Multiplies two numbers
+// def multiply_two_numbers(x, y):
+//
+// # A singular test
+// assert multiply_two_numbers(2, 3) == 6
+//
+// ```
+// And has the following sequence of key strokes:
+// o on line 2 (this creates an indented new line and enters insert mode)
+// r
+// e
+// t
+// u
+// r
+// n
+// The sequence has:
+// - 1 textDocument/DidOpen notification
+// - 7 textDocument/didChange notifications
+// - 1 textDocument/completion request
+#[test]
+fn test_chat_completion_sequence() -> Result<()> {
+    let mut child = Command::new("cargo")
+        .arg("run")
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .spawn()?;
+
+    let mut stdin = child.stdin.take().unwrap();
+    let mut stdout = child.stdout.take().unwrap();
+
+    let initialization_message =
r##"{"jsonrpc":"2.0","method":"initialize","params":{"capabilities":{"general":{"positionEncodings":["utf-8","utf-32","utf-16"]},"textDocument":{"codeAction":{"codeActionLiteralSupport":{"codeActionKind":{"valueSet":["","quickfix","refactor","refactor.extract","refactor.inline","refactor.rewrite","source","source.organizeImports"]}},"dataSupport":true,"disabledSupport":true,"isPreferredSupport":true,"resolveSupport":{"properties":["edit","command"]}},"completion":{"completionItem":{"deprecatedSupport":true,"insertReplaceSupport":true,"resolveSupport":{"properties":["documentation","detail","additionalTextEdits"]},"snippetSupport":true,"tagSupport":{"valueSet":[1]}},"completionItemKind":{}},"hover":{"contentFormat":["markdown"]},"inlayHint":{"dynamicRegistration":false},"publishDiagnostics":{"tagSupport":{"valueSet":[1,2]},"versionSupport":true},"rename":{"dynamicRegistration":false,"honorsChangeAnnotations":false,"prepareSupport":true},"signatureHelp":{"signatureInformation":{"activeParameterSupport":true,"documentationFormat":["markdown"],"parameterInformation":{"labelOffsetSupport":true}}}},"window":{"workDoneProgress":true},"workspace":{"applyEdit":true,"configuration":true,"didChangeConfiguration":{"dynamicRegistration":false},"didChangeWatchedFiles":{"dynamicRegistration":true,"relativePatternSupport":false},"executeCommand":{"dynamicRegistration":false},"fileOperations":{"didRename":true,"willRename":true},"inlayHint":{"refreshSupport":false},"symbol":{"dynamicRegistration":false},"workspaceEdit":{"documentChanges":true,"failureHandling":"abort","normalizesLineEndings":false,"resourceOperations":["create","rename","delete"]},"workspaceFolders":true}},"clientInfo":{"name":"helix","version":"24.3 (beb5afcb)"},"initializationOptions":{"completion":{"model":"model1","parameters":{"max_context":1024,"messages":[{"content":"Instructions:\n- You are an AI programming assistant.\n- Given a piece of code with the cursor location marked by \"\", replace \"\" with the correct code or comment.\n- First, think step-by-step.\n- Describe your plan for what to build in pseudocode, written out in great detail.\n- Then output the code replacing the \"\"\n- Ensure that your completion fits within the language context of the provided code snippet (e.g., Python, JavaScript, Rust).\n\nRules:\n- Only respond with code or comments.\n- Only replace \"\"; do not include any previously written code.\n- Never include \"\" in your response\n- If the cursor is within a comment, complete the comment meaningfully.\n- Handle ambiguous cases by providing the most contextually appropriate completion.\n- Be consistent with your responses.","role":"system"},{"content":"def greet(name):\n print(f\"Hello, {}\")","role":"user"},{"content":"name","role":"assistant"},{"content":"function sum(a, b) {\n return a + ;\n}","role":"user"},{"content":"b","role":"assistant"},{"content":"fn multiply(a: i32, b: i32) -> i32 {\n a * \n}","role":"user"},{"content":"b","role":"assistant"},{"content":"# \ndef add(a, b):\n return a + b","role":"user"},{"content":"Adds two numbers","role":"assistant"},{"content":"# This function checks if a number is even\n","role":"user"},{"content":"def is_even(n):\n return n % 2 == 0","role":"assistant"},{"content":"{CODE}","role":"user"}],"options":{"num_predict":32,"temperature":0}}},"memory":{"file_store":{}},"models":{"model1":{"model":"llama3:8b-instruct-q4_0","type":"ollama"}}},"processId":66009,"rootPath":"/home/silas/Projects/test","rootUri":null,"workspaceFolders":[]},"id":0}"##; + 
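    // The send_message helper above applies the LSP base-protocol framing:
    // a Content-Length header counted in bytes, a blank line, then the
    // JSON-RPC payload. A sketch of that framing (nested here purely for
    // illustration; the real helper writes directly to the child's stdin):
    fn frame(message: &str) -> String {
        format!("Content-Length: {}\r\n\r\n{}", message.len(), message)
    }
    debug_assert!(frame("{}").starts_with("Content-Length: 2\r\n\r\n"));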
send_message(&mut stdin, initialization_message)?; + let _ = read_response(&mut stdout)?; + + send_message( + &mut stdin, + r#"{"jsonrpc":"2.0","method":"initialized","params":{}}"#, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"languageId":"python","text":"# Multiplies two numbers\ndef multiply_two_numbers(x, y):\n\n# A singular test\nassert multiply_two_numbers(2, 3) == 6\n","uri":"file:///fake.py","version":0}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":31,"line":1},"start":{"character":31,"line":1}},"text":"\n "}],"textDocument":{"uri":"file:///fake.py","version":1}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":4,"line":2},"start":{"character":4,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":2}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":5,"line":2},"start":{"character":5,"line":2}},"text":"e"}],"textDocument":{"uri":"file:///fake.py","version":3}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":6,"line":2},"start":{"character":6,"line":2}},"text":"t"}],"textDocument":{"uri":"file:///fake.py","version":4}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":7,"line":2},"start":{"character":7,"line":2}},"text":"u"}],"textDocument":{"uri":"file:///fake.py","version":5}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":8,"line":2},"start":{"character":8,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":6}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":9,"line":2},"start":{"character":9,"line":2}},"text":"n"}],"textDocument":{"uri":"file:///fake.py","version":7}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/completion","params":{"position":{"character":10,"line":2},"textDocument":{"uri":"file:///fake.py"}},"id":1}"##, + )?; + + let output = read_response(&mut stdout)?; + assert_eq!( + output, + r##"{"jsonrpc":"2.0","id":1,"result":{"isIncomplete":false,"items":[{"filterText":" return","kind":1,"label":"ai - x * y","textEdit":{"newText":"x * y","range":{"end":{"character":10,"line":2},"start":{"character":10,"line":2}}}}]}}"## + ); + + child.kill()?; + Ok(()) +} + +// This FIM completion sequence was created using helix with lsp-ai and reading the logs +// It utilizes Ollama with deepseek-coder:1.3b-base and a temperature of 0 +// It starts with a Python file: +// ``` +// # Multiplies two numbers +// def multiply_two_numbers(x, y): +// +// # A singular test +// assert multiply_two_numbers(2, 3) == 6 +// +// ``` +// And has the following sequence of key strokes: +// o on line 2 (this creates an indented new line and enters insert mode) +// r +// e +// The sequence has: +// - 1 textDocument/DidOpen notification +// - 3 textDocument/didChange notifications +// - 1 textDocument/completion 
requests +#[test] +fn test_fim_completion_sequence() -> Result<()> { + let mut child = Command::new("cargo") + .arg("run") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + + let mut stdin = child.stdin.take().unwrap(); + let mut stdout = child.stdout.take().unwrap(); + + let initialization_message = r##"{"jsonrpc":"2.0","method":"initialize","params":{"capabilities":{"general":{"positionEncodings":["utf-8","utf-32","utf-16"]},"textDocument":{"codeAction":{"codeActionLiteralSupport":{"codeActionKind":{"valueSet":["","quickfix","refactor","refactor.extract","refactor.inline","refactor.rewrite","source","source.organizeImports"]}},"dataSupport":true,"disabledSupport":true,"isPreferredSupport":true,"resolveSupport":{"properties":["edit","command"]}},"completion":{"completionItem":{"deprecatedSupport":true,"insertReplaceSupport":true,"resolveSupport":{"properties":["documentation","detail","additionalTextEdits"]},"snippetSupport":true,"tagSupport":{"valueSet":[1]}},"completionItemKind":{}},"hover":{"contentFormat":["markdown"]},"inlayHint":{"dynamicRegistration":false},"publishDiagnostics":{"tagSupport":{"valueSet":[1,2]},"versionSupport":true},"rename":{"dynamicRegistration":false,"honorsChangeAnnotations":false,"prepareSupport":true},"signatureHelp":{"signatureInformation":{"activeParameterSupport":true,"documentationFormat":["markdown"],"parameterInformation":{"labelOffsetSupport":true}}}},"window":{"workDoneProgress":true},"workspace":{"applyEdit":true,"configuration":true,"didChangeConfiguration":{"dynamicRegistration":false},"didChangeWatchedFiles":{"dynamicRegistration":true,"relativePatternSupport":false},"executeCommand":{"dynamicRegistration":false},"fileOperations":{"didRename":true,"willRename":true},"inlayHint":{"refreshSupport":false},"symbol":{"dynamicRegistration":false},"workspaceEdit":{"documentChanges":true,"failureHandling":"abort","normalizesLineEndings":false,"resourceOperations":["create","rename","delete"]},"workspaceFolders":true}},"clientInfo":{"name":"helix","version":"24.3 (beb5afcb)"},"initializationOptions":{"completion":{"model":"model1","parameters":{"fim":{"end":"<|fim▁end|>","middle":"<|fim▁hole|>","start":"<|fim▁begin|>"},"max_context":1024,"options":{"num_predict":32,"temperature":0}}},"memory":{"file_store":{}},"models":{"model1":{"model":"deepseek-coder:1.3b-base","type":"ollama"}}},"processId":50347,"rootPath":"/home/silas/Projects/test","rootUri":null,"workspaceFolders":[]},"id":0}"##; + send_message(&mut stdin, initialization_message)?; + let _ = read_response(&mut stdout)?; + + send_message( + &mut stdin, + r#"{"jsonrpc":"2.0","method":"initialized","params":{}}"#, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"languageId":"python","text":"# Multiplies two numbers\ndef multiply_two_numbers(x, y):\n\n# A singular test\nassert multiply_two_numbers(2, 3) == 6\n","uri":"file:///fake.py","version":0}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":31,"line":1},"start":{"character":31,"line":1}},"text":"\n "}],"textDocument":{"uri":"file:///fake.py","version":1}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":4,"line":2},"start":{"character":4,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":2}}}"##, + )?; + 
send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":5,"line":2},"start":{"character":5,"line":2}},"text":"e"}],"textDocument":{"uri":"file:///fake.py","version":3}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/completion","params":{"position":{"character":6,"line":2},"textDocument":{"uri":"file:///fake.py"}},"id":1}"##, + )?; + + let output = read_response(&mut stdout)?; + assert_eq!( + output, + r##"{"jsonrpc":"2.0","id":1,"result":{"isIncomplete":false,"items":[{"filterText":" re","kind":1,"label":"ai - turn x * y","textEdit":{"newText":"turn x * y","range":{"end":{"character":6,"line":2},"start":{"character":6,"line":2}}}}]}}"## + ); + + child.kill()?; + Ok(()) +} + +// This completion sequence was created using helix with lsp-ai and reading the logs +// It utilizes Ollama with deepseek-coder:1.3b-base and a temperature of 0 +// It starts with a Python file: +// ``` +// # Multiplies two numbers +// def multiply_two_numbers(x, y): +// +// ``` +// And has the following sequence of key strokes: +// o on line 2 (this creates an indented new line and enters insert mode) +// r +// e +// t +// u +// r +// n +// The sequence has: +// - 1 textDocument/DidOpen notification +// - 7 textDocument/didChange notifications +// - 1 textDocument/completion requests +#[test] +fn test_completion_sequence() -> Result<()> { + let mut child = Command::new("cargo") + .arg("run") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + + let mut stdin = child.stdin.take().unwrap(); + let mut stdout = child.stdout.take().unwrap(); + + let initialization_message = r##"{"jsonrpc":"2.0","method":"initialize","params":{"capabilities":{"general":{"positionEncodings":["utf-8","utf-32","utf-16"]},"textDocument":{"codeAction":{"codeActionLiteralSupport":{"codeActionKind":{"valueSet":["","quickfix","refactor","refactor.extract","refactor.inline","refactor.rewrite","source","source.organizeImports"]}},"dataSupport":true,"disabledSupport":true,"isPreferredSupport":true,"resolveSupport":{"properties":["edit","command"]}},"completion":{"completionItem":{"deprecatedSupport":true,"insertReplaceSupport":true,"resolveSupport":{"properties":["documentation","detail","additionalTextEdits"]},"snippetSupport":true,"tagSupport":{"valueSet":[1]}},"completionItemKind":{}},"hover":{"contentFormat":["markdown"]},"inlayHint":{"dynamicRegistration":false},"publishDiagnostics":{"tagSupport":{"valueSet":[1,2]},"versionSupport":true},"rename":{"dynamicRegistration":false,"honorsChangeAnnotations":false,"prepareSupport":true},"signatureHelp":{"signatureInformation":{"activeParameterSupport":true,"documentationFormat":["markdown"],"parameterInformation":{"labelOffsetSupport":true}}}},"window":{"workDoneProgress":true},"workspace":{"applyEdit":true,"configuration":true,"didChangeConfiguration":{"dynamicRegistration":false},"didChangeWatchedFiles":{"dynamicRegistration":true,"relativePatternSupport":false},"executeCommand":{"dynamicRegistration":false},"fileOperations":{"didRename":true,"willRename":true},"inlayHint":{"refreshSupport":false},"symbol":{"dynamicRegistration":false},"workspaceEdit":{"documentChanges":true,"failureHandling":"abort","normalizesLineEndings":false,"resourceOperations":["create","rename","delete"]},"workspaceFolders":true}},"clientInfo":{"name":"helix","version":"24.3 
(beb5afcb)"},"initializationOptions":{"completion":{"model":"model1","parameters":{"max_context":1024,"options":{"num_predict":32,"temperature":0}}},"memory":{"file_store":{}},"models":{"model1":{"model":"deepseek-coder:1.3b-base","type":"ollama"}}},"processId":62322,"rootPath":"/home/silas/Projects/test","rootUri":null,"workspaceFolders":[]},"id":0}"##; + send_message(&mut stdin, initialization_message)?; + let _ = read_response(&mut stdout)?; + + send_message( + &mut stdin, + r#"{"jsonrpc":"2.0","method":"initialized","params":{}}"#, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"languageId":"python","text":"# Multiplies two numbers\ndef multiply_two_numbers(x, y):\n","uri":"file:///fake.py","version":0}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":31,"line":1},"start":{"character":31,"line":1}},"text":"\n "}],"textDocument":{"uri":"file:///fake.py","version":1}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":4,"line":2},"start":{"character":4,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":2}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":5,"line":2},"start":{"character":5,"line":2}},"text":"e"}],"textDocument":{"uri":"file:///fake.py","version":3}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":6,"line":2},"start":{"character":6,"line":2}},"text":"t"}],"textDocument":{"uri":"file:///fake.py","version":4}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":7,"line":2},"start":{"character":7,"line":2}},"text":"u"}],"textDocument":{"uri":"file:///fake.py","version":5}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":8,"line":2},"start":{"character":8,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":6}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":9,"line":2},"start":{"character":9,"line":2}},"text":"n"}],"textDocument":{"uri":"file:///fake.py","version":7}}}"##, + )?; + send_message( + &mut stdin, + r##"{"jsonrpc":"2.0","method":"textDocument/completion","params":{"position":{"character":10,"line":2},"textDocument":{"uri":"file:///fake.py"}},"id":1}"##, + )?; + + let output = read_response(&mut stdout)?; + assert_eq!( + output, + r##"{"jsonrpc":"2.0","id":1,"result":{"isIncomplete":false,"items":[{"filterText":" return","kind":1,"label":"ai - x * y","textEdit":{"newText":" x * y","range":{"end":{"character":10,"line":2},"start":{"character":10,"line":2}}}}]}}"## + ); + + child.kill()?; + Ok(()) +} diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs deleted file mode 100644 index 913bdef..0000000 --- a/tests/integration_tests.rs +++ /dev/null @@ -1,112 +0,0 @@ -// use anyhow::Result; -// use std::{ -// io::{Read, Write}, -// process::{ChildStdin, ChildStdout, Command, Stdio}, -// }; - -// // Note if you get an 
empty response with no error, that typically means -// // the language server died -// fn read_response(stdout: &mut ChildStdout) -> Result { -// let mut content_length = None; -// let mut buf = vec![]; -// loop { -// let mut buf2 = vec![0]; -// stdout.read_exact(&mut buf2)?; -// buf.push(buf2[0]); -// if let Some(content_length) = content_length { -// if buf.len() == content_length { -// break; -// } -// } else { -// let len = buf.len(); -// if len > 4 -// && buf[len - 4] == 13 -// && buf[len - 3] == 10 -// && buf[len - 2] == 13 -// && buf[len - 1] == 10 -// { -// content_length = -// Some(String::from_utf8(buf[16..len - 4].to_vec())?.parse::()?); -// buf = vec![]; -// } -// } -// } -// Ok(String::from_utf8(buf)?) -// } - -// fn send_message(stdin: &mut ChildStdin, message: &str) -> Result<()> { -// stdin.write_all(format!("Content-Length: {}\r\n", message.as_bytes().len(),).as_bytes())?; -// stdin.write_all("\r\n".as_bytes())?; -// stdin.write_all(message.as_bytes())?; -// Ok(()) -// } - -// // This completion sequence was created using helix with the lsp-ai analyzer and reading the logs -// // It starts with a Python file: -// // ``` -// // # Multiplies two numbers -// // def multiply_two_numbers(x, y): -// // -// // # A singular test -// // assert multiply_two_numbers(2, 3) == 6 -// // ``` -// // And has the following sequence of key strokes: -// // o on line 2 (this creates an indented new line and enters insert mode) -// // r -// // e -// // The sequence has: -// // - 1 textDocument/DidOpen notification -// // - 3 textDocument/didChange notifications -// // - 1 textDocument/completion requests -// // This test can fail if the model gives a different response than normal, but that seems reasonably unlikely -// // I guess we should hardcode the seed or something if we want to do more of these -// #[test] -// fn test_completion_sequence() -> Result<()> { -// // let mut child = Command::new("cargo") -// // .arg("run") -// // .stdin(Stdio::piped()) -// // .stdout(Stdio::piped()) -// // .stderr(Stdio::piped()) -// // .spawn()?; - -// // let mut stdin = child.stdin.take().unwrap(); -// // let mut stdout = child.stdout.take().unwrap(); - -// // let initialization_message = 
r##"{"jsonrpc":"2.0","method":"initialize","params":{"capabilities":{"general":{"positionEncodings":["utf-8","utf-32","utf-16"]},"textDocument":{"codeAction":{"codeActionLiteralSupport":{"codeActionKind":{"valueSet":["","quickfix","refactor","refactor.extract","refactor.inline","refactor.rewrite","source","source.organizeImports"]}},"dataSupport":true,"disabledSupport":true,"isPreferredSupport":true,"resolveSupport":{"properties":["edit","command"]}},"completion":{"completionItem":{"deprecatedSupport":true,"insertReplaceSupport":true,"resolveSupport":{"properties":["documentation","detail","additionalTextEdits"]},"snippetSupport":true,"tagSupport":{"valueSet":[1]}},"completionItemKind":{}},"hover":{"contentFormat":["markdown"]},"inlayHint":{"dynamicRegistration":false},"publishDiagnostics":{"versionSupport":true},"rename":{"dynamicRegistration":false,"honorsChangeAnnotations":false,"prepareSupport":true},"signatureHelp":{"signatureInformation":{"activeParameterSupport":true,"documentationFormat":["markdown"],"parameterInformation":{"labelOffsetSupport":true}}}},"window":{"workDoneProgress":true},"workspace":{"applyEdit":true,"configuration":true,"didChangeConfiguration":{"dynamicRegistration":false},"didChangeWatchedFiles":{"dynamicRegistration":true,"relativePatternSupport":false},"executeCommand":{"dynamicRegistration":false},"inlayHint":{"refreshSupport":false},"symbol":{"dynamicRegistration":false},"workspaceEdit":{"documentChanges":true,"failureHandling":"abort","normalizesLineEndings":false,"resourceOperations":["create","rename","delete"]},"workspaceFolders":true}},"clientInfo":{"name":"helix","version":"23.10 (f6021dd0)"},"processId":70007,"rootPath":"/Users/silas/Projects/Tests/lsp-ai-tests","rootUri":null,"workspaceFolders":[]},"id":0}"##; -// // send_message(&mut stdin, initialization_message)?; -// // let _ = read_response(&mut stdout)?; - -// // send_message( -// // &mut stdin, -// // r#"{"jsonrpc":"2.0","method":"initialized","params":{}}"#, -// // )?; -// // send_message( -// // &mut stdin, -// // r##"{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"languageId":"python","text":"# Multiplies two numbers\ndef multiply_two_numbers(x, y):\n\n# A singular test\nassert multiply_two_numbers(2, 3) == 6\n","uri":"file:///fake.py","version":0}}}"##, -// // )?; -// // send_message( -// // &mut stdin, -// // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":31,"line":1},"start":{"character":31,"line":1}},"text":"\n "}],"textDocument":{"uri":"file:///fake.py","version":1}}}"##, -// // )?; -// // send_message( -// // &mut stdin, -// // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":4,"line":2},"start":{"character":4,"line":2}},"text":"r"}],"textDocument":{"uri":"file:///fake.py","version":2}}}"##, -// // )?; -// // send_message( -// // &mut stdin, -// // r##"{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"contentChanges":[{"range":{"end":{"character":5,"line":2},"start":{"character":5,"line":2}},"text":"e"}],"textDocument":{"uri":"file:///fake.py","version":3}}}"##, -// // )?; -// // send_message( -// // &mut stdin, -// // r##"{"jsonrpc":"2.0","method":"textDocument/completion","params":{"position":{"character":6,"line":2},"textDocument":{"uri":"file:///fake.py"}},"id":1}"##, -// // )?; - -// // let output = read_response(&mut stdout)?; -// // assert_eq!( -// // output, -// // 
r##"{"jsonrpc":"2.0","id":1,"result":{"isIncomplete":false,"items":[{"filterText":" re\n","kind":1,"label":"ai - turn x * y","textEdit":{"newText":"turn x * y","range":{"end":{"character":6,"line":2},"start":{"character":6,"line":2}}}}]}}"## -// // ); - -// // child.kill()?; -// Ok(()) -// } From 2be85968e411170508e494e38b8b37acc74b7d92 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Sat, 22 Jun 2024 10:07:29 -0700 Subject: [PATCH 15/18] Fixed some versioning and warnings --- Cargo.lock | 165 ++++++++++++++++++++++++ Cargo.toml | 4 - crates/lsp-ai/Cargo.toml | 4 +- crates/lsp-ai/src/config.rs | 28 ++-- crates/lsp-ai/src/transformer_worker.rs | 20 +-- 5 files changed, 185 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2f1dd7b..b89368f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3449,6 +3449,156 @@ dependencies = [ "regex", ] +[[package]] +name = "tree-sitter-bash" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5244703ad2e08a616d859a0557d7aa290adcd5e0990188a692e628ffe9dce40" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-c" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f956d5351d62652864a4ff3ae861747e7a1940dc96c9998ae400ac0d3ce30427" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-c-sharp" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff899037068a1ffbb891891b7e94db1400ddf12c3d934b85b8c9e30be5cd18da" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-cpp" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "537b7e0f0d8c89b8dd6f4d195814da94832f20720c09016c2a3ac3dc3c437993" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-css" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2f806f96136762b0121f5fdd7172a3dcd8f42d37a2f23ed7f11b35895e20eb4" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-elixir" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df94bf7f057768b1cab2ee1f14812ed4ae33f9e04d09254043eeaa797db4ef70" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-erlang" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8db61152e6d8a5b3b5895ecbb85848f85d028f84b4633a2368075c35e5817b34" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-go" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cb318be5ccf75f44e054acf6898a5c95d59b53443eed578e16be0cd7ec037f" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-haskell" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef25a7e6c73cc1cbe0c0b7dbd5406e7b3485b370bd61c5d8d852ae0781f9bf9a" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-html" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95b3492b08a786bf5cc79feb0ef2ff3b115d5174364e0ddfd7860e0b9b088b53" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-java" +version = "0.21.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "33bc21adf831a773c075d9d00107ab43965e6a6ea7607b47fd9ec6f3db4b481b" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fced510d43e6627cd8e19adfd994ac9cfa3b1d71b0d522b41f74145de37feef" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-json" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b737dcb73c35d74b7d64a5f3dde158113c86a012bf3cee2bfdf2150d23b05db" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-lua" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b9fe6fc87bd480e1943fc1fcb02453fb2da050e4e8ce0daa67d801544046856" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-ocaml" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2f2e8e848902d12ca6778d31d0e66b5709fc1ad0c84fd8b0c078472fff20dd2" +dependencies = [ + "cc", + "tree-sitter", +] + [[package]] name = "tree-sitter-python" version = "0.21.0" @@ -3596,6 +3746,21 @@ dependencies = [ "cc", "thiserror", "tree-sitter", + "tree-sitter-bash", + "tree-sitter-c", + "tree-sitter-c-sharp", + "tree-sitter-cpp", + "tree-sitter-css", + "tree-sitter-elixir", + "tree-sitter-erlang", + "tree-sitter-go", + "tree-sitter-haskell", + "tree-sitter-html", + "tree-sitter-java", + "tree-sitter-javascript", + "tree-sitter-json", + "tree-sitter-lua", + "tree-sitter-ocaml", "tree-sitter-python", "tree-sitter-rust", "tree-sitter-zig", diff --git a/Cargo.toml b/Cargo.toml index afb3496..0121f0b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,3 @@ license = "MIT" description = "LSP-AI is an open-source language server that serves as a backend for AI-powered functionality, designed to assist and empower software engineers, not replace them." 
repository = "https://github.com/SilasMarvin/lsp-ai" readme = "README.md" - -[workspace.dependencies] -utils-tree-sitter = { path = "./crates/utils-tree-sitter" } -splitter-tree-sitter = { path = "./crates/splitter-tree-sitter" } diff --git a/crates/lsp-ai/Cargo.toml b/crates/lsp-ai/Cargo.toml index ee4a3ee..05ec30b 100644 --- a/crates/lsp-ai/Cargo.toml +++ b/crates/lsp-ai/Cargo.toml @@ -33,8 +33,8 @@ tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] } indexmap = "2.2.5" async-trait = "0.1.78" tree-sitter = "0.22" -utils-tree-sitter = { workspace = true, features = ["all"], version = "0.1.0" } -splitter-tree-sitter = { workspace = true, version = "0.1.0" } +utils-tree-sitter = { path = "../utils-tree-sitter", features = ["all"], version = "0.1.0" } +splitter-tree-sitter = { path = "../splitter-tree-sitter", version = "0.1.0" } text-splitter = { version = "0.13.3" } md5 = "0.7.0" diff --git a/crates/lsp-ai/src/config.rs b/crates/lsp-ai/src/config.rs index 5d29b66..3827a50 100644 --- a/crates/lsp-ai/src/config.rs +++ b/crates/lsp-ai/src/config.rs @@ -112,15 +112,6 @@ impl ChatMessage { } } -#[derive(Debug, Clone, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct Chat { - pub completion: Option>, - pub generation: Option>, - pub chat_template: Option, - pub chat_format: Option, -} - #[derive(Clone, Debug, Deserialize)] #[allow(clippy::upper_case_acronyms)] #[serde(deny_unknown_fields)] @@ -178,14 +169,6 @@ impl FileStore { } } -const fn n_gpu_layers_default() -> u32 { - 1000 -} - -const fn n_ctx_default() -> u32 { - 1000 -} - #[derive(Clone, Debug, Deserialize)] #[serde(deny_unknown_fields)] pub struct Ollama { @@ -215,6 +198,17 @@ pub struct MistralFIM { pub max_requests_per_second: f32, } +#[cfg(feature = "llama_cpp")] +const fn n_gpu_layers_default() -> u32 { + 1000 +} + +#[cfg(feature = "llama_cpp")] +const fn n_ctx_default() -> u32 { + 1000 +} + +#[cfg(feature = "llama_cpp")] #[derive(Clone, Debug, Deserialize)] #[serde(deny_unknown_fields)] pub struct LLaMACPP { diff --git a/crates/lsp-ai/src/transformer_worker.rs b/crates/lsp-ai/src/transformer_worker.rs index 3597aba..fcbd831 100644 --- a/crates/lsp-ai/src/transformer_worker.rs +++ b/crates/lsp-ai/src/transformer_worker.rs @@ -89,14 +89,14 @@ pub struct DoGenerationStreamResponse { fn post_process_start(response: String, front: &str) -> String { let mut front_match = response.len(); loop { - if response.len() == 0 || front.ends_with(&response[..front_match]) { + if response.is_empty() || front.ends_with(&response[..front_match]) { break; } else { front_match -= 1; } } if front_match > 0 { - (&response[front_match..]).to_owned() + response[front_match..].to_owned() } else { response } @@ -105,16 +105,14 @@ fn post_process_start(response: String, front: &str) -> String { fn post_process_end(response: String, back: &str) -> String { let mut back_match = 0; loop { - if back_match == response.len() { - break; - } else if back.starts_with(&response[back_match..]) { + if back_match == response.len() || back.starts_with(&response[back_match..]) { break; } else { back_match += 1; } } if back_match > 0 { - (&response[..back_match]).to_owned() + response[..back_match].to_owned() } else { response } @@ -140,12 +138,10 @@ fn post_process_response( } else { response } + } else if config.remove_duplicate_start { + post_process_start(response, &context_and_code.code) } else { - if config.remove_duplicate_start { - post_process_start(response, &context_and_code.code) - } else { - response - } + response } } Prompt::FIM(fim) 
=> { @@ -289,14 +285,12 @@ async fn generate_response( .context("Completions is none")?; let transformer_backend = transformer_backends .get(&completion_config.model) - .clone() .with_context(|| format!("can't find model: {}", &completion_config.model))?; do_completion(transformer_backend, memory_backend_tx, &request, &config).await } WorkerRequest::Generation(request) => { let transformer_backend = transformer_backends .get(&request.params.model) - .clone() .with_context(|| format!("can't find model: {}", &request.params.model))?; do_generate(transformer_backend, memory_backend_tx, &request).await } From 11e8cf819ed7bda11f648d07e414f04d694eb8b8 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Sat, 22 Jun 2024 10:14:06 -0700 Subject: [PATCH 16/18] Fix git link issue --- crates/splitter-tree-sitter | 1 - crates/splitter-tree-sitter/Cargo.toml | 16 ++ crates/splitter-tree-sitter/LICENSE | 21 ++ crates/splitter-tree-sitter/README.md | 3 + crates/splitter-tree-sitter/src/lib.rs | 326 +++++++++++++++++++++++++ crates/utils-tree-sitter | 1 - crates/utils-tree-sitter/Cargo.toml | 34 +++ crates/utils-tree-sitter/LICENSE | 21 ++ crates/utils-tree-sitter/README.md | 3 + crates/utils-tree-sitter/src/lib.rs | 90 +++++++ 10 files changed, 514 insertions(+), 2 deletions(-) delete mode 160000 crates/splitter-tree-sitter create mode 100644 crates/splitter-tree-sitter/Cargo.toml create mode 100644 crates/splitter-tree-sitter/LICENSE create mode 100644 crates/splitter-tree-sitter/README.md create mode 100644 crates/splitter-tree-sitter/src/lib.rs delete mode 160000 crates/utils-tree-sitter create mode 100644 crates/utils-tree-sitter/Cargo.toml create mode 100644 crates/utils-tree-sitter/LICENSE create mode 100644 crates/utils-tree-sitter/README.md create mode 100644 crates/utils-tree-sitter/src/lib.rs diff --git a/crates/splitter-tree-sitter b/crates/splitter-tree-sitter deleted file mode 160000 index 37a2e98..0000000 --- a/crates/splitter-tree-sitter +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 37a2e98cce5a1b39f07aec7e5b3bc75eebb41ac2 diff --git a/crates/splitter-tree-sitter/Cargo.toml b/crates/splitter-tree-sitter/Cargo.toml new file mode 100644 index 0000000..51a55d8 --- /dev/null +++ b/crates/splitter-tree-sitter/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "splitter-tree-sitter" +version = "0.1.0" + +edition.workspace = true + +[dependencies] +thiserror = "1.0.61" +tree-sitter = "0.22" + +[dev-dependencies] +tree-sitter-rust = "0.21" +tree-sitter-zig = { git = "https://github.com/maxxnino/tree-sitter-zig" } + +[build-dependencies] +cc="*" diff --git a/crates/splitter-tree-sitter/LICENSE b/crates/splitter-tree-sitter/LICENSE new file mode 100644 index 0000000..19e1809 --- /dev/null +++ b/crates/splitter-tree-sitter/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Silas Marvin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/crates/splitter-tree-sitter/README.md b/crates/splitter-tree-sitter/README.md
new file mode 100644
index 0000000..5432bd6
--- /dev/null
+++ b/crates/splitter-tree-sitter/README.md
@@ -0,0 +1,3 @@
+# tree-sitter-splitter
+
+This is a code splitter that utilizes Tree-sitter to split code.
diff --git a/crates/splitter-tree-sitter/src/lib.rs b/crates/splitter-tree-sitter/src/lib.rs
new file mode 100644
index 0000000..49bf1a0
--- /dev/null
+++ b/crates/splitter-tree-sitter/src/lib.rs
@@ -0,0 +1,326 @@
+use thiserror::Error;
+use tree_sitter::{Tree, TreeCursor};
+
+#[derive(Error, Debug)]
+pub enum NewError {
+    #[error("chunk_size must be greater than chunk_overlap")]
+    SizeOverlapError,
+}
+
+#[derive(Error, Debug)]
+pub enum SplitError {
+    #[error("converting utf8 to str")]
+    Utf8Error(#[from] core::str::Utf8Error),
+}
+
+pub struct TreeSitterCodeSplitter {
+    chunk_size: usize,
+    chunk_overlap: usize,
+}
+
+pub struct ByteRange {
+    pub start_byte: usize,
+    pub end_byte: usize,
+}
+
+impl ByteRange {
+    fn new(start_byte: usize, end_byte: usize) -> Self {
+        Self {
+            start_byte,
+            end_byte,
+        }
+    }
+}
+
+pub struct Chunk<'a> {
+    pub text: &'a str,
+    pub range: ByteRange,
+}
+
+impl<'a> Chunk<'a> {
+    fn new(text: &'a str, range: ByteRange) -> Self {
+        Self { text, range }
+    }
+}
+
+impl TreeSitterCodeSplitter {
+    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self, NewError> {
+        if chunk_overlap > chunk_size {
+            Err(NewError::SizeOverlapError)
+        } else {
+            Ok(Self {
+                chunk_size,
+                chunk_overlap,
+            })
+        }
+    }
+
+    pub fn split<'a, 'b, 'c>(
+        &'a self,
+        tree: &'b Tree,
+        utf8: &'c [u8],
+    ) -> Result<Vec<Chunk<'c>>, SplitError> {
+        let cursor = tree.walk();
+        Ok(self
+            .split_recursive(cursor, utf8)?
+            .into_iter()
+            .rev()
+            // Let's combine some of our smaller chunks together
+            // We also want to do this in reverse as it (seems) to make more sense to combine code slices from bottom to top
+            .try_fold(vec![], |mut acc, current| {
+                if acc.len() == 0 {
+                    acc.push(current);
+                    Ok::<_, SplitError>(acc)
+                } else {
+                    if acc.last().as_ref().unwrap().text.len() + current.text.len()
+                        < self.chunk_size
+                    {
+                        let last = acc.pop().unwrap();
+                        let text = std::str::from_utf8(
+                            &utf8[current.range.start_byte..last.range.end_byte],
+                        )?;
+                        acc.push(Chunk::new(
+                            text,
+                            ByteRange::new(current.range.start_byte, last.range.end_byte),
+                        ));
+                    } else {
+                        acc.push(current);
+                    }
+                    Ok(acc)
+                }
+            })?
+            .into_iter()
+            .rev()
+            .collect())
+    }
+
+    fn split_recursive<'a, 'b, 'c>(
+        &'a self,
+        mut cursor: TreeCursor<'b>,
+        utf8: &'c [u8],
+    ) -> Result<Vec<Chunk<'c>>, SplitError> {
+        let node = cursor.node();
+        let text = node.utf8_text(utf8)?;
+
+        // There are three cases:
+        // 1. Is the current range of code smaller than the chunk_size? If so, return it
+        // 2. If not, does the current node have children? If so, recursively walk down
+        // 3. If not, we must split our current node
+        let mut out = if text.chars().count() <= self.chunk_size {
+            vec![Chunk::new(
+                text,
+                ByteRange::new(node.range().start_byte, node.range().end_byte),
+            )]
+        } else {
+            let mut cursor_copy = cursor.clone();
+            if cursor_copy.goto_first_child() {
+                self.split_recursive(cursor_copy, utf8)?
+            } else {
+                let mut current_range =
+                    ByteRange::new(node.range().start_byte, node.range().end_byte);
+                let mut chunks = vec![];
+                let mut current_chunk = text;
+                loop {
+                    if current_chunk.len() < self.chunk_size {
+                        chunks.push(Chunk::new(current_chunk, current_range));
+                        break;
+                    } else {
+                        let new_chunk = &current_chunk[0..self.chunk_size.min(current_chunk.len())];
+                        let new_range = ByteRange::new(
+                            current_range.start_byte,
+                            current_range.start_byte + new_chunk.as_bytes().len(),
+                        );
+                        chunks.push(Chunk::new(new_chunk, new_range));
+                        let new_current_chunk =
+                            &current_chunk[self.chunk_size - self.chunk_overlap..];
+                        let byte_diff =
+                            current_chunk.as_bytes().len() - new_current_chunk.as_bytes().len();
+                        current_range = ByteRange::new(
+                            current_range.start_byte + byte_diff,
+                            current_range.end_byte,
+                        );
+                        current_chunk = new_current_chunk
+                    }
+                }
+                chunks
+            }
+        };
+        if cursor.goto_next_sibling() {
+            out.append(&mut self.split_recursive(cursor, utf8)?);
+        }
+        Ok(out)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tree_sitter::Parser;
+
+    #[test]
+    fn test_split_rust() {
+        let splitter = TreeSitterCodeSplitter::new(128, 0).unwrap();
+
+        let mut parser = Parser::new();
+        parser
+            .set_language(&tree_sitter_rust::language())
+            .expect("Error loading Rust grammar");
+
+        let source_code = r#"
+#[derive(Debug)]
+struct Rectangle {
+    width: u32,
+    height: u32,
+}
+
+impl Rectangle {
+    fn area(&self) -> u32 {
+        self.width * self.height
+    }
+}
+
+fn main() {
+    let rect1 = Rectangle {
+        width: 30,
+        height: 50,
+    };
+
+    println!(
+        "The area of the rectangle is {} square pixels.",
+        rect1.area()
+    );
+}
+"#;
+        let tree = parser.parse(source_code, None).unwrap();
+        let chunks = splitter.split(&tree, source_code.as_bytes()).unwrap();
+        assert_eq!(
+            chunks[0].text,
+            r#"#[derive(Debug)]
+struct Rectangle {
+    width: u32,
+    height: u32,
+}"#
+        );
+        assert_eq!(
+            chunks[1].text,
+            r#"impl Rectangle {
+    fn area(&self) -> u32 {
+        self.width * self.height
+    }
+}"#
+        );
+        assert_eq!(
+            chunks[2].text,
+            r#"fn main() {
+    let rect1 = Rectangle {
+        width: 30,
+        height: 50,
+    };"#
+        );
+        assert_eq!(
+            chunks[3].text,
+            r#"println!(
+        "The area of the rectangle is {} square pixels.",
+        rect1.area()
+    );
+}"#
+        );
+    }
+
+    #[test]
+    fn test_split_zig() {
+        let splitter = TreeSitterCodeSplitter::new(128, 10).unwrap();
+
+        let mut parser = Parser::new();
+        parser
+            .set_language(&tree_sitter_rust::language())
+            .expect("Error loading Rust grammar");
+
+        let source_code = r#"
+const std = @import("std");
+const parseInt = std.fmt.parseInt;
+
+std.debug.print("Here is a long string 1 ... Here is a long string 2 ... Here is a long string 3 ... Here is a long string 4 ... Here is a long string 5 ... Here is a long string 6 ... Here is a long string 7 ... Here is a long string 8 ... Here is a long string 9 ...", .{});
+
+test "parse integers" {
+    const input = "123 67 89,99";
+    const ally = std.testing.allocator;
+
+    var list = std.ArrayList(u32).init(ally);
+    // Ensure the list is freed at scope exit.
+    // Try commenting out this line!
+ defer list.deinit(); + + var it = std.mem.tokenizeAny(u8, input, " ,"); + while (it.next()) |num| { + const n = try parseInt(u32, num, 10); + try list.append(n); + } + + const expected = [_]u32{ 123, 67, 89, 99 }; + + for (expected, list.items) |exp, actual| { + try std.testing.expectEqual(exp, actual); + } +} +"#; + let tree = parser.parse(source_code, None).unwrap(); + let chunks = splitter.split(&tree, source_code.as_bytes()).unwrap(); + + assert_eq!( + chunks[0].text, + r#"const std = @import("std"); +const parseInt = std.fmt.parseInt; + +std.debug.print(""# + ); + assert_eq!( + chunks[1].text, + r#"Here is a long string 1 ... Here is a long string 2 ... Here is a long string 3 ... Here is a long string 4 ... Here is a long s"# + ); + assert_eq!( + chunks[2].text, + r#"s a long string 5 ... Here is a long string 6 ... Here is a long string 7 ... Here is a long string 8 ... Here is a long string "# + ); + assert_eq!(chunks[3].text, r#"ng string 9 ...", .{});"#); + assert_eq!( + chunks[4].text, + r#"test "parse integers" { + const input = "123 67 89,99"; + const ally = std.testing.allocator; + + var list = std.ArrayList"# + ); + assert_eq!( + chunks[5].text, + r#"(u32).init(ally); + // Ensure the list is freed at scope exit. + // Try commenting out this line!"# + ); + assert_eq!( + chunks[6].text, + r#"defer list.deinit(); + + var it = std.mem.tokenizeAny(u8, input, " ,"); + while (it.next()) |num"# + ); + assert_eq!( + chunks[7].text, + r#"| { + const n = try parseInt(u32, num, 10); + try list.append(n); + } + + const expected = [_]u32{ 123, 67, 89,"# + ); + assert_eq!( + chunks[8].text, + r#"99 }; + + for (expected, list.items) |exp, actual| { + try std.testing.expectEqual(exp, actual); + } +}"# + ); + } +} diff --git a/crates/utils-tree-sitter b/crates/utils-tree-sitter deleted file mode 160000 index a38e714..0000000 --- a/crates/utils-tree-sitter +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a38e7143bcab2412348fd92904cc5105117896a1 diff --git a/crates/utils-tree-sitter/Cargo.toml b/crates/utils-tree-sitter/Cargo.toml new file mode 100644 index 0000000..8e90d57 --- /dev/null +++ b/crates/utils-tree-sitter/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "utils-tree-sitter" +version = "0.1.0" + +edition.workspace = true + +[dependencies] +thiserror = "1.0.61" +tree-sitter = "0.22" +tree-sitter-bash = { version = "0.21", optional = true } +tree-sitter-c = { version = "0.21", optional = true } +tree-sitter-cpp = { version = "0.22", optional = true } +tree-sitter-c-sharp = { version = "0.21", optional = true } +tree-sitter-css = { version = "0.21", optional = true } +tree-sitter-elixir = { version = "0.2", optional = true } +tree-sitter-erlang = { version = "0.6", optional = true } +tree-sitter-go = { version = "0.21", optional = true } +tree-sitter-html = { version = "0.20", optional = true } +tree-sitter-java = { version = "0.21", optional = true } +tree-sitter-javascript = { version = "0.21", optional = true } +tree-sitter-json = { version = "0.21", optional = true } +tree-sitter-haskell = { version = "0.21", optional = true } +tree-sitter-lua = { version = "0.1.0", optional = true } +tree-sitter-ocaml = { version = "0.22.0", optional = true } +tree-sitter-python = { version = "0.21", optional = true } +tree-sitter-rust = { version = "0.21", optional = true } +tree-sitter-zig = { git = "https://github.com/maxxnino/tree-sitter-zig", optional = true } + +[build-dependencies] +cc="*" + +[features] +default = [] +all = ["dep:tree-sitter-python", "dep:tree-sitter-bash", 
"dep:tree-sitter-c", "dep:tree-sitter-cpp", "dep:tree-sitter-c-sharp", "dep:tree-sitter-css", "dep:tree-sitter-elixir", "dep:tree-sitter-erlang", "dep:tree-sitter-go", "dep:tree-sitter-html", "dep:tree-sitter-java", "dep:tree-sitter-javascript", "dep:tree-sitter-json", "dep:tree-sitter-rust", "dep:tree-sitter-zig", "dep:tree-sitter-haskell", "dep:tree-sitter-lua", "dep:tree-sitter-ocaml"] diff --git a/crates/utils-tree-sitter/LICENSE b/crates/utils-tree-sitter/LICENSE new file mode 100644 index 0000000..19e1809 --- /dev/null +++ b/crates/utils-tree-sitter/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Silas Marvin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/utils-tree-sitter/README.md b/crates/utils-tree-sitter/README.md new file mode 100644 index 0000000..97f75c8 --- /dev/null +++ b/crates/utils-tree-sitter/README.md @@ -0,0 +1,3 @@ +# utils-tree-sitter + +Utils for working with Tree-sitter diff --git a/crates/utils-tree-sitter/src/lib.rs b/crates/utils-tree-sitter/src/lib.rs new file mode 100644 index 0000000..6e71246 --- /dev/null +++ b/crates/utils-tree-sitter/src/lib.rs @@ -0,0 +1,90 @@ +use thiserror::Error; +use tree_sitter::{LanguageError, Parser}; + +#[derive(Error, Debug)] +pub enum GetParserError { + #[error("no parser found for extension")] + NoParserFoundForExtension(String), + #[error("no parser found for extension")] + NoLanguageFoundForExtension(String), + #[error("loading grammer")] + LoadingGrammer(#[from] LanguageError), +} + +fn get_extension_for_language(extension: &str) -> Result { + Ok(match extension { + "py" => "Python", + "rs" => "Rust", + "zig" => "Zig", + "sh" => "Bash", + "c" => "C", + "cpp" => "C++", + "cs" => "C#", + "css" => "CSS", + "ex" => "Elixir", + "erl" => "Erlang", + "go" => "Go", + "html" => "HTML", + "java" => "Java", + "js" => "JavaScript", + "json" => "JSON", + "hs" => "Haskell", + "lua" => "Lua", + "ml" => "OCaml", + _ => { + return Err(GetParserError::NoLanguageFoundForExtension( + extension.to_string(), + )) + } + } + .to_string()) +} + +pub fn get_parser_for_extension(extension: &str) -> Result { + let language = get_extension_for_language(extension)?; + let mut parser = Parser::new(); + match language.as_str() { + #[cfg(any(feature = "all", feature = "python"))] + "Python" => parser.set_language(&tree_sitter_python::language())?, + #[cfg(any(feature = "all", feature = "rust"))] + "Rust" => parser.set_language(&tree_sitter_rust::language())?, + #[cfg(any(feature = "all", feature = "zig"))] + "Zig" => 
parser.set_language(&tree_sitter_zig::language())?, + #[cfg(any(feature = "all", feature = "bash"))] + "Bash" => parser.set_language(&tree_sitter_bash::language())?, + #[cfg(any(feature = "all", feature = "c"))] + "C" => parser.set_language(&tree_sitter_c::language())?, + #[cfg(any(feature = "all", feature = "cpp"))] + "C++" => parser.set_language(&tree_sitter_cpp::language())?, + #[cfg(any(feature = "all", feature = "c-sharp"))] + "C#" => parser.set_language(&tree_sitter_c_sharp::language())?, + #[cfg(any(feature = "all", feature = "css"))] + "CSS" => parser.set_language(&tree_sitter_css::language())?, + #[cfg(any(feature = "all", feature = "elixir"))] + "Elixir" => parser.set_language(&tree_sitter_elixir::language())?, + #[cfg(any(feature = "all", feature = "erlang"))] + "Erlang" => parser.set_language(&tree_sitter_erlang::language())?, + #[cfg(any(feature = "all", feature = "go"))] + "Go" => parser.set_language(&tree_sitter_go::language())?, + #[cfg(any(feature = "all", feature = "html"))] + "HTML" => parser.set_language(&tree_sitter_html::language())?, + #[cfg(any(feature = "all", feature = "java"))] + "Java" => parser.set_language(&tree_sitter_java::language())?, + #[cfg(any(feature = "all", feature = "javascript"))] + "JavaScript" => parser.set_language(&tree_sitter_javascript::language())?, + #[cfg(any(feature = "all", feature = "json"))] + "JSON" => parser.set_language(&tree_sitter_json::language())?, + #[cfg(any(feature = "all", feature = "haskell"))] + "Haskell" => parser.set_language(&tree_sitter_haskell::language())?, + #[cfg(any(feature = "all", feature = "lua"))] + "Lua" => parser.set_language(&tree_sitter_lua::language())?, + #[cfg(any(feature = "all", feature = "ocaml"))] + "OCaml" => parser.set_language(&tree_sitter_ocaml::language_ocaml())?, + _ => { + return Err(GetParserError::NoParserFoundForExtension( + language.to_string(), + )) + } + } + Ok(parser) +} From 335188d77e2d237c1608ede9395035951b416c0b Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Sat, 22 Jun 2024 20:33:45 -0700 Subject: [PATCH 17/18] Added some more tests --- crates/lsp-ai/src/config.rs | 2 +- .../lsp-ai/src/memory_backends/file_store.rs | 56 ++++++++-- crates/lsp-ai/src/memory_worker.rs | 3 +- crates/lsp-ai/src/transformer_worker.rs | 104 +++++++++++++++++- 4 files changed, 148 insertions(+), 17 deletions(-) diff --git a/crates/lsp-ai/src/config.rs b/crates/lsp-ai/src/config.rs index 3827a50..ea9631f 100644 --- a/crates/lsp-ai/src/config.rs +++ b/crates/lsp-ai/src/config.rs @@ -371,7 +371,7 @@ impl Config { } } -// This makes testing much easier. 
+// For testing use only
 #[cfg(test)]
 impl Config {
     pub fn default_with_file_store_without_models() -> Self {
diff --git a/crates/lsp-ai/src/memory_backends/file_store.rs b/crates/lsp-ai/src/memory_backends/file_store.rs
index a8828a6..45abb7e 100644
--- a/crates/lsp-ai/src/memory_backends/file_store.rs
+++ b/crates/lsp-ai/src/memory_backends/file_store.rs
@@ -126,7 +126,7 @@ impl FileStore {
             return Ok(false);
         }
         // This means it has been opened before
-        let insert_uri = format!("file://{path}");
+        let insert_uri = format!("file:///{path}");
         if self.file_map.lock().contains_key(&insert_uri) {
             return Ok(true);
         }
@@ -453,6 +453,42 @@ impl MemoryBackend for FileStore {
     }
 }
 
+// For testing use only
+#[cfg(test)]
+impl FileStore {
+    pub fn default_with_filler_file() -> anyhow::Result<Self> {
+        let config = Config::default_with_file_store_without_models();
+        let file_store_config = if let config::ValidMemoryBackend::FileStore(file_store_config) =
+            config.config.memory.clone()
+        {
+            file_store_config
+        } else {
+            anyhow::bail!("requires a file_store_config")
+        };
+        let f = FileStore::new(file_store_config, config)?;
+
+        let uri = "file:///filler.py";
+        let text = r#"# Multiplies two numbers
+def multiply_two_numbers(x, y):
+    return
+
+# A singular test
+assert multiply_two_numbers(2, 3) == 6
+"#;
+        let params = lsp_types::DidOpenTextDocumentParams {
+            text_document: lsp_types::TextDocumentItem {
+                uri: reqwest::Url::parse(uri).unwrap(),
+                language_id: "filler".to_string(),
+                version: 0,
+                text: text.to_string(),
+            },
+        };
+        f.opened_text_document(params)?;
+
+        Ok(f)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -476,7 +512,7 @@ mod tests {
     }
 
     fn generate_filler_text_document(uri: Option<&str>, text: Option<&str>) -> TextDocumentItem {
-        let uri = uri.unwrap_or("file://filler/");
+        let uri = uri.unwrap_or("file:///filler/");
         let text = text.unwrap_or("Here is the document body");
         TextDocumentItem {
             uri: reqwest::Url::parse(uri).unwrap(),
@@ -496,7 +532,7 @@ mod tests {
         let file = file_store
             .file_map
             .lock()
-            .get("file://filler/")
+            .get("file:///filler/")
             .unwrap()
             .clone();
         assert_eq!(file.rope.to_string(), "Here is the document body");
@@ -513,8 +549,8 @@ mod tests {
 
         let params = RenameFilesParams {
             files: vec![FileRename {
-                old_uri: "file://filler/".to_string(),
-                new_uri: "file://filler2/".to_string(),
+                old_uri: "file:///filler/".to_string(),
+                new_uri: "file:///filler2/".to_string(),
             }],
         };
         file_store.renamed_files(params)?;
@@ -522,7 +558,7 @@ mod tests {
         let file = file_store
             .file_map
             .lock()
-            .get("file://filler2/")
+            .get("file:///filler2/")
             .unwrap()
             .clone();
         assert_eq!(file.rope.to_string(), "Here is the document body");
@@ -563,7 +599,7 @@ mod tests {
         let file = file_store
             .file_map
             .lock()
-            .get("file://filler/")
+            .get("file:///filler/")
             .unwrap()
             .clone();
         assert_eq!(file.rope.to_string(), "Hae is the document body");
@@ -583,7 +619,7 @@ mod tests {
         let file = file_store
             .file_map
             .lock()
-            .get("file://filler/")
+            .get("file:///filler/")
             .unwrap()
             .clone();
         assert_eq!(file.rope.to_string(), "abc");
@@ -693,7 +729,7 @@ The end with a trailing new line
 
         // Test multi-file
         let text_document2 = generate_filler_text_document(
-            Some("file://filler2"),
+            Some("file:///filler2"),
             Some(
                 r#"Document Top2
 Here is a more complicated document
@@ -781,7 +817,7 @@ The end with a trailing new line
 
         let params = AdditionalFileStoreParams { build_tree: true };
         let file_store = FileStore::new_with_params(file_store_config, config, params)?;
-        let uri = "file://filler/test.rs";
+        let uri = 
"file:///filler/test.rs"; let text = r#"#[derive(Debug)] struct Rectangle { width: u32, diff --git a/crates/lsp-ai/src/memory_worker.rs b/crates/lsp-ai/src/memory_worker.rs index bea5f85..1b7a481 100644 --- a/crates/lsp-ai/src/memory_worker.rs +++ b/crates/lsp-ai/src/memory_worker.rs @@ -69,8 +69,7 @@ async fn do_build_prompt( params .tx .send(prompt) - .map_err(|_| anyhow::anyhow!("sending on channel failed"))?; - Ok(()) + .map_err(|_| anyhow::anyhow!("sending on channel failed")) } fn do_task( diff --git a/crates/lsp-ai/src/transformer_worker.rs b/crates/lsp-ai/src/transformer_worker.rs index fcbd831..7766a11 100644 --- a/crates/lsp-ai/src/transformer_worker.rs +++ b/crates/lsp-ai/src/transformer_worker.rs @@ -173,7 +173,7 @@ pub fn run( connection, config, ) { - error!("error in transformer worker: {e}") + error!("error in transformer worker: {e:?}") } } @@ -256,7 +256,7 @@ async fn dispatch_request( { Ok(response) => response, Err(e) => { - error!("generating response: {e}"); + error!("generating response: {e:?}"); Response { id: request.get_id(), result: None, @@ -266,7 +266,7 @@ async fn dispatch_request( }; if let Err(e) = connection.sender.send(Message::Response(response)) { - error!("sending response: {e}"); + error!("sending response: {e:?}"); } } @@ -412,7 +412,103 @@ async fn do_generate( #[cfg(test)] mod tests { use super::*; - use crate::memory_backends::{ContextAndCodePrompt, FIMPrompt}; + use crate::memory_backends::{ + file_store::FileStore, ContextAndCodePrompt, FIMPrompt, MemoryBackend, + }; + use serde_json::json; + use std::{sync::mpsc, thread}; + + #[tokio::test] + async fn test_do_completion() -> anyhow::Result<()> { + let (memory_tx, memory_rx) = mpsc::channel(); + let memory_backend: Box = + Box::new(FileStore::default_with_filler_file()?); + thread::spawn(move || memory_worker::run(memory_backend, memory_rx)); + + let transformer_backend: Box = + config::ValidModel::Ollama(serde_json::from_value( + json!({"model": "deepseek-coder:1.3b-base"}), + )?) + .try_into()?; + let completion_request = CompletionRequest::new( + serde_json::from_value(json!(0))?, + serde_json::from_value(json!({ + "position": {"character":10, "line":2}, + "textDocument": { + "uri": "file:///filler.py" + } + }))?, + ); + let mut config = config::Config::default_with_file_store_without_models(); + config.config.completion = Some(serde_json::from_value(json!({ + "model": "model1", + "parameters": { + "options": { + "temperature": 0 + } + } + }))?); + + let result = do_completion( + &transformer_backend, + memory_tx, + &completion_request, + &config, + ) + .await?; + + assert_eq!( + " x * y", + result.result.clone().unwrap()["items"][0]["textEdit"]["newText"] + .as_str() + .unwrap() + ); + assert_eq!( + " return", + result.result.unwrap()["items"][0]["filterText"] + .as_str() + .unwrap() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_do_generate() -> anyhow::Result<()> { + let (memory_tx, memory_rx) = mpsc::channel(); + let memory_backend: Box = + Box::new(FileStore::default_with_filler_file()?); + thread::spawn(move || memory_worker::run(memory_backend, memory_rx)); + + let transformer_backend: Box = + config::ValidModel::Ollama(serde_json::from_value( + json!({"model": "deepseek-coder:1.3b-base"}), + )?) 
+ .try_into()?; + let generation_request = GenerationRequest::new( + serde_json::from_value(json!(0))?, + serde_json::from_value(json!({ + "position": {"character":10, "line":2}, + "textDocument": { + "uri": "file:///filler.py" + }, + "model": "model1", + "parameters": { + "options": { + "temperature": 0 + } + } + }))?, + ); + let result = do_generate(&transformer_backend, memory_tx, &generation_request).await?; + + assert_eq!( + " x * y", + result.result.unwrap()["generatedText"].as_str().unwrap() + ); + + Ok(()) + } #[test] fn test_post_process_fim() { From 9c17fc3456c19b566fcc0b40d4f5916a65b6a9b5 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 24 Jun 2024 08:38:36 -0700 Subject: [PATCH 18/18] Get ready for publishing --- Cargo.lock | 1 - crates/splitter-tree-sitter/Cargo.toml | 3 +++ crates/utils-tree-sitter/Cargo.toml | 7 +++++-- crates/utils-tree-sitter/src/lib.rs | 6 +++--- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b89368f..d9ef5db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3763,7 +3763,6 @@ dependencies = [ "tree-sitter-ocaml", "tree-sitter-python", "tree-sitter-rust", - "tree-sitter-zig", ] [[package]] diff --git a/crates/splitter-tree-sitter/Cargo.toml b/crates/splitter-tree-sitter/Cargo.toml index 51a55d8..2502006 100644 --- a/crates/splitter-tree-sitter/Cargo.toml +++ b/crates/splitter-tree-sitter/Cargo.toml @@ -1,8 +1,11 @@ [package] name = "splitter-tree-sitter" version = "0.1.0" +description = "A code splitter utilizing Tree-sitter" edition.workspace = true +repository.workspace = true +license.workspace = true [dependencies] thiserror = "1.0.61" diff --git a/crates/utils-tree-sitter/Cargo.toml b/crates/utils-tree-sitter/Cargo.toml index 8e90d57..22ebc26 100644 --- a/crates/utils-tree-sitter/Cargo.toml +++ b/crates/utils-tree-sitter/Cargo.toml @@ -1,8 +1,11 @@ [package] name = "utils-tree-sitter" version = "0.1.0" +description = "Utils for working with splitter-tree-sitter" edition.workspace = true +repository.workspace = true +license.workspace = true [dependencies] thiserror = "1.0.61" @@ -24,11 +27,11 @@ tree-sitter-lua = { version = "0.1.0", optional = true } tree-sitter-ocaml = { version = "0.22.0", optional = true } tree-sitter-python = { version = "0.21", optional = true } tree-sitter-rust = { version = "0.21", optional = true } -tree-sitter-zig = { git = "https://github.com/maxxnino/tree-sitter-zig", optional = true } +# tree-sitter-zig = { git = "https://github.com/maxxnino/tree-sitter-zig", optional = true } [build-dependencies] cc="*" [features] default = [] -all = ["dep:tree-sitter-python", "dep:tree-sitter-bash", "dep:tree-sitter-c", "dep:tree-sitter-cpp", "dep:tree-sitter-c-sharp", "dep:tree-sitter-css", "dep:tree-sitter-elixir", "dep:tree-sitter-erlang", "dep:tree-sitter-go", "dep:tree-sitter-html", "dep:tree-sitter-java", "dep:tree-sitter-javascript", "dep:tree-sitter-json", "dep:tree-sitter-rust", "dep:tree-sitter-zig", "dep:tree-sitter-haskell", "dep:tree-sitter-lua", "dep:tree-sitter-ocaml"] +all = ["dep:tree-sitter-python", "dep:tree-sitter-bash", "dep:tree-sitter-c", "dep:tree-sitter-cpp", "dep:tree-sitter-c-sharp", "dep:tree-sitter-css", "dep:tree-sitter-elixir", "dep:tree-sitter-erlang", "dep:tree-sitter-go", "dep:tree-sitter-html", "dep:tree-sitter-java", "dep:tree-sitter-javascript", "dep:tree-sitter-json", "dep:tree-sitter-rust", "dep:tree-sitter-haskell", "dep:tree-sitter-lua", "dep:tree-sitter-ocaml"] diff --git 
a/crates/utils-tree-sitter/src/lib.rs b/crates/utils-tree-sitter/src/lib.rs
index 6e71246..7facd50 100644
--- a/crates/utils-tree-sitter/src/lib.rs
+++ b/crates/utils-tree-sitter/src/lib.rs
@@ -15,7 +15,7 @@ fn get_extension_for_language(extension: &str) -> Result<String, GetParserError>
     Ok(match extension {
         "py" => "Python",
         "rs" => "Rust",
-        "zig" => "Zig",
+        // "zig" => "Zig",
         "sh" => "Bash",
         "c" => "C",
         "cpp" => "C++",
@@ -48,8 +48,8 @@ pub fn get_parser_for_extension(extension: &str) -> Result<Parser, GetParserError>
         "Python" => parser.set_language(&tree_sitter_python::language())?,
         #[cfg(any(feature = "all", feature = "rust"))]
         "Rust" => parser.set_language(&tree_sitter_rust::language())?,
-        #[cfg(any(feature = "all", feature = "zig"))]
-        "Zig" => parser.set_language(&tree_sitter_zig::language())?,
+        // #[cfg(any(feature = "all", feature = "zig"))]
+        // "Zig" => parser.set_language(&tree_sitter_zig::language())?,
         #[cfg(any(feature = "all", feature = "bash"))]
         "Bash" => parser.set_language(&tree_sitter_bash::language())?,
         #[cfg(any(feature = "all", feature = "c"))]
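
The response post-processing that patch 15 tidies in transformer_worker.rs is easier to follow with concrete values. Below is a minimal standalone sketch: the helper bodies mirror the patched logic, and the inputs are invented for illustration.

// Strips the longest prefix of `response` that duplicates the end of `front`,
// the code already in the buffer before the cursor.
fn post_process_start(response: String, front: &str) -> String {
    let mut front_match = response.len();
    while !(response.is_empty() || front.ends_with(&response[..front_match])) {
        front_match -= 1;
    }
    if front_match > 0 {
        response[front_match..].to_owned()
    } else {
        response
    }
}

// Keeps `response` only up to the point where its tail duplicates the start
// of `back`, the code already in the buffer after the cursor.
fn post_process_end(response: String, back: &str) -> String {
    let mut back_match = 0;
    while !(back_match == response.len() || back.starts_with(&response[back_match..])) {
        back_match += 1;
    }
    if back_match > 0 {
        response[..back_match].to_owned()
    } else {
        response
    }
}

fn main() {
    // The model echoed the "    return" that is already on the line.
    assert_eq!(
        post_process_start("    return a + b".to_string(), "def add(a, b):\n    return"),
        " a + b"
    );
    // The model echoed the ")" that already follows the cursor.
    assert_eq!(post_process_end("x * y)".to_string(), ")\n"), "x * y");
    println!("post-processing examples hold");
}

Note that both helpers slice by byte offset, so the sketch assumes ASCII responses; slicing across a multi-byte character boundary would panic, exactly as in the patched code.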
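
Patches 16 through 18 fold splitter-tree-sitter and utils-tree-sitter into the workspace as path dependencies. A minimal sketch of how the two crates compose, assuming both are declared as path dependencies and utils-tree-sitter is built with the "all" (or at least "rust") feature:

use splitter_tree_sitter::TreeSitterCodeSplitter;
use utils_tree_sitter::get_parser_for_extension;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let source = "fn add(a: u32, b: u32) -> u32 { a + b }\nfn sub(a: u32, b: u32) -> u32 { a - b }\n";
    // Pick a parser based on the file extension.
    let mut parser = get_parser_for_extension("rs")?;
    let tree = parser.parse(source, None).ok_or("tree-sitter failed to parse")?;
    // Chunks of at most 64 characters with a 10-character overlap;
    // new() rejects an overlap larger than the chunk size.
    let splitter = TreeSitterCodeSplitter::new(64, 10)?;
    for chunk in splitter.split(&tree, source.as_bytes())? {
        println!("[{}..{}] {:?}", chunk.range.start_byte, chunk.range.end_byte, chunk.text);
    }
    Ok(())
}

Since all three error types derive thiserror::Error, they convert cleanly into Box<dyn std::error::Error>, which keeps the example short.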