Made into a workspace

This commit is contained in:
Silas Marvin 2024-06-16 16:25:44 -07:00
parent 58192c4182
commit f2b8c1eda3
28 changed files with 222 additions and 91 deletions

103
Cargo.lock generated
View File

@ -149,6 +149,18 @@ dependencies = [
"num-traits",
]
[[package]]
name = "auto_enums"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1899bfcfd9340ceea3533ea157360ba8fa864354eccbceab58e1006ecab35393"
dependencies = [
"derive_utils",
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@ -356,7 +368,7 @@ version = "4.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47"
dependencies = [
"heck",
"heck 0.4.1",
"proc-macro2",
"quote",
"syn 2.0.52",
@ -662,6 +674,17 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "derive_utils"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61bb5a1014ce6dfc2a378578509abe775a5aa06bff584a547555d9efdb81b926"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "difflib"
version = "0.4.0"
@ -730,9 +753,9 @@ checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125"
[[package]]
name = "either"
version = "1.10.0"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b"
dependencies = [
"serde",
]
@ -1056,6 +1079,12 @@ dependencies = [
"unicode-segmentation",
]
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.9"
@ -1364,6 +1393,15 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.10"
@ -1518,7 +1556,7 @@ dependencies = [
[[package]]
name = "lsp-ai"
version = "0.3.0"
version = "0.2.0"
dependencies = [
"anyhow",
"assert_cmd",
@ -1541,6 +1579,7 @@ dependencies = [
"serde",
"serde_json",
"splitter-tree-sitter",
"text-splitter",
"tokenizers",
"tokio",
"tracing",
@ -2419,6 +2458,12 @@ dependencies = [
"untrusted",
]
[[package]]
name = "rustversion"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
[[package]]
name = "ryu"
version = "1.0.17"
@ -2479,7 +2524,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "878cf3d57f0e5bfacd425cdaccc58b4c06d68a7b71c63fc28710a20c88676808"
dependencies = [
"darling 0.14.4",
"heck",
"heck 0.4.1",
"quote",
"syn 1.0.109",
]
@ -2502,7 +2547,7 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25a82fcb49253abcb45cdcb2adf92956060ec0928635eb21b4f7a6d8f25ab0bc"
dependencies = [
"heck",
"heck 0.4.1",
"proc-macro2",
"quote",
"syn 2.0.52",
@ -2767,6 +2812,8 @@ dependencies = [
"cc",
"thiserror",
"tree-sitter",
"tree-sitter-rust",
"tree-sitter-zig",
]
[[package]]
@ -2870,7 +2917,7 @@ checksum = "5833ef53aaa16d860e92123292f1f6a3d53c34ba8b1969f152ef1a7bb803f3c8"
dependencies = [
"dotenvy",
"either",
"heck",
"heck 0.4.1",
"hex",
"once_cell",
"proc-macro2",
@ -3026,6 +3073,28 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "strum"
version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29"
dependencies = [
"strum_macros",
]
[[package]]
name = "strum_macros"
version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
dependencies = [
"heck 0.5.0",
"proc-macro2",
"quote",
"rustversion",
"syn 2.0.52",
]
[[package]]
name = "subtle"
version = "2.5.0"
@ -3099,6 +3168,24 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "text-splitter"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ab9dc04b7cf08eb01c07c272bf699fa55679a326ddf7dd075e14094efc80fb9"
dependencies = [
"ahash",
"auto_enums",
"either",
"itertools 0.13.0",
"once_cell",
"regex",
"strum",
"thiserror",
"tree-sitter",
"unicode-segmentation",
]
[[package]]
name = "thiserror"
version = "1.0.61"
@ -3385,7 +3472,7 @@ dependencies = [
[[package]]
name = "tree-sitter-zig"
version = "0.0.1"
source = "git+https://github.com/SilasMarvin/tree-sitter-zig?branch=silas-update-tree-sitter-version#2eedab3ff6dda88aedddf0bb32a14f81bb709a73"
source = "git+https://github.com/maxxnino/tree-sitter-zig#7c5a29b721d409be8842017351bf007d7e384401"
dependencies = [
"cc",
"tree-sitter",

View File

@ -1,52 +1,16 @@
[package]
name = "lsp-ai"
version = "0.3.0"
[workspace]
members = [
"crates/*",
]
resolver = "2"
[workspace.package]
edition = "2021"
license = "MIT"
description = "LSP-AI is an open-source language server that serves as a backend for AI-powered functionality, designed to assist and empower software engineers, not replace them."
repository = "https://github.com/SilasMarvin/lsp-ai"
readme = "README.md"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.75"
lsp-server = "0.7.6"
lsp-types = "0.95.0"
ropey = "1.6.1"
serde = "1.0.190"
serde_json = "1.0.108"
hf-hub = { git = "https://github.com/huggingface/hf-hub", version = "0.3.2" }
rand = "0.8.5"
tokenizers = "0.14.1"
parking_lot = "0.12.1"
once_cell = "1.19.0"
directories = "5.0.1"
llama-cpp-2 = { version = "0.1.55", optional = true }
minijinja = { version = "1.0.12", features = ["loader"] }
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
tracing = "0.1.40"
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
reqwest = { version = "0.11.25", features = ["blocking", "json"] }
ignore = "0.4.22"
pgml = "1.0.4"
tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] }
indexmap = "2.2.5"
async-trait = "0.1.78"
tree-sitter = "0.22"
# splitter-tree-sitter = { git = "https://github.com/SilasMarvin/splitter-tree-sitter" }
splitter-tree-sitter = { path = "../../splitter-tree-sitter" }
# utils-tree-sitter = { git = "https://github.com/SilasMarvin/utils-tree-sitter" }
utils-tree-sitter = { path = "../../utils-tree-sitter", features = ["all"] }
[build-dependencies]
cc="*"
[features]
default = []
llama_cpp = ["dep:llama-cpp-2"]
metal = ["llama-cpp-2/metal"]
cuda = ["llama-cpp-2/cuda"]
[dev-dependencies]
assert_cmd = "2.0.14"
[workspace.dependencies]
utils-tree-sitter = { path = "./crates/utils-tree-sitter" }
splitter-tree-sitter = { path = "./crates/splitter-tree-sitter" }

50
crates/lsp-ai/Cargo.toml Normal file
View File

@ -0,0 +1,50 @@
# Manifest for the `lsp-ai` crate, which lives under `crates/` as a member of the workspace.
[package]
name = "lsp-ai"
version = "0.2.0"
# The fields below are inherited from `[workspace.package]` in the workspace root manifest.
description.workspace = true
repository.workspace = true
readme.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
anyhow = "1.0.75"
lsp-server = "0.7.6"
lsp-types = "0.95.0"
ropey = "1.6.1"
serde = "1.0.190"
serde_json = "1.0.108"
# Both a git location and a version are given for hf-hub.
# NOTE(review): per the Cargo docs the git source is used locally and the version is checked
# against it — confirm this is still intended.
hf-hub = { git = "https://github.com/huggingface/hf-hub", version = "0.3.2" }
rand = "0.8.5"
tokenizers = "0.14.1"
parking_lot = "0.12.1"
once_cell = "1.19.0"
directories = "5.0.1"
# Optional: only pulled in through the `llama_cpp`, `metal` or `cuda` features declared below.
llama-cpp-2 = { version = "0.1.55", optional = true }
minijinja = { version = "1.0.12", features = ["loader"] }
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
tracing = "0.1.40"
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
reqwest = { version = "0.11.25", features = ["blocking", "json"] }
ignore = "0.4.22"
pgml = "1.0.4"
tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] }
indexmap = "2.2.5"
async-trait = "0.1.78"
tree-sitter = "0.22"
# The next two crates are resolved through `[workspace.dependencies]` in the workspace root
# manifest, which points them at local paths under `crates/`.
utils-tree-sitter = { workspace = true, features = ["all"] }
splitter-tree-sitter = { workspace = true }
text-splitter = { version = "0.13.3", features = ["code"] }
# Dependencies that are only available to the build script.
[build-dependencies]
# NOTE(review): `*` accepts any version of `cc`, and crates.io rejects publishing crates with
# bare wildcard requirements — consider pinning a concrete version range.
cc="*"
[features]
# No optional functionality is enabled by default.
default = []
# Turns on the optional `llama-cpp-2` dependency.
llama_cpp = ["dep:llama-cpp-2"]
# Enable the corresponding feature of the `llama-cpp-2` dependency.
metal = ["llama-cpp-2/metal"]
cuda = ["llama-cpp-2/cuda"]
# Dependencies used only when building tests, examples and benchmarks.
[dev-dependencies]
assert_cmd = "2.0.14"

View File

@ -1,5 +1,6 @@
use ignore::WalkBuilder;
use std::collections::HashSet;
use tracing::{error, instrument};
use crate::config::{self, Config};
@ -18,14 +19,11 @@ impl Crawl {
}
}
pub fn crawl_config(&self) -> &config::Crawl {
&self.crawl_config
}
#[instrument(skip(self, f))]
pub fn maybe_do_crawl(
&mut self,
triggered_file: Option<String>,
mut f: impl FnMut(&config::Crawl, &str) -> anyhow::Result<()>,
mut f: impl FnMut(&config::Crawl, &str) -> anyhow::Result<bool>,
) -> anyhow::Result<()> {
if let Some(root_uri) = &self.config.client_params.root_uri {
if !root_uri.starts_with("file://") {
@ -56,7 +54,14 @@ impl Crawl {
if !path.is_dir() {
if let Some(path_str) = path.to_str() {
if self.crawl_config.all_files {
f(&self.crawl_config, path_str)?;
match f(&self.crawl_config, path_str) {
Ok(c) => {
if !c {
return Ok(());
}
}
Err(e) => error!("{e:?}"),
}
} else {
match (
path.extension().map(|pe| pe.to_str()).flatten(),
@ -64,7 +69,14 @@ impl Crawl {
) {
(Some(path_extension), Some(extension_to_match)) => {
if path_extension == extension_to_match {
f(&self.crawl_config, path_str)?;
match f(&self.crawl_config, path_str) {
Ok(c) => {
if !c {
return Ok(());
}
}
Err(e) => error!("{e:?}"),
}
}
}
_ => continue,

View File

@ -4,8 +4,8 @@ use lsp_types::TextDocumentPositionParams;
use parking_lot::Mutex;
use ropey::Rope;
use serde_json::Value;
use std::collections::HashMap;
use tracing::{error, instrument};
use std::{collections::HashMap, io::Read};
use tracing::{error, instrument, warn};
use tree_sitter::{InputEdit, Point, Tree};
use crate::{
@ -114,18 +114,37 @@ impl FileStore {
}
fn maybe_do_crawl(&self, triggered_file: Option<String>) -> anyhow::Result<()> {
let mut total_bytes = 0;
let mut current_bytes = 0;
if let Some(crawl) = &self.crawl {
crawl
.lock()
.maybe_do_crawl(triggered_file, |config, path| {
// Break if total bytes is over the max crawl memory
if total_bytes as u64 >= config.max_crawl_memory {
warn!("Ending crawl early due to `max_crawl_memory` resetraint");
return Ok(false);
}
// This means it has been opened before
let insert_uri = format!("file://{path}");
if self.file_map.lock().contains_key(&insert_uri) {
return Ok(());
return Ok(true);
}
// TODO: actually limit files based on config
let contents = std::fs::read_to_string(path)?;
// Open the file and see if it is small enough to read
let mut f = std::fs::File::open(path)?;
let metadata = f.metadata()?;
if metadata.len() > config.max_file_size {
warn!("Skipping file: {path} because it is too large");
return Ok(true);
}
// Read the file contents
let mut contents = vec![];
f.read_to_end(&mut contents)?;
let contents = String::from_utf8(contents)?;
current_bytes += contents.len();
total_bytes += contents.len();
self.add_new_file(&insert_uri, contents);
Ok(())
Ok(true)
})?;
}
Ok(())

View File

@ -251,29 +251,31 @@ impl PostgresML {
crawl
.lock()
.maybe_do_crawl(triggered_file, |config, path| {
let uri = format!("file://{path}");
// Break if total bytes is over the max crawl memory
if total_bytes as u64 >= config.max_crawl_memory {
warn!("Ending crawl early due to `max_crawl_memory` resetraint");
return Ok(false);
}
// This means it has been opened before
let uri = format!("file://{path}");
if self.file_store.contains_file(&uri) {
return Ok(());
return Ok(true);
}
// Open the file and see if it is small enough to read
let mut f = std::fs::File::open(path)?;
if f.metadata()
.map(|m| m.len() > config.max_file_size)
.unwrap_or(true)
{
warn!("Skipping file because it is too large: {path}");
return Ok(());
let metadata = f.metadata()?;
if metadata.len() > config.max_file_size {
warn!("Skipping file: {path} because it is too large");
return Ok(true);
}
// Read the file contents
let mut contents = vec![];
f.read_to_end(&mut contents);
if let Ok(contents) = String::from_utf8(contents) {
current_bytes += contents.len();
total_bytes += contents.len();
let chunks = self.splitter.split_file_contents(&uri, &contents);
documents.push((uri, chunks));
}
f.read_to_end(&mut contents)?;
let contents = String::from_utf8(contents)?;
current_bytes += contents.len();
total_bytes += contents.len();
let chunks = self.splitter.split_file_contents(&uri, &contents);
documents.push((uri, chunks));
// If we have over 100 megabytes of data, do the upsert
if current_bytes >= 100_000_000 || total_bytes as u64 >= config.max_crawl_memory
{
@ -305,12 +307,7 @@ impl PostgresML {
current_bytes = 0;
documents = vec![];
}
// Break if total bytes is over the max crawl memory
if total_bytes as u64 >= config.max_crawl_memory {
warn!("Ending crawl eraly do to max_crawl_memory");
return Ok(());
}
Ok(())
Ok(true)
})?;
}
Ok(())

View File

@ -41,7 +41,7 @@ impl Splitter for TreeSitter {
Ok(chunks) => chunks,
Err(e) => {
error!(
"Failed to parse tree for file with error {e:?}. Falling back to default splitter.",
"Failed to parse tree for file with error: {e:?}. Falling back to default splitter.",
);
todo!()
}
@ -57,14 +57,14 @@ impl Splitter for TreeSitter {
Ok(chunks) => chunks,
Err(e) => {
error!(
"Failed to parse tree for file: {uri} with error {e:?}. Falling back to default splitter.",
"Failed to parse tree for file: {uri} with error: {e:?}. Falling back to default splitter.",
);
todo!()
}
},
Err(e) => {
error!(
"Failed to parse tree for file {uri} with error {e:?}. Falling back to default splitter.",
"Failed to parse tree for file {uri} with error: {e:?}. Falling back to default splitter.",
);
todo!()
}

@ -0,0 +1 @@
Subproject commit 37a2e98cce5a1b39f07aec7e5b3bc75eebb41ac2

@ -0,0 +1 @@
Subproject commit a38e7143bcab2412348fd92904cc5105117896a1