mirror of https://github.com/zed-industries/zed.git
synced 2024-09-18 18:08:07 +03:00

Port semantic_index to gpui2

Co-Authored-By: Julia Risley <julia@zed.dev>

This commit is contained in:
parent d433da1e70
commit 09db455db2
51  Cargo.lock  generated
@@ -8232,6 +8232,57 @@ dependencies = [
  "workspace",
 ]
 
+[[package]]
+name = "semantic_index2"
+version = "0.1.0"
+dependencies = [
+ "ai2",
+ "anyhow",
+ "async-trait",
+ "client2",
+ "collections",
+ "ctor",
+ "env_logger 0.9.3",
+ "futures 0.3.28",
+ "globset",
+ "gpui2",
+ "language2",
+ "lazy_static",
+ "log",
+ "ndarray",
+ "node_runtime",
+ "ordered-float 2.10.0",
+ "parking_lot 0.11.2",
+ "postage",
+ "pretty_assertions",
+ "project2",
+ "rand 0.8.5",
+ "rpc2",
+ "rusqlite",
+ "rust-embed",
+ "schemars",
+ "serde",
+ "serde_json",
+ "settings2",
+ "sha1",
+ "smol",
+ "tempdir",
+ "tiktoken-rs",
+ "tree-sitter",
+ "tree-sitter-cpp",
+ "tree-sitter-elixir",
+ "tree-sitter-json 0.20.0",
+ "tree-sitter-lua",
+ "tree-sitter-php",
+ "tree-sitter-ruby",
+ "tree-sitter-rust",
+ "tree-sitter-toml",
+ "tree-sitter-typescript",
+ "unindent",
+ "util",
+ "workspace2",
+]
+
 [[package]]
 name = "semver"
 version = "1.0.18"
@@ -95,6 +95,8 @@ members = [
     "crates/rpc2",
     "crates/search",
     "crates/search2",
+    "crates/semantic_index",
+    "crates/semantic_index2",
     "crates/settings",
     "crates/settings2",
     "crates/snippet",
@@ -114,7 +116,6 @@ members = [
     "crates/theme_selector2",
     "crates/ui2",
     "crates/util",
-    "crates/semantic_index",
     "crates/story",
     "crates/vim",
     "crates/vcs_menu",
@@ -7,7 +7,7 @@ pub enum ProviderCredential {
     NotNeeded,
 }
 
-pub trait CredentialProvider {
+pub trait CredentialProvider: Send + Sync {
     fn has_credentials(&self) -> bool;
     fn retrieve_credentials(&self, cx: &mut AppContext) -> ProviderCredential;
     fn save_credentials(&self, cx: &mut AppContext, credential: ProviderCredential);
@@ -35,7 +35,7 @@ pub struct OpenAIEmbeddingProvider {
     model: OpenAILanguageModel,
     credential: Arc<RwLock<ProviderCredential>>,
     pub client: Arc<dyn HttpClient>,
-    pub executor: Arc<BackgroundExecutor>,
+    pub executor: BackgroundExecutor,
     rate_limit_count_rx: watch::Receiver<Option<Instant>>,
     rate_limit_count_tx: Arc<Mutex<watch::Sender<Option<Instant>>>>,
 }
@@ -66,7 +66,7 @@ struct OpenAIEmbeddingUsage {
 }
 
 impl OpenAIEmbeddingProvider {
-    pub fn new(client: Arc<dyn HttpClient>, executor: Arc<BackgroundExecutor>) -> Self {
+    pub fn new(client: Arc<dyn HttpClient>, executor: BackgroundExecutor) -> Self {
         let (rate_limit_count_tx, rate_limit_count_rx) = watch::channel_with(None);
         let rate_limit_count_tx = Arc::new(Mutex::new(rate_limit_count_tx));
 
@@ -482,10 +482,6 @@ impl<T: 'static> WeakModel<T> {
     /// Update the entity referenced by this model with the given function if
     /// the referenced entity still exists. Returns an error if the entity has
     /// been released.
-    ///
-    /// The update function receives a context appropriate for its environment.
-    /// When updating in an `AppContext`, it receives a `ModelContext`.
-    /// When updating in a `WindowContext`, it receives a `ViewContext`.
     pub fn update<C, R>(
         &self,
         cx: &mut C,
@@ -501,6 +497,21 @@ impl<T: 'static> WeakModel<T> {
                 .map(|this| cx.update_model(&this, update)),
         )
     }
+
+    /// Reads the entity referenced by this model with the given function if
+    /// the referenced entity still exists. Returns an error if the entity has
+    /// been released.
+    pub fn read_with<C, R>(&self, cx: &C, read: impl FnOnce(&T, &AppContext) -> R) -> Result<R>
+    where
+        C: Context,
+        Result<C::Result<R>>: crate::Flatten<R>,
+    {
+        crate::Flatten::flatten(
+            self.upgrade()
+                .ok_or_else(|| anyhow!("entity release"))
+                .map(|this| cx.read_model(&this, read)),
+        )
+    }
 }
 
 impl<T> Hash for WeakModel<T> {
69  crates/semantic_index2/Cargo.toml  Normal file
@@ -0,0 +1,69 @@
[package]
name = "semantic_index2"
version = "0.1.0"
edition = "2021"
publish = false

[lib]
path = "src/semantic_index.rs"
doctest = false

[dependencies]
ai = { package = "ai2", path = "../ai2" }
collections = { path = "../collections" }
gpui = { package = "gpui2", path = "../gpui2" }
language = { package = "language2", path = "../language2" }
project = { package = "project2", path = "../project2" }
workspace = { package = "workspace2", path = "../workspace2" }
util = { path = "../util" }
rpc = { package = "rpc2", path = "../rpc2" }
settings = { package = "settings2", path = "../settings2" }
anyhow.workspace = true
postage.workspace = true
futures.workspace = true
ordered-float.workspace = true
smol.workspace = true
rusqlite.workspace = true
log.workspace = true
tree-sitter.workspace = true
lazy_static.workspace = true
serde.workspace = true
serde_json.workspace = true
async-trait.workspace = true
tiktoken-rs.workspace = true
parking_lot.workspace = true
rand.workspace = true
schemars.workspace = true
globset.workspace = true
sha1 = "0.10.5"
ndarray = { version = "0.15.0" }

[dev-dependencies]
ai = { package = "ai2", path = "../ai2", features = ["test-support"] }
collections = { path = "../collections", features = ["test-support"] }
gpui = { package = "gpui2", path = "../gpui2", features = ["test-support"] }
language = { package = "language2", path = "../language2", features = ["test-support"] }
project = { package = "project2", path = "../project2", features = ["test-support"] }
rpc = { package = "rpc2", path = "../rpc2", features = ["test-support"] }
workspace = { package = "workspace2", path = "../workspace2", features = ["test-support"] }
settings = { package = "settings2", path = "../settings2", features = ["test-support"] }
rust-embed = { version = "8.0", features = ["include-exclude"] }
client = { package = "client2", path = "../client2" }
node_runtime = { path = "../node_runtime" }

pretty_assertions.workspace = true
rand.workspace = true
unindent.workspace = true
tempdir.workspace = true
ctor.workspace = true
env_logger.workspace = true

tree-sitter-typescript.workspace = true
tree-sitter-json.workspace = true
tree-sitter-rust.workspace = true
tree-sitter-toml.workspace = true
tree-sitter-cpp.workspace = true
tree-sitter-elixir.workspace = true
tree-sitter-lua.workspace = true
tree-sitter-ruby.workspace = true
tree-sitter-php.workspace = true
20  crates/semantic_index2/README.md  Normal file
@@ -0,0 +1,20 @@
# Semantic Index

## Evaluation

### Metrics

nDCG@k:
- "The value of NDCG is determined by comparing the relevance of the items returned by the search engine to the relevance of the item that a hypothetical 'ideal' search engine would return."
- "The relevance of a result is represented by a score (also known as a 'grade') that is assigned to the search query. The scores of these results are then discounted based on their position in the search results -- did they get recommended first or last?"

MRR@k:
- "Mean reciprocal rank quantifies the rank of the first relevant item found in the recommendation list."

MAP@k:
- "Mean average precision averages the precision@k metric at each relevant item position in the recommendation list."
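In symbols, these are the standard definitions (with $rel_i$ the relevance grade at rank $i$, $\mathrm{IDCG@k}$ the DCG of the ideal ordering, $\mathrm{rank}_q$ the position of the first relevant result for query $q$, $R_q$ the number of relevant results for $q$, and $P_q(i)$ the precision at cut-off $i$):

```latex
\mathrm{DCG@k}  = \sum_{i=1}^{k} \frac{rel_i}{\log_2(i + 1)}, \qquad
\mathrm{nDCG@k} = \frac{\mathrm{DCG@k}}{\mathrm{IDCG@k}}

\mathrm{MRR@k}  = \frac{1}{|Q|} \sum_{q \in Q} \frac{1}{\mathrm{rank}_q}, \qquad
\mathrm{MAP@k}  = \frac{1}{|Q|} \sum_{q \in Q} \frac{1}{R_q} \sum_{i=1}^{k} P_q(i) \, rel_q(i)
```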
Resources:
- [Evaluating recommendation metrics](https://www.shaped.ai/blog/evaluating-recommendation-systems-map-mmr-ndcg)
- [Math Walkthrough](https://towardsdatascience.com/demystifying-ndcg-bee3be58cfe0)
114  crates/semantic_index2/eval/gpt-engineer.json  Normal file
@@ -0,0 +1,114 @@
{
    "repo": "https://github.com/AntonOsika/gpt-engineer.git",
    "commit": "7735a6445bae3611c62f521e6464c67c957f87c2",
    "assertions": [
        {
            "query": "How do I contribute to this project?",
            "matches": [
                ".github/CONTRIBUTING.md:1",
                "ROADMAP.md:48"
            ]
        },
        {
            "query": "What version of the openai package is active?",
            "matches": [
                "pyproject.toml:14"
            ]
        },
        {
            "query": "Ask user for clarification",
            "matches": [
                "gpt_engineer/steps.py:69"
            ]
        },
        {
            "query": "generate tests for python code",
            "matches": [
                "gpt_engineer/steps.py:153"
            ]
        },
        {
            "query": "get item from database based on key",
            "matches": [
                "gpt_engineer/db.py:42",
                "gpt_engineer/db.py:68"
            ]
        },
        {
            "query": "prompt user to select files",
            "matches": [
                "gpt_engineer/file_selector.py:171",
                "gpt_engineer/file_selector.py:306",
                "gpt_engineer/file_selector.py:289",
                "gpt_engineer/file_selector.py:234"
            ]
        },
        {
            "query": "send to rudderstack",
            "matches": [
                "gpt_engineer/collect.py:11",
                "gpt_engineer/collect.py:38"
            ]
        },
        {
            "query": "parse code blocks from chat messages",
            "matches": [
                "gpt_engineer/chat_to_files.py:10",
                "docs/intro/chat_parsing.md:1"
            ]
        },
        {
            "query": "how do I use the docker cli?",
            "matches": [
                "docker/README.md:1"
            ]
        },
        {
            "query": "ask the user if the code ran successfully?",
            "matches": [
                "gpt_engineer/learning.py:54"
            ]
        },
        {
            "query": "how is consent granted by the user?",
            "matches": [
                "gpt_engineer/learning.py:107",
                "gpt_engineer/learning.py:130",
                "gpt_engineer/learning.py:152"
            ]
        },
        {
            "query": "what are all the different steps the agent can take?",
            "matches": [
                "docs/intro/steps_module.md:1",
                "gpt_engineer/steps.py:391"
            ]
        },
        {
            "query": "ask the user for clarification?",
            "matches": [
                "gpt_engineer/steps.py:69"
            ]
        },
        {
            "query": "what models are available?",
            "matches": [
                "gpt_engineer/ai.py:315",
                "gpt_engineer/ai.py:341",
                "docs/open-models.md:1"
            ]
        },
        {
            "query": "what is the current focus of the project?",
            "matches": [
                "ROADMAP.md:11"
            ]
        },
        {
            "query": "does the agent know how to fix code?",
            "matches": [
                "gpt_engineer/steps.py:367"
            ]
        }
    ]
}
104  crates/semantic_index2/eval/tree-sitter.json  Normal file
@@ -0,0 +1,104 @@
{
    "repo": "https://github.com/tree-sitter/tree-sitter.git",
    "commit": "46af27796a76c72d8466627d499f2bca4af958ee",
    "assertions": [
        {
            "query": "What attributes are available for the tags configuration struct?",
            "matches": [
                "tags/src/lib.rs:24"
            ]
        },
        {
            "query": "create a new tag configuration",
            "matches": [
                "tags/src/lib.rs:119"
            ]
        },
        {
            "query": "generate tags based on config",
            "matches": [
                "tags/src/lib.rs:261"
            ]
        },
        {
            "query": "match on ts quantifier in rust",
            "matches": [
                "lib/binding_rust/lib.rs:139"
            ]
        },
        {
            "query": "cli command to generate tags",
            "matches": [
                "cli/src/tags.rs:10"
            ]
        },
        {
            "query": "what version of the tree-sitter-tags package is active?",
            "matches": [
                "tags/Cargo.toml:4"
            ]
        },
        {
            "query": "Insert a new parse state",
            "matches": [
                "cli/src/generate/build_tables/build_parse_table.rs:153"
            ]
        },
        {
            "query": "Handle conflict when numerous actions occur on the same symbol",
            "matches": [
                "cli/src/generate/build_tables/build_parse_table.rs:363",
                "cli/src/generate/build_tables/build_parse_table.rs:442"
            ]
        },
        {
            "query": "Match based on associativity of actions",
            "matches": [
                "cli/src/generate/build_tables/build_parse_table.rs:542"
            ]
        },
        {
            "query": "Format token set display",
            "matches": [
                "cli/src/generate/build_tables/item.rs:246"
            ]
        },
        {
            "query": "extract choices from rule",
            "matches": [
                "cli/src/generate/prepare_grammar/flatten_grammar.rs:124"
            ]
        },
        {
            "query": "How do we identify if a symbol is being used?",
            "matches": [
                "cli/src/generate/prepare_grammar/flatten_grammar.rs:175"
            ]
        },
        {
            "query": "How do we launch the playground?",
            "matches": [
                "cli/src/playground.rs:46"
            ]
        },
        {
            "query": "How do we test treesitter query matches in rust?",
            "matches": [
                "cli/src/query_testing.rs:152",
                "cli/src/tests/query_test.rs:781",
                "cli/src/tests/query_test.rs:2163",
                "cli/src/tests/query_test.rs:3781",
                "cli/src/tests/query_test.rs:887"
            ]
        },
        {
            "query": "What does the CLI do?",
            "matches": [
                "cli/README.md:10",
                "cli/loader/README.md:3",
                "docs/section-5-implementation.md:14",
                "docs/section-5-implementation.md:18"
            ]
        }
    ]
}
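Both eval files share one shape: a pinned repo and commit, plus a list of queries, each paired with the "path:line" spans an ideal retrieval would surface. A minimal serde sketch of that format (struct and field names here are illustrative, not taken from the crate's eval harness):

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct EvalFile {
    /// Repository to clone and index.
    repo: String,
    /// Commit to check out, keeping results reproducible.
    commit: String,
    /// Queries paired with the spans an ideal search would return.
    assertions: Vec<Assertion>,
}

#[derive(Debug, Deserialize)]
struct Assertion {
    query: String,
    /// Expected matches, encoded as "relative/path:line".
    matches: Vec<String>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The path is an assumption; point this at either eval file.
    let raw = std::fs::read_to_string("crates/semantic_index2/eval/tree-sitter.json")?;
    let eval: EvalFile = serde_json::from_str(&raw)?;
    println!("{} assertions against {}", eval.assertions.len(), eval.repo);
    Ok(())
}
```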
603  crates/semantic_index2/src/db.rs  Normal file
@@ -0,0 +1,603 @@
use crate::{
    parsing::{Span, SpanDigest},
    SEMANTIC_INDEX_VERSION,
};
use ai::embedding::Embedding;
use anyhow::{anyhow, Context, Result};
use collections::HashMap;
use futures::channel::oneshot;
use gpui::BackgroundExecutor;
use ndarray::{Array1, Array2};
use ordered_float::OrderedFloat;
use project::Fs;
use rpc::proto::Timestamp;
use rusqlite::params;
use rusqlite::types::Value;
use std::{
    future::Future,
    ops::Range,
    path::{Path, PathBuf},
    rc::Rc,
    sync::Arc,
    time::SystemTime,
};
use util::{paths::PathMatcher, TryFutureExt};

/// Indices of `data` sorted so the largest value comes first,
/// e.g. argsort(&[3, 1, 2]) == [0, 2, 1].
pub fn argsort<T: Ord>(data: &[T]) -> Vec<usize> {
    let mut indices = (0..data.len()).collect::<Vec<_>>();
    indices.sort_by_key(|&i| &data[i]);
    indices.reverse();
    indices
}

#[derive(Debug)]
pub struct FileRecord {
    pub id: usize,
    pub relative_path: String,
    pub mtime: Timestamp,
}

#[derive(Clone)]
pub struct VectorDatabase {
    path: Arc<Path>,
    transactions:
        smol::channel::Sender<Box<dyn 'static + Send + FnOnce(&mut rusqlite::Connection)>>,
}

impl VectorDatabase {
    pub async fn new(
        fs: Arc<dyn Fs>,
        path: Arc<Path>,
        executor: BackgroundExecutor,
    ) -> Result<Self> {
        if let Some(db_directory) = path.parent() {
            fs.create_dir(db_directory).await?;
        }

        let (transactions_tx, transactions_rx) = smol::channel::unbounded::<
            Box<dyn 'static + Send + FnOnce(&mut rusqlite::Connection)>,
        >();
        executor
            .spawn({
                let path = path.clone();
                async move {
                    let mut connection = rusqlite::Connection::open(&path)?;

                    connection.pragma_update(None, "journal_mode", "wal")?;
                    connection.pragma_update(None, "synchronous", "normal")?;
                    connection.pragma_update(None, "cache_size", 1000000)?;
                    connection.pragma_update(None, "temp_store", "MEMORY")?;

                    while let Ok(transaction) = transactions_rx.recv().await {
                        transaction(&mut connection);
                    }

                    anyhow::Ok(())
                }
                .log_err()
            })
            .detach();
        let this = Self {
            transactions: transactions_tx,
            path,
        };
        this.initialize_database().await?;
        Ok(this)
    }

    pub fn path(&self) -> &Arc<Path> {
        &self.path
    }

    fn transact<F, T>(&self, f: F) -> impl Future<Output = Result<T>>
    where
        F: 'static + Send + FnOnce(&rusqlite::Transaction) -> Result<T>,
        T: 'static + Send,
    {
        let (tx, rx) = oneshot::channel();
        let transactions = self.transactions.clone();
        async move {
            if transactions
                .send(Box::new(|connection| {
                    let result = connection
                        .transaction()
                        .map_err(|err| anyhow!(err))
                        .and_then(|transaction| {
                            let result = f(&transaction)?;
                            transaction.commit()?;
                            Ok(result)
                        });
                    let _ = tx.send(result);
                }))
                .await
                .is_err()
            {
                return Err(anyhow!("connection was dropped"))?;
            }
            rx.await?
        }
    }

    fn initialize_database(&self) -> impl Future<Output = Result<()>> {
        self.transact(|db| {
            rusqlite::vtab::array::load_module(&db)?;

            // Delete existing tables, if SEMANTIC_INDEX_VERSION is bumped
            let version_query = db.prepare("SELECT version from semantic_index_config");
            let version = version_query
                .and_then(|mut query| query.query_row([], |row| Ok(row.get::<_, i64>(0)?)));
            if version.map_or(false, |version| version == SEMANTIC_INDEX_VERSION as i64) {
                log::trace!("vector database schema up to date");
                return Ok(());
            }

            log::trace!("vector database schema out of date. updating...");
            // We renamed the `documents` table to `spans`, so we want to drop
            // `documents` without recreating it if it exists.
            db.execute("DROP TABLE IF EXISTS documents", [])
                .context("failed to drop 'documents' table")?;
            db.execute("DROP TABLE IF EXISTS spans", [])
                .context("failed to drop 'spans' table")?;
            db.execute("DROP TABLE IF EXISTS files", [])
                .context("failed to drop 'files' table")?;
            db.execute("DROP TABLE IF EXISTS worktrees", [])
                .context("failed to drop 'worktrees' table")?;
            db.execute("DROP TABLE IF EXISTS semantic_index_config", [])
                .context("failed to drop 'semantic_index_config' table")?;

            // Initialize Vector Databasing Tables
            db.execute(
                "CREATE TABLE semantic_index_config (
                    version INTEGER NOT NULL
                )",
                [],
            )?;

            db.execute(
                "INSERT INTO semantic_index_config (version) VALUES (?1)",
                params![SEMANTIC_INDEX_VERSION],
            )?;

            db.execute(
                "CREATE TABLE worktrees (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    absolute_path VARCHAR NOT NULL
                );
                CREATE UNIQUE INDEX worktrees_absolute_path ON worktrees (absolute_path);
                ",
                [],
            )?;

            db.execute(
                "CREATE TABLE files (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    worktree_id INTEGER NOT NULL,
                    relative_path VARCHAR NOT NULL,
                    mtime_seconds INTEGER NOT NULL,
                    mtime_nanos INTEGER NOT NULL,
                    FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
                )",
                [],
            )?;

            db.execute(
                "CREATE UNIQUE INDEX files_worktree_id_and_relative_path ON files (worktree_id, relative_path)",
                [],
            )?;

            db.execute(
                "CREATE TABLE spans (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL,
                    start_byte INTEGER NOT NULL,
                    end_byte INTEGER NOT NULL,
                    name VARCHAR NOT NULL,
                    embedding BLOB NOT NULL,
                    digest BLOB NOT NULL,
                    FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
                )",
                [],
            )?;
            db.execute("CREATE INDEX spans_digest ON spans (digest)", [])?;

            log::trace!("vector database initialized with updated schema.");
            Ok(())
        })
    }

    pub fn delete_file(
        &self,
        worktree_id: i64,
        delete_path: Arc<Path>,
    ) -> impl Future<Output = Result<()>> {
        self.transact(move |db| {
            db.execute(
                "DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2",
                params![worktree_id, delete_path.to_str()],
            )?;
            Ok(())
        })
    }

    pub fn insert_file(
        &self,
        worktree_id: i64,
        path: Arc<Path>,
        mtime: SystemTime,
        spans: Vec<Span>,
    ) -> impl Future<Output = Result<()>> {
        self.transact(move |db| {
            // Return the existing ID, if both the file and mtime match
            let mtime = Timestamp::from(mtime);

            db.execute(
                "
                REPLACE INTO files
                (worktree_id, relative_path, mtime_seconds, mtime_nanos)
                VALUES (?1, ?2, ?3, ?4)
                ",
                params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
            )?;

            let file_id = db.last_insert_rowid();

            let mut query = db.prepare(
                "
                INSERT INTO spans
                (file_id, start_byte, end_byte, name, embedding, digest)
                VALUES (?1, ?2, ?3, ?4, ?5, ?6)
                ",
            )?;

            for span in spans {
                query.execute(params![
                    file_id,
                    span.range.start.to_string(),
                    span.range.end.to_string(),
                    span.name,
                    span.embedding,
                    span.digest
                ])?;
            }

            Ok(())
        })
    }

    pub fn worktree_previously_indexed(
        &self,
        worktree_root_path: &Path,
    ) -> impl Future<Output = Result<bool>> {
        let worktree_root_path = worktree_root_path.to_string_lossy().into_owned();
        self.transact(move |db| {
            let mut worktree_query =
                db.prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?;
            let worktree_id = worktree_query
                .query_row(params![worktree_root_path], |row| Ok(row.get::<_, i64>(0)?));

            if worktree_id.is_ok() {
                return Ok(true);
            } else {
                return Ok(false);
            }
        })
    }

    pub fn embeddings_for_digests(
        &self,
        digests: Vec<SpanDigest>,
    ) -> impl Future<Output = Result<HashMap<SpanDigest, Embedding>>> {
        self.transact(move |db| {
            let mut query = db.prepare(
                "
                SELECT digest, embedding
                FROM spans
                WHERE digest IN rarray(?)
                ",
            )?;
            let mut embeddings_by_digest = HashMap::default();
            let digests = Rc::new(
                digests
                    .into_iter()
                    .map(|p| Value::Blob(p.0.to_vec()))
                    .collect::<Vec<_>>(),
            );
            let rows = query.query_map(params![digests], |row| {
                Ok((row.get::<_, SpanDigest>(0)?, row.get::<_, Embedding>(1)?))
            })?;

            for row in rows {
                if let Ok(row) = row {
                    embeddings_by_digest.insert(row.0, row.1);
                }
            }

            Ok(embeddings_by_digest)
        })
    }

    pub fn embeddings_for_files(
        &self,
        worktree_id_file_paths: HashMap<i64, Vec<Arc<Path>>>,
    ) -> impl Future<Output = Result<HashMap<SpanDigest, Embedding>>> {
        self.transact(move |db| {
            let mut query = db.prepare(
                "
                SELECT digest, embedding
                FROM spans
                LEFT JOIN files ON files.id = spans.file_id
                WHERE files.worktree_id = ? AND files.relative_path IN rarray(?)
                ",
            )?;
            let mut embeddings_by_digest = HashMap::default();
            for (worktree_id, file_paths) in worktree_id_file_paths {
                let file_paths = Rc::new(
                    file_paths
                        .into_iter()
                        .map(|p| Value::Text(p.to_string_lossy().into_owned()))
                        .collect::<Vec<_>>(),
                );
                let rows = query.query_map(params![worktree_id, file_paths], |row| {
                    Ok((row.get::<_, SpanDigest>(0)?, row.get::<_, Embedding>(1)?))
                })?;

                for row in rows {
                    if let Ok(row) = row {
                        embeddings_by_digest.insert(row.0, row.1);
                    }
                }
            }

            Ok(embeddings_by_digest)
        })
    }

    pub fn find_or_create_worktree(
        &self,
        worktree_root_path: Arc<Path>,
    ) -> impl Future<Output = Result<i64>> {
        self.transact(move |db| {
            let mut worktree_query =
                db.prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?;
            let worktree_id = worktree_query
                .query_row(params![worktree_root_path.to_string_lossy()], |row| {
                    Ok(row.get::<_, i64>(0)?)
                });

            if worktree_id.is_ok() {
                return Ok(worktree_id?);
            }

            // If worktree_id is Err, insert new worktree
            db.execute(
                "INSERT into worktrees (absolute_path) VALUES (?1)",
                params![worktree_root_path.to_string_lossy()],
            )?;
            Ok(db.last_insert_rowid())
        })
    }

    pub fn get_file_mtimes(
        &self,
        worktree_id: i64,
    ) -> impl Future<Output = Result<HashMap<PathBuf, SystemTime>>> {
        self.transact(move |db| {
            let mut statement = db.prepare(
                "
                SELECT relative_path, mtime_seconds, mtime_nanos
                FROM files
                WHERE worktree_id = ?1
                ORDER BY relative_path",
            )?;
            let mut result: HashMap<PathBuf, SystemTime> = HashMap::default();
            for row in statement.query_map(params![worktree_id], |row| {
                Ok((
                    row.get::<_, String>(0)?.into(),
                    Timestamp {
                        seconds: row.get(1)?,
                        nanos: row.get(2)?,
                    }
                    .into(),
                ))
            })? {
                let row = row?;
                result.insert(row.0, row.1);
            }
            Ok(result)
        })
    }

    pub fn top_k_search(
        &self,
        query_embedding: &Embedding,
        limit: usize,
        file_ids: &[i64],
    ) -> impl Future<Output = Result<Vec<(i64, OrderedFloat<f32>)>>> {
        let file_ids = file_ids.to_vec();
        let query = query_embedding.clone().0;
        let query = Array1::from_vec(query);
        self.transact(move |db| {
            let mut query_statement = db.prepare(
                "
                SELECT
                    id, embedding
                FROM
                    spans
                WHERE
                    file_id IN rarray(?)
                ",
            )?;

            let deserialized_rows = query_statement
                .query_map(params![ids_to_sql(&file_ids)], |row| {
                    Ok((row.get::<_, usize>(0)?, row.get::<_, Embedding>(1)?))
                })?
                .filter_map(|row| row.ok())
                .collect::<Vec<(usize, Embedding)>>();

            if deserialized_rows.len() == 0 {
                return Ok(Vec::new());
            }

            // Get Length of Embeddings Returned
            let embedding_len = deserialized_rows[0].1 .0.len();

            let batch_n = 1000;
            let mut batches = Vec::new();
            let mut batch_ids = Vec::new();
            let mut batch_embeddings: Vec<f32> = Vec::new();
            deserialized_rows.iter().for_each(|(id, embedding)| {
                batch_ids.push(id);
                batch_embeddings.extend(&embedding.0);

                if batch_ids.len() == batch_n {
                    let embeddings = std::mem::take(&mut batch_embeddings);
                    let ids = std::mem::take(&mut batch_ids);
                    let array =
                        Array2::from_shape_vec((ids.len(), embedding_len.clone()), embeddings);
                    match array {
                        Ok(array) => {
                            batches.push((ids, array));
                        }
                        Err(err) => log::error!("Failed to deserialize to ndarray: {:?}", err),
                    }
                }
            });

            if batch_ids.len() > 0 {
                let array = Array2::from_shape_vec(
                    (batch_ids.len(), embedding_len),
                    batch_embeddings.clone(),
                );
                match array {
                    Ok(array) => {
                        batches.push((batch_ids.clone(), array));
                    }
                    Err(err) => log::error!("Failed to deserialize to ndarray: {:?}", err),
                }
            }

            let mut ids: Vec<usize> = Vec::new();
            let mut results = Vec::new();
            for (batch_ids, array) in batches {
                // Dot product of each stored embedding against the query; for
                // unit-length embeddings (as OpenAI's are) this is cosine similarity.
                let scores = array
                    .dot(&query.t())
                    .to_vec()
                    .iter()
                    .map(|score| OrderedFloat(*score))
                    .collect::<Vec<OrderedFloat<f32>>>();
                results.extend(scores);
                ids.extend(batch_ids);
            }

            let sorted_idx = argsort(&results);
            let mut sorted_results = Vec::new();
            let last_idx = limit.min(sorted_idx.len());
            for idx in &sorted_idx[0..last_idx] {
                sorted_results.push((ids[*idx] as i64, results[*idx]))
            }

            Ok(sorted_results)
        })
    }

    pub fn retrieve_included_file_ids(
        &self,
        worktree_ids: &[i64],
        includes: &[PathMatcher],
        excludes: &[PathMatcher],
    ) -> impl Future<Output = Result<Vec<i64>>> {
        let worktree_ids = worktree_ids.to_vec();
        let includes = includes.to_vec();
        let excludes = excludes.to_vec();
        self.transact(move |db| {
            let mut file_query = db.prepare(
                "
                SELECT
                    id, relative_path
                FROM
                    files
                WHERE
                    worktree_id IN rarray(?)
                ",
            )?;

            let mut file_ids = Vec::<i64>::new();
            let mut rows = file_query.query([ids_to_sql(&worktree_ids)])?;

            while let Some(row) = rows.next()? {
                let file_id = row.get(0)?;
                let relative_path = row.get_ref(1)?.as_str()?;
                let included =
                    includes.is_empty() || includes.iter().any(|glob| glob.is_match(relative_path));
                let excluded = excludes.iter().any(|glob| glob.is_match(relative_path));
                if included && !excluded {
                    file_ids.push(file_id);
                }
            }

            anyhow::Ok(file_ids)
        })
    }

    pub fn spans_for_ids(
        &self,
        ids: &[i64],
    ) -> impl Future<Output = Result<Vec<(i64, PathBuf, Range<usize>)>>> {
        let ids = ids.to_vec();
        self.transact(move |db| {
            let mut statement = db.prepare(
                "
                SELECT
                    spans.id,
                    files.worktree_id,
                    files.relative_path,
                    spans.start_byte,
                    spans.end_byte
                FROM
                    spans, files
                WHERE
                    spans.file_id = files.id AND
                    spans.id in rarray(?)
                ",
            )?;

            let result_iter = statement.query_map(params![ids_to_sql(&ids)], |row| {
                Ok((
                    row.get::<_, i64>(0)?,
                    row.get::<_, i64>(1)?,
                    row.get::<_, String>(2)?.into(),
                    row.get(3)?..row.get(4)?,
                ))
            })?;

            let mut values_by_id = HashMap::<i64, (i64, PathBuf, Range<usize>)>::default();
            for row in result_iter {
                let (id, worktree_id, path, range) = row?;
                values_by_id.insert(id, (worktree_id, path, range));
            }

            let mut results = Vec::with_capacity(ids.len());
            for id in &ids {
                let value = values_by_id
                    .remove(id)
                    .ok_or(anyhow!("missing span id {}", id))?;
                results.push(value);
            }

            Ok(results)
        })
    }
}

fn ids_to_sql(ids: &[i64]) -> Rc<Vec<rusqlite::types::Value>> {
    Rc::new(
        ids.iter()
            .copied()
            .map(|v| rusqlite::types::Value::from(v))
            .collect::<Vec<_>>(),
    )
}
169  crates/semantic_index2/src/embedding_queue.rs  Normal file
@@ -0,0 +1,169 @@
use crate::{parsing::Span, JobHandle};
use ai::embedding::EmbeddingProvider;
use gpui::BackgroundExecutor;
use parking_lot::Mutex;
use smol::channel;
use std::{mem, ops::Range, path::Path, sync::Arc, time::SystemTime};

#[derive(Clone)]
pub struct FileToEmbed {
    pub worktree_id: i64,
    pub path: Arc<Path>,
    pub mtime: SystemTime,
    pub spans: Vec<Span>,
    pub job_handle: JobHandle,
}

impl std::fmt::Debug for FileToEmbed {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("FileToEmbed")
            .field("worktree_id", &self.worktree_id)
            .field("path", &self.path)
            .field("mtime", &self.mtime)
            .field("spans", &self.spans)
            .finish_non_exhaustive()
    }
}

impl PartialEq for FileToEmbed {
    fn eq(&self, other: &Self) -> bool {
        self.worktree_id == other.worktree_id
            && self.path == other.path
            && self.mtime == other.mtime
            && self.spans == other.spans
    }
}

pub struct EmbeddingQueue {
    embedding_provider: Arc<dyn EmbeddingProvider>,
    pending_batch: Vec<FileFragmentToEmbed>,
    executor: BackgroundExecutor,
    pending_batch_token_count: usize,
    finished_files_tx: channel::Sender<FileToEmbed>,
    finished_files_rx: channel::Receiver<FileToEmbed>,
}

#[derive(Clone)]
pub struct FileFragmentToEmbed {
    file: Arc<Mutex<FileToEmbed>>,
    span_range: Range<usize>,
}

impl EmbeddingQueue {
    pub fn new(
        embedding_provider: Arc<dyn EmbeddingProvider>,
        executor: BackgroundExecutor,
    ) -> Self {
        let (finished_files_tx, finished_files_rx) = channel::unbounded();
        Self {
            embedding_provider,
            executor,
            pending_batch: Vec::new(),
            pending_batch_token_count: 0,
            finished_files_tx,
            finished_files_rx,
        }
    }

    pub fn push(&mut self, file: FileToEmbed) {
        if file.spans.is_empty() {
            self.finished_files_tx.try_send(file).unwrap();
            return;
        }

        let file = Arc::new(Mutex::new(file));

        self.pending_batch.push(FileFragmentToEmbed {
            file: file.clone(),
            span_range: 0..0,
        });

        let mut fragment_range = &mut self.pending_batch.last_mut().unwrap().span_range;
        for (ix, span) in file.lock().spans.iter().enumerate() {
            // Spans that already carry an embedding cost nothing against the batch budget.
            let span_token_count = if span.embedding.is_none() {
                span.token_count
            } else {
                0
            };

            let next_token_count = self.pending_batch_token_count + span_token_count;
            if next_token_count > self.embedding_provider.max_tokens_per_batch() {
                let range_end = fragment_range.end;
                self.flush();
                self.pending_batch.push(FileFragmentToEmbed {
                    file: file.clone(),
                    span_range: range_end..range_end,
                });
                fragment_range = &mut self.pending_batch.last_mut().unwrap().span_range;
            }

            fragment_range.end = ix + 1;
            self.pending_batch_token_count += span_token_count;
        }
    }

    pub fn flush(&mut self) {
        let batch = mem::take(&mut self.pending_batch);
        self.pending_batch_token_count = 0;
        if batch.is_empty() {
            return;
        }

        let finished_files_tx = self.finished_files_tx.clone();
        let embedding_provider = self.embedding_provider.clone();

        self.executor
            .spawn(async move {
                let mut spans = Vec::new();
                for fragment in &batch {
                    let file = fragment.file.lock();
                    spans.extend(
                        file.spans[fragment.span_range.clone()]
                            .iter()
                            .filter(|d| d.embedding.is_none())
                            .map(|d| d.content.clone()),
                    );
                }

                // If there are no spans to embed, just send each file to the finished
                // files channel once its last fragment is dropped.
                if spans.is_empty() {
                    for fragment in batch.clone() {
                        if let Some(file) = Arc::into_inner(fragment.file) {
                            finished_files_tx.try_send(file.into_inner()).unwrap();
                        }
                    }
                    return;
                }

                match embedding_provider.embed_batch(spans).await {
                    Ok(embeddings) => {
                        let mut embeddings = embeddings.into_iter();
                        for fragment in batch {
                            for span in &mut fragment.file.lock().spans[fragment.span_range.clone()]
                                .iter_mut()
                                .filter(|d| d.embedding.is_none())
                            {
                                if let Some(embedding) = embeddings.next() {
                                    span.embedding = Some(embedding);
                                } else {
                                    log::error!("number of embeddings != number of documents");
                                }
                            }

                            if let Some(file) = Arc::into_inner(fragment.file) {
                                finished_files_tx.try_send(file.into_inner()).unwrap();
                            }
                        }
                    }
                    Err(error) => {
                        log::error!("{:?}", error);
                    }
                }
            })
            .detach();
    }

    pub fn finished_files(&self) -> channel::Receiver<FileToEmbed> {
        self.finished_files_rx.clone()
    }
}
414  crates/semantic_index2/src/parsing.rs  Normal file
@@ -0,0 +1,414 @@
use ai::{
    embedding::{Embedding, EmbeddingProvider},
    models::TruncationDirection,
};
use anyhow::{anyhow, Result};
use language::{Grammar, Language};
use rusqlite::{
    types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
    ToSql,
};
use sha1::{Digest, Sha1};
use std::{
    borrow::Cow,
    cmp::{self, Reverse},
    collections::HashSet,
    ops::Range,
    path::Path,
    sync::Arc,
};
use tree_sitter::{Parser, QueryCursor};

#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub struct SpanDigest(pub [u8; 20]);

impl FromSql for SpanDigest {
    fn column_result(value: ValueRef) -> FromSqlResult<Self> {
        let blob = value.as_blob()?;
        let bytes =
            blob.try_into()
                .map_err(|_| rusqlite::types::FromSqlError::InvalidBlobSize {
                    expected_size: 20,
                    blob_size: blob.len(),
                })?;
        return Ok(SpanDigest(bytes));
    }
}

impl ToSql for SpanDigest {
    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
        self.0.to_sql()
    }
}

impl From<&'_ str> for SpanDigest {
    fn from(value: &'_ str) -> Self {
        let mut sha1 = Sha1::new();
        sha1.update(value);
        Self(sha1.finalize().into())
    }
}

#[derive(Debug, PartialEq, Clone)]
pub struct Span {
    pub name: String,
    pub range: Range<usize>,
    pub content: String,
    pub embedding: Option<Embedding>,
    pub digest: SpanDigest,
    pub token_count: usize,
}

const CODE_CONTEXT_TEMPLATE: &str =
    "The below code snippet is from file '<path>'\n\n```<language>\n<item>\n```";
const ENTIRE_FILE_TEMPLATE: &str =
    "The below snippet is from file '<path>'\n\n```<language>\n<item>\n```";
const MARKDOWN_CONTEXT_TEMPLATE: &str = "The below file contents is from file '<path>'\n\n<item>";
pub const PARSEABLE_ENTIRE_FILE_TYPES: &[&str] = &[
    "TOML", "YAML", "CSS", "HEEX", "ERB", "SVELTE", "HTML", "Scheme",
];

pub struct CodeContextRetriever {
    pub parser: Parser,
    pub cursor: QueryCursor,
    pub embedding_provider: Arc<dyn EmbeddingProvider>,
}

// Every match has an item; this represents the fundamental tree-sitter symbol and anchors the search.
// Every match has one or more 'name' captures. These indicate the display range of the item for deduplication.
// If there are preceding comments, we track this with a context capture.
// If there is a piece that should be collapsed in hierarchical queries, we capture it with a collapse capture.
// If there is a piece that should be kept inside a collapsed node, we capture it with a keep capture.
#[derive(Debug, Clone)]
pub struct CodeContextMatch {
    pub start_col: usize,
    pub item_range: Option<Range<usize>>,
    pub name_range: Option<Range<usize>>,
    pub context_ranges: Vec<Range<usize>>,
    pub collapse_ranges: Vec<Range<usize>>,
}

impl CodeContextRetriever {
    pub fn new(embedding_provider: Arc<dyn EmbeddingProvider>) -> Self {
        Self {
            parser: Parser::new(),
            cursor: QueryCursor::new(),
            embedding_provider,
        }
    }

    fn parse_entire_file(
        &self,
        relative_path: Option<&Path>,
        language_name: Arc<str>,
        content: &str,
    ) -> Result<Vec<Span>> {
        let document_span = ENTIRE_FILE_TEMPLATE
            .replace(
                "<path>",
                &relative_path.map_or(Cow::Borrowed("untitled"), |path| path.to_string_lossy()),
            )
            .replace("<language>", language_name.as_ref())
            .replace("<item>", &content);
        let digest = SpanDigest::from(document_span.as_str());
        let model = self.embedding_provider.base_model();
        let document_span = model.truncate(
            &document_span,
            model.capacity()?,
            ai::models::TruncationDirection::End,
        )?;
        let token_count = model.count_tokens(&document_span)?;

        Ok(vec![Span {
            range: 0..content.len(),
            content: document_span,
            embedding: Default::default(),
            name: language_name.to_string(),
            digest,
            token_count,
        }])
    }

    fn parse_markdown_file(
        &self,
        relative_path: Option<&Path>,
        content: &str,
    ) -> Result<Vec<Span>> {
        let document_span = MARKDOWN_CONTEXT_TEMPLATE
            .replace(
                "<path>",
                &relative_path.map_or(Cow::Borrowed("untitled"), |path| path.to_string_lossy()),
            )
            .replace("<item>", &content);
        let digest = SpanDigest::from(document_span.as_str());

        let model = self.embedding_provider.base_model();
        let document_span = model.truncate(
            &document_span,
            model.capacity()?,
            ai::models::TruncationDirection::End,
        )?;
        let token_count = model.count_tokens(&document_span)?;

        Ok(vec![Span {
            range: 0..content.len(),
            content: document_span,
            embedding: None,
            name: "Markdown".to_string(),
            digest,
            token_count,
        }])
    }

    fn get_matches_in_file(
        &mut self,
        content: &str,
        grammar: &Arc<Grammar>,
    ) -> Result<Vec<CodeContextMatch>> {
        let embedding_config = grammar
            .embedding_config
            .as_ref()
            .ok_or_else(|| anyhow!("no embedding queries"))?;
        self.parser.set_language(grammar.ts_language).unwrap();

        let tree = self
            .parser
            .parse(&content, None)
            .ok_or_else(|| anyhow!("parsing failed"))?;

        let mut captures: Vec<CodeContextMatch> = Vec::new();
        let mut collapse_ranges: Vec<Range<usize>> = Vec::new();
        let mut keep_ranges: Vec<Range<usize>> = Vec::new();
        for mat in self.cursor.matches(
            &embedding_config.query,
            tree.root_node(),
            content.as_bytes(),
        ) {
            let mut start_col = 0;
            let mut item_range: Option<Range<usize>> = None;
            let mut name_range: Option<Range<usize>> = None;
            let mut context_ranges: Vec<Range<usize>> = Vec::new();
            collapse_ranges.clear();
            keep_ranges.clear();
            for capture in mat.captures {
                if capture.index == embedding_config.item_capture_ix {
                    item_range = Some(capture.node.byte_range());
                    start_col = capture.node.start_position().column;
                } else if Some(capture.index) == embedding_config.name_capture_ix {
                    name_range = Some(capture.node.byte_range());
                } else if Some(capture.index) == embedding_config.context_capture_ix {
                    context_ranges.push(capture.node.byte_range());
                } else if Some(capture.index) == embedding_config.collapse_capture_ix {
                    collapse_ranges.push(capture.node.byte_range());
                } else if Some(capture.index) == embedding_config.keep_capture_ix {
                    keep_ranges.push(capture.node.byte_range());
                }
            }

            captures.push(CodeContextMatch {
                start_col,
                item_range,
                name_range,
                context_ranges,
                collapse_ranges: subtract_ranges(&collapse_ranges, &keep_ranges),
            });
        }
        Ok(captures)
    }

    pub fn parse_file_with_template(
        &mut self,
        relative_path: Option<&Path>,
        content: &str,
        language: Arc<Language>,
    ) -> Result<Vec<Span>> {
        let language_name = language.name();

        if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language_name.as_ref()) {
            return self.parse_entire_file(relative_path, language_name, &content);
        } else if ["Markdown", "Plain Text"].contains(&language_name.as_ref()) {
            return self.parse_markdown_file(relative_path, &content);
        }

        let mut spans = self.parse_file(content, language)?;
        for span in &mut spans {
            let document_content = CODE_CONTEXT_TEMPLATE
                .replace(
                    "<path>",
                    &relative_path.map_or(Cow::Borrowed("untitled"), |path| path.to_string_lossy()),
                )
                .replace("<language>", language_name.as_ref())
                .replace("<item>", &span.content);

            let model = self.embedding_provider.base_model();
            let document_content = model.truncate(
                &document_content,
                model.capacity()?,
                TruncationDirection::End,
            )?;
            let token_count = model.count_tokens(&document_content)?;

            span.content = document_content;
            span.token_count = token_count;
        }
        Ok(spans)
    }

    pub fn parse_file(&mut self, content: &str, language: Arc<Language>) -> Result<Vec<Span>> {
        let grammar = language
            .grammar()
            .ok_or_else(|| anyhow!("no grammar for language"))?;

        // Iterate through query matches
        let matches = self.get_matches_in_file(content, grammar)?;

        let language_scope = language.default_scope();
        let placeholder = language_scope.collapsed_placeholder();

        let mut spans = Vec::new();
        let mut collapsed_ranges_within = Vec::new();
        let mut parsed_name_ranges = HashSet::new();
        for (i, context_match) in matches.iter().enumerate() {
            // Items which are collapsible but not embeddable have no item range
            let item_range = if let Some(item_range) = context_match.item_range.clone() {
                item_range
            } else {
                continue;
            };

            // Checks for deduplication
            let name;
            if let Some(name_range) = context_match.name_range.clone() {
                name = content
                    .get(name_range.clone())
                    .map_or(String::new(), |s| s.to_string());
                if parsed_name_ranges.contains(&name_range) {
                    continue;
                }
                parsed_name_ranges.insert(name_range);
            } else {
                name = String::new();
            }

            collapsed_ranges_within.clear();
            'outer: for remaining_match in &matches[(i + 1)..] {
                for collapsed_range in &remaining_match.collapse_ranges {
                    if item_range.start <= collapsed_range.start
                        && item_range.end >= collapsed_range.end
                    {
                        collapsed_ranges_within.push(collapsed_range.clone());
                    } else {
                        break 'outer;
                    }
                }
            }

            collapsed_ranges_within.sort_by_key(|r| (r.start, Reverse(r.end)));

            let mut span_content = String::new();
            for context_range in &context_match.context_ranges {
                add_content_from_range(
                    &mut span_content,
                    content,
                    context_range.clone(),
                    context_match.start_col,
                );
                span_content.push_str("\n");
            }

            let mut offset = item_range.start;
            for collapsed_range in &collapsed_ranges_within {
                if collapsed_range.start > offset {
                    add_content_from_range(
                        &mut span_content,
                        content,
                        offset..collapsed_range.start,
                        context_match.start_col,
                    );
                    offset = collapsed_range.start;
                }

                if collapsed_range.end > offset {
                    span_content.push_str(placeholder);
                    offset = collapsed_range.end;
                }
            }

            if offset < item_range.end {
                add_content_from_range(
                    &mut span_content,
                    content,
                    offset..item_range.end,
                    context_match.start_col,
                );
            }

            let sha1 = SpanDigest::from(span_content.as_str());
            spans.push(Span {
                name,
                content: span_content,
                range: item_range.clone(),
                embedding: None,
                digest: sha1,
                token_count: 0,
            })
        }

        return Ok(spans);
    }
}

pub(crate) fn subtract_ranges(
    ranges: &[Range<usize>],
    ranges_to_subtract: &[Range<usize>],
) -> Vec<Range<usize>> {
    let mut result = Vec::new();

    let mut ranges_to_subtract = ranges_to_subtract.iter().peekable();

    for range in ranges {
        let mut offset = range.start;

        while offset < range.end {
            if let Some(range_to_subtract) = ranges_to_subtract.peek() {
                if offset < range_to_subtract.start {
                    let next_offset = cmp::min(range_to_subtract.start, range.end);
                    result.push(offset..next_offset);
                    offset = next_offset;
                } else {
                    let next_offset = cmp::min(range_to_subtract.end, range.end);
                    offset = next_offset;
                }

                if offset >= range_to_subtract.end {
                    ranges_to_subtract.next();
                }
            } else {
                result.push(offset..range.end);
                offset = range.end;
            }
        }
    }

    result
}
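// A test sketch pinning down `subtract_ranges` (not part of this commit; the
// iterator walk above assumes both slices are sorted and non-overlapping):
#[cfg(test)]
mod subtract_ranges_tests {
    use super::subtract_ranges;

    #[test]
    fn keeps_uncovered_portions() {
        // Carving [3, 5) out of [0, 10) leaves the two flanking pieces.
        assert_eq!(subtract_ranges(&[0..10], &[3..5]), vec![0..3, 5..10]);
        // A subtracted range that misses the input leaves the range intact.
        assert_eq!(subtract_ranges(&[0..4], &[10..12]), vec![0..4]);
    }
}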
fn add_content_from_range(
    output: &mut String,
    content: &str,
    range: Range<usize>,
    start_col: usize,
) {
    for mut line in content.get(range.clone()).unwrap_or("").lines() {
        // Strip up to `start_col` columns of leading indentation from each line.
        for _ in 0..start_col {
            if line.starts_with(' ') {
                line = &line[1..];
            } else {
                break;
            }
        }
        output.push_str(line);
        output.push('\n');
    }
    output.pop();
}
1280  crates/semantic_index2/src/semantic_index.rs  Normal file
File diff suppressed because it is too large
28  crates/semantic_index2/src/semantic_index_settings.rs  Normal file
@@ -0,0 +1,28 @@
use anyhow;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use settings::Settings;

#[derive(Deserialize, Debug)]
pub struct SemanticIndexSettings {
    pub enabled: bool,
}

#[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug)]
pub struct SemanticIndexSettingsContent {
    pub enabled: Option<bool>,
}

impl Settings for SemanticIndexSettings {
    const KEY: Option<&'static str> = Some("semantic_index");

    type FileContent = SemanticIndexSettingsContent;

    fn load(
        default_value: &Self::FileContent,
        user_values: &[&Self::FileContent],
        _: &mut gpui::AppContext,
    ) -> anyhow::Result<Self> {
        Self::load_via_json_merge(default_value, user_values)
    }
}
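Given `KEY` above, user configuration for this crate nests under a `semantic_index` object; the corresponding entry in a user's settings JSON would plausibly look like this (a sketch; defaults are merged in via `load_via_json_merge`):

```json
{
  "semantic_index": {
    "enabled": true
  }
}
```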
1697  crates/semantic_index2/src/semantic_index_tests.rs  Normal file
File diff suppressed because it is too large
@@ -3942,8 +3942,6 @@ impl std::fmt::Debug for OpenPaths {
    }
}

pub struct WorkspaceCreated(pub WeakView<Workspace>);

pub fn activate_workspace_for_project(
    cx: &mut AppContext,
    predicate: impl Fn(&Project, &AppContext) -> bool + Send + 'static,