Port semantic_index to gpui2

Co-Authored-By: Julia Risley <julia@zed.dev>
Antonio Scandurra 2023-12-05 15:38:36 +01:00
parent d433da1e70
commit 09db455db2
16 changed files with 4569 additions and 10 deletions

Cargo.lock

@@ -8232,6 +8232,57 @@ dependencies = [
"workspace",
]
[[package]]
name = "semantic_index2"
version = "0.1.0"
dependencies = [
"ai2",
"anyhow",
"async-trait",
"client2",
"collections",
"ctor",
"env_logger 0.9.3",
"futures 0.3.28",
"globset",
"gpui2",
"language2",
"lazy_static",
"log",
"ndarray",
"node_runtime",
"ordered-float 2.10.0",
"parking_lot 0.11.2",
"postage",
"pretty_assertions",
"project2",
"rand 0.8.5",
"rpc2",
"rusqlite",
"rust-embed",
"schemars",
"serde",
"serde_json",
"settings2",
"sha1",
"smol",
"tempdir",
"tiktoken-rs",
"tree-sitter",
"tree-sitter-cpp",
"tree-sitter-elixir",
"tree-sitter-json 0.20.0",
"tree-sitter-lua",
"tree-sitter-php",
"tree-sitter-ruby",
"tree-sitter-rust",
"tree-sitter-toml",
"tree-sitter-typescript",
"unindent",
"util",
"workspace2",
]
[[package]]
name = "semver"
version = "1.0.18"


@@ -95,6 +95,8 @@ members = [
"crates/rpc2",
"crates/search",
"crates/search2",
"crates/semantic_index",
"crates/semantic_index2",
"crates/settings",
"crates/settings2",
"crates/snippet",
@@ -114,7 +116,6 @@ members = [
"crates/theme_selector2",
"crates/ui2",
"crates/util",
"crates/semantic_index",
"crates/story",
"crates/vim",
"crates/vcs_menu",


@@ -7,7 +7,7 @@ pub enum ProviderCredential {
NotNeeded,
}
pub trait CredentialProvider {
pub trait CredentialProvider: Send + Sync {
fn has_credentials(&self) -> bool;
fn retrieve_credentials(&self, cx: &mut AppContext) -> ProviderCredential;
fn save_credentials(&self, cx: &mut AppContext, credential: ProviderCredential);


@@ -35,7 +35,7 @@ pub struct OpenAIEmbeddingProvider {
model: OpenAILanguageModel,
credential: Arc<RwLock<ProviderCredential>>,
pub client: Arc<dyn HttpClient>,
pub executor: Arc<BackgroundExecutor>,
pub executor: BackgroundExecutor,
rate_limit_count_rx: watch::Receiver<Option<Instant>>,
rate_limit_count_tx: Arc<Mutex<watch::Sender<Option<Instant>>>>,
}
@@ -66,7 +66,7 @@ struct OpenAIEmbeddingUsage {
}
impl OpenAIEmbeddingProvider {
pub fn new(client: Arc<dyn HttpClient>, executor: Arc<BackgroundExecutor>) -> Self {
pub fn new(client: Arc<dyn HttpClient>, executor: BackgroundExecutor) -> Self {
let (rate_limit_count_tx, rate_limit_count_rx) = watch::channel_with(None);
let rate_limit_count_tx = Arc::new(Mutex::new(rate_limit_count_tx));


@@ -482,10 +482,6 @@ impl<T: 'static> WeakModel<T> {
/// Update the entity referenced by this model with the given function if
/// the referenced entity still exists. Returns an error if the entity has
/// been released.
///
/// The update function receives a context appropriate for its environment.
/// When updating in an `AppContext`, it receives a `ModelContext`.
/// When updating in a `WindowContext`, it receives a `ViewContext`.
pub fn update<C, R>(
&self,
cx: &mut C,
@@ -501,6 +497,21 @@ impl<T: 'static> WeakModel<T> {
.map(|this| cx.update_model(&this, update)),
)
}
/// Reads the entity referenced by this model with the given function if
/// the referenced entity still exists. Returns an error if the entity has
/// been released.
pub fn read_with<C, R>(&self, cx: &C, read: impl FnOnce(&T, &AppContext) -> R) -> Result<R>
where
C: Context,
Result<C::Result<R>>: crate::Flatten<R>,
{
crate::Flatten::flatten(
self.upgrade()
.ok_or_else(|| anyhow!("entity release"))
.map(|this| cx.read_model(&this, read)),
)
}
}
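A minimal usage sketch of the new `read_with` (the `Counter` model is hypothetical; only the `WeakModel` API above comes from this change):

```rust
// Sketch only: assumes a gpui2 AppContext and a hypothetical Counter model.
struct Counter {
    count: usize,
}

fn log_count(weak: gpui::WeakModel<Counter>, cx: &gpui::AppContext) {
    // read_with returns Err(..) once the Counter entity has been released.
    if let Ok(count) = weak.read_with(cx, |counter, _cx| counter.count) {
        log::info!("count = {}", count);
    }
}
```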
impl<T> Hash for WeakModel<T> {


@@ -0,0 +1,69 @@
[package]
name = "semantic_index2"
version = "0.1.0"
edition = "2021"
publish = false
[lib]
path = "src/semantic_index.rs"
doctest = false
[dependencies]
ai = { package = "ai2", path = "../ai2" }
collections = { path = "../collections" }
gpui = { package = "gpui2", path = "../gpui2" }
language = { package = "language2", path = "../language2" }
project = { package = "project2", path = "../project2" }
workspace = { package = "workspace2", path = "../workspace2" }
util = { path = "../util" }
rpc = { package = "rpc2", path = "../rpc2" }
settings = { package = "settings2", path = "../settings2" }
anyhow.workspace = true
postage.workspace = true
futures.workspace = true
ordered-float.workspace = true
smol.workspace = true
rusqlite.workspace = true
log.workspace = true
tree-sitter.workspace = true
lazy_static.workspace = true
serde.workspace = true
serde_json.workspace = true
async-trait.workspace = true
tiktoken-rs.workspace = true
parking_lot.workspace = true
rand.workspace = true
schemars.workspace = true
globset.workspace = true
sha1 = "0.10.5"
ndarray = { version = "0.15.0" }
[dev-dependencies]
ai = { package = "ai2", path = "../ai2", features = ["test-support"] }
collections = { path = "../collections", features = ["test-support"] }
gpui = { package = "gpui2", path = "../gpui2", features = ["test-support"] }
language = { package = "language2", path = "../language2", features = ["test-support"] }
project = { package = "project2", path = "../project2", features = ["test-support"] }
rpc = { package = "rpc2", path = "../rpc2", features = ["test-support"] }
workspace = { package = "workspace2", path = "../workspace2", features = ["test-support"] }
settings = { package = "settings2", path = "../settings2", features = ["test-support"]}
rust-embed = { version = "8.0", features = ["include-exclude"] }
client = { package = "client2", path = "../client2" }
node_runtime = { path = "../node_runtime"}
pretty_assertions.workspace = true
rand.workspace = true
unindent.workspace = true
tempdir.workspace = true
ctor.workspace = true
env_logger.workspace = true
tree-sitter-typescript.workspace = true
tree-sitter-json.workspace = true
tree-sitter-rust.workspace = true
tree-sitter-toml.workspace = true
tree-sitter-cpp.workspace = true
tree-sitter-elixir.workspace = true
tree-sitter-lua.workspace = true
tree-sitter-ruby.workspace = true
tree-sitter-php.workspace = true


@@ -0,0 +1,20 @@
# Semantic Index
## Evaluation
### Metrics
nDCG@k:
- "The value of NDCG is determined by comparing the relevance of the items returned by the search engine to the relevance of the item that a hypothetical "ideal" search engine would return.
- "The relevance of result is represented by a score (also known as a 'grade') that is assigned to the search query. The scores of these results are then discounted based on their position in the search results -- did they get recommended first or last?"
MRR@k:
- "Mean reciprocal rank quantifies the rank of the first relevant item found in teh recommendation list."
MAP@k:
- "Mean average precision averages the precision@k metric at each relevant item position in the recommendation list.
Resources:
- [Evaluating recommendation metrics](https://www.shaped.ai/blog/evaluating-recommendation-systems-map-mmr-ndcg)
- [Math Walkthrough](https://towardsdatascience.com/demystifying-ndcg-bee3be58cfe0)


@@ -0,0 +1,114 @@
{
"repo": "https://github.com/AntonOsika/gpt-engineer.git",
"commit": "7735a6445bae3611c62f521e6464c67c957f87c2",
"assertions": [
{
"query": "How do I contribute to this project?",
"matches": [
".github/CONTRIBUTING.md:1",
"ROADMAP.md:48"
]
},
{
"query": "What version of the openai package is active?",
"matches": [
"pyproject.toml:14"
]
},
{
"query": "Ask user for clarification",
"matches": [
"gpt_engineer/steps.py:69"
]
},
{
"query": "generate tests for python code",
"matches": [
"gpt_engineer/steps.py:153"
]
},
{
"query": "get item from database based on key",
"matches": [
"gpt_engineer/db.py:42",
"gpt_engineer/db.py:68"
]
},
{
"query": "prompt user to select files",
"matches": [
"gpt_engineer/file_selector.py:171",
"gpt_engineer/file_selector.py:306",
"gpt_engineer/file_selector.py:289",
"gpt_engineer/file_selector.py:234"
]
},
{
"query": "send to rudderstack",
"matches": [
"gpt_engineer/collect.py:11",
"gpt_engineer/collect.py:38"
]
},
{
"query": "parse code blocks from chat messages",
"matches": [
"gpt_engineer/chat_to_files.py:10",
"docs/intro/chat_parsing.md:1"
]
},
{
"query": "how do I use the docker cli?",
"matches": [
"docker/README.md:1"
]
},
{
"query": "ask the user if the code ran successfully?",
"matches": [
"gpt_engineer/learning.py:54"
]
},
{
"query": "how is consent granted by the user?",
"matches": [
"gpt_engineer/learning.py:107",
"gpt_engineer/learning.py:130",
"gpt_engineer/learning.py:152"
]
},
{
"query": "what are all the different steps the agent can take?",
"matches": [
"docs/intro/steps_module.md:1",
"gpt_engineer/steps.py:391"
]
},
{
"query": "ask the user for clarification?",
"matches": [
"gpt_engineer/steps.py:69"
]
},
{
"query": "what models are available?",
"matches": [
"gpt_engineer/ai.py:315",
"gpt_engineer/ai.py:341",
"docs/open-models.md:1"
]
},
{
"query": "what is the current focus of the project?",
"matches": [
"ROADMAP.md:11"
]
},
{
"query": "does the agent know how to fix code?",
"matches": [
"gpt_engineer/steps.py:367"
]
}
]
}


@@ -0,0 +1,104 @@
{
"repo": "https://github.com/tree-sitter/tree-sitter.git",
"commit": "46af27796a76c72d8466627d499f2bca4af958ee",
"assertions": [
{
"query": "What attributes are available for the tags configuration struct?",
"matches": [
"tags/src/lib.rs:24"
]
},
{
"query": "create a new tag configuration",
"matches": [
"tags/src/lib.rs:119"
]
},
{
"query": "generate tags based on config",
"matches": [
"tags/src/lib.rs:261"
]
},
{
"query": "match on ts quantifier in rust",
"matches": [
"lib/binding_rust/lib.rs:139"
]
},
{
"query": "cli command to generate tags",
"matches": [
"cli/src/tags.rs:10"
]
},
{
"query": "what version of the tree-sitter-tags package is active?",
"matches": [
"tags/Cargo.toml:4"
]
},
{
"query": "Insert a new parse state",
"matches": [
"cli/src/generate/build_tables/build_parse_table.rs:153"
]
},
{
"query": "Handle conflict when numerous actions occur on the same symbol",
"matches": [
"cli/src/generate/build_tables/build_parse_table.rs:363",
"cli/src/generate/build_tables/build_parse_table.rs:442"
]
},
{
"query": "Match based on associativity of actions",
"matches": [
"cri/src/generate/build_tables/build_parse_table.rs:542"
]
},
{
"query": "Format token set display",
"matches": [
"cli/src/generate/build_tables/item.rs:246"
]
},
{
"query": "extract choices from rule",
"matches": [
"cli/src/generate/prepare_grammar/flatten_grammar.rs:124"
]
},
{
"query": "How do we identify if a symbol is being used?",
"matches": [
"cli/src/generate/prepare_grammar/flatten_grammar.rs:175"
]
},
{
"query": "How do we launch the playground?",
"matches": [
"cli/src/playground.rs:46"
]
},
{
"query": "How do we test treesitter query matches in rust?",
"matches": [
"cli/src/query_testing.rs:152",
"cli/src/tests/query_test.rs:781",
"cli/src/tests/query_test.rs:2163",
"cli/src/tests/query_test.rs:3781",
"cli/src/tests/query_test.rs:887"
]
},
{
"query": "What does the CLI do?",
"matches": [
"cli/README.md:10",
"cli/loader/README.md:3",
"docs/section-5-implementation.md:14",
"docs/section-5-implementation.md:18"
]
}
]
}


@@ -0,0 +1,603 @@
use crate::{
parsing::{Span, SpanDigest},
SEMANTIC_INDEX_VERSION,
};
use ai::embedding::Embedding;
use anyhow::{anyhow, Context, Result};
use collections::HashMap;
use futures::channel::oneshot;
use gpui::BackgroundExecutor;
use ndarray::{Array1, Array2};
use ordered_float::OrderedFloat;
use project::Fs;
use rpc::proto::Timestamp;
use rusqlite::params;
use rusqlite::types::Value;
use std::{
future::Future,
ops::Range,
path::{Path, PathBuf},
rc::Rc,
sync::Arc,
time::SystemTime,
};
use util::{paths::PathMatcher, TryFutureExt};
pub fn argsort<T: Ord>(data: &[T]) -> Vec<usize> {
let mut indices = (0..data.len()).collect::<Vec<_>>();
indices.sort_by_key(|&i| &data[i]);
indices.reverse();
indices
}
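A quick check of the ordering `argsort` produces (indices of the data sorted from largest to smallest value):

```rust
// data[1] = 3 > data[2] = 2 > data[0] = 1
assert_eq!(argsort(&[1, 3, 2]), vec![1, 2, 0]);
```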
#[derive(Debug)]
pub struct FileRecord {
pub id: usize,
pub relative_path: String,
pub mtime: Timestamp,
}
#[derive(Clone)]
pub struct VectorDatabase {
path: Arc<Path>,
transactions:
smol::channel::Sender<Box<dyn 'static + Send + FnOnce(&mut rusqlite::Connection)>>,
}
impl VectorDatabase {
pub async fn new(
fs: Arc<dyn Fs>,
path: Arc<Path>,
executor: BackgroundExecutor,
) -> Result<Self> {
if let Some(db_directory) = path.parent() {
fs.create_dir(db_directory).await?;
}
let (transactions_tx, transactions_rx) = smol::channel::unbounded::<
Box<dyn 'static + Send + FnOnce(&mut rusqlite::Connection)>,
>();
executor
.spawn({
let path = path.clone();
async move {
let mut connection = rusqlite::Connection::open(&path)?;
connection.pragma_update(None, "journal_mode", "wal")?;
connection.pragma_update(None, "synchronous", "normal")?;
connection.pragma_update(None, "cache_size", 1000000)?;
connection.pragma_update(None, "temp_store", "MEMORY")?;
while let Ok(transaction) = transactions_rx.recv().await {
transaction(&mut connection);
}
anyhow::Ok(())
}
.log_err()
})
.detach();
let this = Self {
transactions: transactions_tx,
path,
};
this.initialize_database().await?;
Ok(this)
}
pub fn path(&self) -> &Arc<Path> {
&self.path
}
fn transact<F, T>(&self, f: F) -> impl Future<Output = Result<T>>
where
F: 'static + Send + FnOnce(&rusqlite::Transaction) -> Result<T>,
T: 'static + Send,
{
let (tx, rx) = oneshot::channel();
let transactions = self.transactions.clone();
async move {
if transactions
.send(Box::new(|connection| {
let result = connection
.transaction()
.map_err(|err| anyhow!(err))
.and_then(|transaction| {
let result = f(&transaction)?;
transaction.commit()?;
Ok(result)
});
let _ = tx.send(result);
}))
.await
.is_err()
{
return Err(anyhow!("connection was dropped"))?;
}
rx.await?
}
}
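The `transact` plumbing above follows a single-owner pattern: the SQLite connection lives on one background task, and callers ship boxed closures to it over a channel, receiving results back through a oneshot. A stripped-down sketch of the same idea, independent of this crate (names are illustrative):

```rust
use futures::channel::oneshot;

type Job = Box<dyn FnOnce(&mut rusqlite::Connection) + Send + 'static>;

// Runs on a background task; the connection never crosses threads.
async fn connection_loop(
    mut connection: rusqlite::Connection,
    jobs: smol::channel::Receiver<Job>,
) {
    while let Ok(job) = jobs.recv().await {
        job(&mut connection);
    }
}

// Any caller can issue a query by sending a closure and awaiting the result.
async fn count_spans(jobs: &smol::channel::Sender<Job>) -> anyhow::Result<i64> {
    let (tx, rx) = oneshot::channel();
    jobs.send(Box::new(move |connection| {
        let count = connection
            .query_row("SELECT count(*) FROM spans", [], |row| row.get(0))
            .map_err(anyhow::Error::from);
        let _ = tx.send(count);
    }))
    .await
    .map_err(|_| anyhow::anyhow!("connection loop stopped"))?;
    rx.await?
}
```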
fn initialize_database(&self) -> impl Future<Output = Result<()>> {
self.transact(|db| {
rusqlite::vtab::array::load_module(&db)?;
// Delete existing tables, if SEMANTIC_INDEX_VERSION is bumped
let version_query = db.prepare("SELECT version from semantic_index_config");
let version = version_query
.and_then(|mut query| query.query_row([], |row| Ok(row.get::<_, i64>(0)?)));
if version.map_or(false, |version| version == SEMANTIC_INDEX_VERSION as i64) {
log::trace!("vector database schema up to date");
return Ok(());
}
log::trace!("vector database schema out of date. updating...");
// We renamed the `documents` table to `spans`, so we want to drop
// `documents` without recreating it if it exists.
db.execute("DROP TABLE IF EXISTS documents", [])
.context("failed to drop 'documents' table")?;
db.execute("DROP TABLE IF EXISTS spans", [])
.context("failed to drop 'spans' table")?;
db.execute("DROP TABLE IF EXISTS files", [])
.context("failed to drop 'files' table")?;
db.execute("DROP TABLE IF EXISTS worktrees", [])
.context("failed to drop 'worktrees' table")?;
db.execute("DROP TABLE IF EXISTS semantic_index_config", [])
.context("failed to drop 'semantic_index_config' table")?;
// Initialize Vector Databasing Tables
db.execute(
"CREATE TABLE semantic_index_config (
version INTEGER NOT NULL
)",
[],
)?;
db.execute(
"INSERT INTO semantic_index_config (version) VALUES (?1)",
params![SEMANTIC_INDEX_VERSION],
)?;
db.execute(
"CREATE TABLE worktrees (
id INTEGER PRIMARY KEY AUTOINCREMENT,
absolute_path VARCHAR NOT NULL
);
CREATE UNIQUE INDEX worktrees_absolute_path ON worktrees (absolute_path);
",
[],
)?;
db.execute(
"CREATE TABLE files (
id INTEGER PRIMARY KEY AUTOINCREMENT,
worktree_id INTEGER NOT NULL,
relative_path VARCHAR NOT NULL,
mtime_seconds INTEGER NOT NULL,
mtime_nanos INTEGER NOT NULL,
FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
)",
[],
)?;
db.execute(
"CREATE UNIQUE INDEX files_worktree_id_and_relative_path ON files (worktree_id, relative_path)",
[],
)?;
db.execute(
"CREATE TABLE spans (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL,
start_byte INTEGER NOT NULL,
end_byte INTEGER NOT NULL,
name VARCHAR NOT NULL,
embedding BLOB NOT NULL,
digest BLOB NOT NULL,
FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
)",
[],
)?;
db.execute(
"CREATE INDEX spans_digest ON spans (digest)",
[],
)?;
log::trace!("vector database initialized with updated schema.");
Ok(())
})
}
pub fn delete_file(
&self,
worktree_id: i64,
delete_path: Arc<Path>,
) -> impl Future<Output = Result<()>> {
self.transact(move |db| {
db.execute(
"DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2",
params![worktree_id, delete_path.to_str()],
)?;
Ok(())
})
}
pub fn insert_file(
&self,
worktree_id: i64,
path: Arc<Path>,
mtime: SystemTime,
spans: Vec<Span>,
) -> impl Future<Output = Result<()>> {
self.transact(move |db| {
// Return the existing ID, if both the file and mtime match
let mtime = Timestamp::from(mtime);
db.execute(
"
REPLACE INTO files
(worktree_id, relative_path, mtime_seconds, mtime_nanos)
VALUES (?1, ?2, ?3, ?4)
",
params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
)?;
let file_id = db.last_insert_rowid();
let mut query = db.prepare(
"
INSERT INTO spans
(file_id, start_byte, end_byte, name, embedding, digest)
VALUES (?1, ?2, ?3, ?4, ?5, ?6)
",
)?;
for span in spans {
query.execute(params![
file_id,
span.range.start.to_string(),
span.range.end.to_string(),
span.name,
span.embedding,
span.digest
])?;
}
Ok(())
})
}
pub fn worktree_previously_indexed(
&self,
worktree_root_path: &Path,
) -> impl Future<Output = Result<bool>> {
let worktree_root_path = worktree_root_path.to_string_lossy().into_owned();
self.transact(move |db| {
let mut worktree_query =
db.prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?;
let worktree_id = worktree_query
.query_row(params![worktree_root_path], |row| Ok(row.get::<_, i64>(0)?));
Ok(worktree_id.is_ok())
})
}
pub fn embeddings_for_digests(
&self,
digests: Vec<SpanDigest>,
) -> impl Future<Output = Result<HashMap<SpanDigest, Embedding>>> {
self.transact(move |db| {
let mut query = db.prepare(
"
SELECT digest, embedding
FROM spans
WHERE digest IN rarray(?)
",
)?;
let mut embeddings_by_digest = HashMap::default();
let digests = Rc::new(
digests
.into_iter()
.map(|p| Value::Blob(p.0.to_vec()))
.collect::<Vec<_>>(),
);
let rows = query.query_map(params![digests], |row| {
Ok((row.get::<_, SpanDigest>(0)?, row.get::<_, Embedding>(1)?))
})?;
for row in rows {
if let Ok(row) = row {
embeddings_by_digest.insert(row.0, row.1);
}
}
Ok(embeddings_by_digest)
})
}
pub fn embeddings_for_files(
&self,
worktree_id_file_paths: HashMap<i64, Vec<Arc<Path>>>,
) -> impl Future<Output = Result<HashMap<SpanDigest, Embedding>>> {
self.transact(move |db| {
let mut query = db.prepare(
"
SELECT digest, embedding
FROM spans
LEFT JOIN files ON files.id = spans.file_id
WHERE files.worktree_id = ? AND files.relative_path IN rarray(?)
",
)?;
let mut embeddings_by_digest = HashMap::default();
for (worktree_id, file_paths) in worktree_id_file_paths {
let file_paths = Rc::new(
file_paths
.into_iter()
.map(|p| Value::Text(p.to_string_lossy().into_owned()))
.collect::<Vec<_>>(),
);
let rows = query.query_map(params![worktree_id, file_paths], |row| {
Ok((row.get::<_, SpanDigest>(0)?, row.get::<_, Embedding>(1)?))
})?;
for row in rows {
if let Ok(row) = row {
embeddings_by_digest.insert(row.0, row.1);
}
}
}
Ok(embeddings_by_digest)
})
}
pub fn find_or_create_worktree(
&self,
worktree_root_path: Arc<Path>,
) -> impl Future<Output = Result<i64>> {
self.transact(move |db| {
let mut worktree_query =
db.prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?;
let worktree_id = worktree_query
.query_row(params![worktree_root_path.to_string_lossy()], |row| {
Ok(row.get::<_, i64>(0)?)
});
if worktree_id.is_ok() {
return Ok(worktree_id?);
}
// If worktree_id is Err, insert new worktree
db.execute(
"INSERT into worktrees (absolute_path) VALUES (?1)",
params![worktree_root_path.to_string_lossy()],
)?;
Ok(db.last_insert_rowid())
})
}
pub fn get_file_mtimes(
&self,
worktree_id: i64,
) -> impl Future<Output = Result<HashMap<PathBuf, SystemTime>>> {
self.transact(move |db| {
let mut statement = db.prepare(
"
SELECT relative_path, mtime_seconds, mtime_nanos
FROM files
WHERE worktree_id = ?1
ORDER BY relative_path",
)?;
let mut result: HashMap<PathBuf, SystemTime> = HashMap::default();
for row in statement.query_map(params![worktree_id], |row| {
Ok((
row.get::<_, String>(0)?.into(),
Timestamp {
seconds: row.get(1)?,
nanos: row.get(2)?,
}
.into(),
))
})? {
let row = row?;
result.insert(row.0, row.1);
}
Ok(result)
})
}
pub fn top_k_search(
&self,
query_embedding: &Embedding,
limit: usize,
file_ids: &[i64],
) -> impl Future<Output = Result<Vec<(i64, OrderedFloat<f32>)>>> {
let file_ids = file_ids.to_vec();
let query = query_embedding.clone().0;
let query = Array1::from_vec(query);
self.transact(move |db| {
let mut query_statement = db.prepare(
"
SELECT
id, embedding
FROM
spans
WHERE
file_id IN rarray(?)
",
)?;
let deserialized_rows = query_statement
.query_map(params![ids_to_sql(&file_ids)], |row| {
Ok((row.get::<_, usize>(0)?, row.get::<_, Embedding>(1)?))
})?
.filter_map(|row| row.ok())
.collect::<Vec<(usize, Embedding)>>();
if deserialized_rows.is_empty() {
return Ok(Vec::new());
}
// Get Length of Embeddings Returned
let embedding_len = deserialized_rows[0].1 .0.len();
let batch_n = 1000;
let mut batches = Vec::new();
let mut batch_ids = Vec::new();
let mut batch_embeddings: Vec<f32> = Vec::new();
deserialized_rows.iter().for_each(|(id, embedding)| {
batch_ids.push(id);
batch_embeddings.extend(&embedding.0);
if batch_ids.len() == batch_n {
let embeddings = std::mem::take(&mut batch_embeddings);
let ids = std::mem::take(&mut batch_ids);
let array =
Array2::from_shape_vec((ids.len(), embedding_len), embeddings);
match array {
Ok(array) => {
batches.push((ids, array));
}
Err(err) => log::error!("Failed to deserialize to ndarray: {:?}", err),
}
}
});
if !batch_ids.is_empty() {
let array = Array2::from_shape_vec(
(batch_ids.len(), embedding_len),
batch_embeddings.clone(),
);
match array {
Ok(array) => {
batches.push((batch_ids.clone(), array));
}
Err(err) => log::error!("Failed to deserialize to ndarray: {:?}", err),
}
}
let mut ids: Vec<usize> = Vec::new();
let mut results = Vec::new();
for (batch_ids, array) in batches {
let scores = array
.dot(&query.t())
.to_vec()
.iter()
.map(|score| OrderedFloat(*score))
.collect::<Vec<OrderedFloat<f32>>>();
results.extend(scores);
ids.extend(batch_ids);
}
let sorted_idx = argsort(&results);
let mut sorted_results = Vec::new();
let last_idx = limit.min(sorted_idx.len());
for idx in &sorted_idx[0..last_idx] {
sorted_results.push((ids[*idx] as i64, results[*idx]))
}
Ok(sorted_results)
})
}
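The ranking above is a batched dot product: each batch of embeddings becomes a matrix that is multiplied with the query vector, producing one score per row. A self-contained sketch of that core step (assuming embeddings are unit-length, so the dot product behaves like cosine similarity):

```rust
use ndarray::{Array1, Array2};

// rows: one stored embedding per row; query: the embedded search string.
fn score_batch(rows: &Array2<f32>, query: &Array1<f32>) -> Vec<f32> {
    rows.dot(query).to_vec()
}
```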
pub fn retrieve_included_file_ids(
&self,
worktree_ids: &[i64],
includes: &[PathMatcher],
excludes: &[PathMatcher],
) -> impl Future<Output = Result<Vec<i64>>> {
let worktree_ids = worktree_ids.to_vec();
let includes = includes.to_vec();
let excludes = excludes.to_vec();
self.transact(move |db| {
let mut file_query = db.prepare(
"
SELECT
id, relative_path
FROM
files
WHERE
worktree_id IN rarray(?)
",
)?;
let mut file_ids = Vec::<i64>::new();
let mut rows = file_query.query([ids_to_sql(&worktree_ids)])?;
while let Some(row) = rows.next()? {
let file_id = row.get(0)?;
let relative_path = row.get_ref(1)?.as_str()?;
let included =
includes.is_empty() || includes.iter().any(|glob| glob.is_match(relative_path));
let excluded = excludes.iter().any(|glob| glob.is_match(relative_path));
if included && !excluded {
file_ids.push(file_id);
}
}
anyhow::Ok(file_ids)
})
}
pub fn spans_for_ids(
&self,
ids: &[i64],
) -> impl Future<Output = Result<Vec<(i64, PathBuf, Range<usize>)>>> {
let ids = ids.to_vec();
self.transact(move |db| {
let mut statement = db.prepare(
"
SELECT
spans.id,
files.worktree_id,
files.relative_path,
spans.start_byte,
spans.end_byte
FROM
spans, files
WHERE
spans.file_id = files.id AND
spans.id in rarray(?)
",
)?;
let result_iter = statement.query_map(params![ids_to_sql(&ids)], |row| {
Ok((
row.get::<_, i64>(0)?,
row.get::<_, i64>(1)?,
row.get::<_, String>(2)?.into(),
row.get(3)?..row.get(4)?,
))
})?;
let mut values_by_id = HashMap::<i64, (i64, PathBuf, Range<usize>)>::default();
for row in result_iter {
let (id, worktree_id, path, range) = row?;
values_by_id.insert(id, (worktree_id, path, range));
}
let mut results = Vec::with_capacity(ids.len());
for id in &ids {
let value = values_by_id
.remove(id)
.ok_or(anyhow!("missing span id {}", id))?;
results.push(value);
}
Ok(results)
})
}
}
fn ids_to_sql(ids: &[i64]) -> Rc<Vec<rusqlite::types::Value>> {
Rc::new(
ids.iter()
.copied()
.map(rusqlite::types::Value::from)
.collect::<Vec<_>>(),
)
}


@@ -0,0 +1,169 @@
use crate::{parsing::Span, JobHandle};
use ai::embedding::EmbeddingProvider;
use gpui::BackgroundExecutor;
use parking_lot::Mutex;
use smol::channel;
use std::{mem, ops::Range, path::Path, sync::Arc, time::SystemTime};
#[derive(Clone)]
pub struct FileToEmbed {
pub worktree_id: i64,
pub path: Arc<Path>,
pub mtime: SystemTime,
pub spans: Vec<Span>,
pub job_handle: JobHandle,
}
impl std::fmt::Debug for FileToEmbed {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("FileToEmbed")
.field("worktree_id", &self.worktree_id)
.field("path", &self.path)
.field("mtime", &self.mtime)
.field("spans", &self.spans)
.finish_non_exhaustive()
}
}
impl PartialEq for FileToEmbed {
fn eq(&self, other: &Self) -> bool {
self.worktree_id == other.worktree_id
&& self.path == other.path
&& self.mtime == other.mtime
&& self.spans == other.spans
}
}
pub struct EmbeddingQueue {
embedding_provider: Arc<dyn EmbeddingProvider>,
pending_batch: Vec<FileFragmentToEmbed>,
executor: BackgroundExecutor,
pending_batch_token_count: usize,
finished_files_tx: channel::Sender<FileToEmbed>,
finished_files_rx: channel::Receiver<FileToEmbed>,
}
#[derive(Clone)]
pub struct FileFragmentToEmbed {
file: Arc<Mutex<FileToEmbed>>,
span_range: Range<usize>,
}
impl EmbeddingQueue {
pub fn new(
embedding_provider: Arc<dyn EmbeddingProvider>,
executor: BackgroundExecutor,
) -> Self {
let (finished_files_tx, finished_files_rx) = channel::unbounded();
Self {
embedding_provider,
executor,
pending_batch: Vec::new(),
pending_batch_token_count: 0,
finished_files_tx,
finished_files_rx,
}
}
pub fn push(&mut self, file: FileToEmbed) {
if file.spans.is_empty() {
self.finished_files_tx.try_send(file).unwrap();
return;
}
let file = Arc::new(Mutex::new(file));
self.pending_batch.push(FileFragmentToEmbed {
file: file.clone(),
span_range: 0..0,
});
let mut fragment_range = &mut self.pending_batch.last_mut().unwrap().span_range;
for (ix, span) in file.lock().spans.iter().enumerate() {
let span_token_count = if span.embedding.is_none() {
span.token_count
} else {
0
};
let next_token_count = self.pending_batch_token_count + span_token_count;
if next_token_count > self.embedding_provider.max_tokens_per_batch() {
let range_end = fragment_range.end;
self.flush();
self.pending_batch.push(FileFragmentToEmbed {
file: file.clone(),
span_range: range_end..range_end,
});
fragment_range = &mut self.pending_batch.last_mut().unwrap().span_range;
}
fragment_range.end = ix + 1;
self.pending_batch_token_count += span_token_count;
}
}
pub fn flush(&mut self) {
let batch = mem::take(&mut self.pending_batch);
self.pending_batch_token_count = 0;
if batch.is_empty() {
return;
}
let finished_files_tx = self.finished_files_tx.clone();
let embedding_provider = self.embedding_provider.clone();
self.executor
.spawn(async move {
let mut spans = Vec::new();
for fragment in &batch {
let file = fragment.file.lock();
spans.extend(
file.spans[fragment.span_range.clone()]
.iter()
.filter(|d| d.embedding.is_none())
.map(|d| d.content.clone()),
);
}
// If there are no spans left to embed, send each file to the finished channel once this fragment holds its last reference.
if spans.is_empty() {
for fragment in batch.clone() {
if let Some(file) = Arc::into_inner(fragment.file) {
finished_files_tx.try_send(file.into_inner()).unwrap();
}
}
return;
};
match embedding_provider.embed_batch(spans).await {
Ok(embeddings) => {
let mut embeddings = embeddings.into_iter();
for fragment in batch {
for span in &mut fragment.file.lock().spans[fragment.span_range.clone()]
.iter_mut()
.filter(|d| d.embedding.is_none())
{
if let Some(embedding) = embeddings.next() {
span.embedding = Some(embedding);
} else {
log::error!("number of embeddings != number of documents");
}
}
if let Some(file) = Arc::into_inner(fragment.file) {
finished_files_tx.try_send(file.into_inner()).unwrap();
}
}
}
Err(error) => {
log::error!("{:?}", error);
}
}
})
.detach();
}
pub fn finished_files(&self) -> channel::Receiver<FileToEmbed> {
self.finished_files_rx.clone()
}
}
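A rough usage sketch of the queue (not from this commit; `provider`, `executor`, and the file list are assumed to come from the surrounding setup): push files, flush the final partial batch, then drain `finished_files`.

```rust
async fn embed_files(
    provider: std::sync::Arc<dyn ai::embedding::EmbeddingProvider>,
    executor: gpui::BackgroundExecutor,
    files: Vec<FileToEmbed>,
) {
    let mut queue = EmbeddingQueue::new(provider, executor);
    let finished = queue.finished_files();
    let file_count = files.len();
    for file in files {
        // push() flushes on its own whenever the batch token budget fills up.
        queue.push(file);
    }
    // Force out whatever remains in the final partial batch.
    queue.flush();
    for _ in 0..file_count {
        if let Ok(file) = finished.recv().await {
            log::info!("embedded {:?}", file.path);
        }
    }
}
```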


@@ -0,0 +1,414 @@
use ai::{
embedding::{Embedding, EmbeddingProvider},
models::TruncationDirection,
};
use anyhow::{anyhow, Result};
use language::{Grammar, Language};
use rusqlite::{
types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
ToSql,
};
use sha1::{Digest, Sha1};
use std::{
borrow::Cow,
cmp::{self, Reverse},
collections::HashSet,
ops::Range,
path::Path,
sync::Arc,
};
use tree_sitter::{Parser, QueryCursor};
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub struct SpanDigest(pub [u8; 20]);
impl FromSql for SpanDigest {
fn column_result(value: ValueRef) -> FromSqlResult<Self> {
let blob = value.as_blob()?;
let bytes =
blob.try_into()
.map_err(|_| rusqlite::types::FromSqlError::InvalidBlobSize {
expected_size: 20,
blob_size: blob.len(),
})?;
return Ok(SpanDigest(bytes));
}
}
impl ToSql for SpanDigest {
fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
self.0.to_sql()
}
}
impl From<&'_ str> for SpanDigest {
fn from(value: &'_ str) -> Self {
let mut sha1 = Sha1::new();
sha1.update(value);
Self(sha1.finalize().into())
}
}
#[derive(Debug, PartialEq, Clone)]
pub struct Span {
pub name: String,
pub range: Range<usize>,
pub content: String,
pub embedding: Option<Embedding>,
pub digest: SpanDigest,
pub token_count: usize,
}
const CODE_CONTEXT_TEMPLATE: &str =
"The below code snippet is from file '<path>'\n\n```<language>\n<item>\n```";
const ENTIRE_FILE_TEMPLATE: &str =
"The below snippet is from file '<path>'\n\n```<language>\n<item>\n```";
const MARKDOWN_CONTEXT_TEMPLATE: &str = "The below file contents is from file '<path>'\n\n<item>";
pub const PARSEABLE_ENTIRE_FILE_TYPES: &[&str] = &[
"TOML", "YAML", "CSS", "HEEX", "ERB", "SVELTE", "HTML", "Scheme",
];
pub struct CodeContextRetriever {
pub parser: Parser,
pub cursor: QueryCursor,
pub embedding_provider: Arc<dyn EmbeddingProvider>,
}
// Every match has an item, this represents the fundamental treesitter symbol and anchors the search
// Every match has one or more 'name' captures. These indicate the display range of the item for deduplication.
// If there are preceding comments, we track this with a context capture
// If there is a piece that should be collapsed in hierarchical queries, we capture it with a collapse capture
// If there is a piece that should be kept inside a collapsed node, we capture it with a keep capture
#[derive(Debug, Clone)]
pub struct CodeContextMatch {
pub start_col: usize,
pub item_range: Option<Range<usize>>,
pub name_range: Option<Range<usize>>,
pub context_ranges: Vec<Range<usize>>,
pub collapse_ranges: Vec<Range<usize>>,
}
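For orientation, a hypothetical Rust embedding query using those capture roles (the real per-language queries ship with the app and may differ):

```rust
// Illustrative only: @item anchors the span, @name deduplicates it, and
// @collapse marks the body to be folded down to a placeholder.
const EXAMPLE_EMBEDDING_QUERY: &str = r#"
(function_item
    name: (identifier) @name
    body: (block) @collapse) @item
"#;
```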
impl CodeContextRetriever {
pub fn new(embedding_provider: Arc<dyn EmbeddingProvider>) -> Self {
Self {
parser: Parser::new(),
cursor: QueryCursor::new(),
embedding_provider,
}
}
fn parse_entire_file(
&self,
relative_path: Option<&Path>,
language_name: Arc<str>,
content: &str,
) -> Result<Vec<Span>> {
let document_span = ENTIRE_FILE_TEMPLATE
.replace(
"<path>",
&relative_path.map_or(Cow::Borrowed("untitled"), |path| path.to_string_lossy()),
)
.replace("<language>", language_name.as_ref())
.replace("<item>", &content);
let digest = SpanDigest::from(document_span.as_str());
let model = self.embedding_provider.base_model();
let document_span = model.truncate(
&document_span,
model.capacity()?,
ai::models::TruncationDirection::End,
)?;
let token_count = model.count_tokens(&document_span)?;
Ok(vec![Span {
range: 0..content.len(),
content: document_span,
embedding: Default::default(),
name: language_name.to_string(),
digest,
token_count,
}])
}
fn parse_markdown_file(
&self,
relative_path: Option<&Path>,
content: &str,
) -> Result<Vec<Span>> {
let document_span = MARKDOWN_CONTEXT_TEMPLATE
.replace(
"<path>",
&relative_path.map_or(Cow::Borrowed("untitled"), |path| path.to_string_lossy()),
)
.replace("<item>", &content);
let digest = SpanDigest::from(document_span.as_str());
let model = self.embedding_provider.base_model();
let document_span = model.truncate(
&document_span,
model.capacity()?,
ai::models::TruncationDirection::End,
)?;
let token_count = model.count_tokens(&document_span)?;
Ok(vec![Span {
range: 0..content.len(),
content: document_span,
embedding: None,
name: "Markdown".to_string(),
digest,
token_count,
}])
}
fn get_matches_in_file(
&mut self,
content: &str,
grammar: &Arc<Grammar>,
) -> Result<Vec<CodeContextMatch>> {
let embedding_config = grammar
.embedding_config
.as_ref()
.ok_or_else(|| anyhow!("no embedding queries"))?;
self.parser.set_language(grammar.ts_language).unwrap();
let tree = self
.parser
.parse(&content, None)
.ok_or_else(|| anyhow!("parsing failed"))?;
let mut captures: Vec<CodeContextMatch> = Vec::new();
let mut collapse_ranges: Vec<Range<usize>> = Vec::new();
let mut keep_ranges: Vec<Range<usize>> = Vec::new();
for mat in self.cursor.matches(
&embedding_config.query,
tree.root_node(),
content.as_bytes(),
) {
let mut start_col = 0;
let mut item_range: Option<Range<usize>> = None;
let mut name_range: Option<Range<usize>> = None;
let mut context_ranges: Vec<Range<usize>> = Vec::new();
collapse_ranges.clear();
keep_ranges.clear();
for capture in mat.captures {
if capture.index == embedding_config.item_capture_ix {
item_range = Some(capture.node.byte_range());
start_col = capture.node.start_position().column;
} else if Some(capture.index) == embedding_config.name_capture_ix {
name_range = Some(capture.node.byte_range());
} else if Some(capture.index) == embedding_config.context_capture_ix {
context_ranges.push(capture.node.byte_range());
} else if Some(capture.index) == embedding_config.collapse_capture_ix {
collapse_ranges.push(capture.node.byte_range());
} else if Some(capture.index) == embedding_config.keep_capture_ix {
keep_ranges.push(capture.node.byte_range());
}
}
captures.push(CodeContextMatch {
start_col,
item_range,
name_range,
context_ranges,
collapse_ranges: subtract_ranges(&collapse_ranges, &keep_ranges),
});
}
Ok(captures)
}
pub fn parse_file_with_template(
&mut self,
relative_path: Option<&Path>,
content: &str,
language: Arc<Language>,
) -> Result<Vec<Span>> {
let language_name = language.name();
if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language_name.as_ref()) {
return self.parse_entire_file(relative_path, language_name, &content);
} else if ["Markdown", "Plain Text"].contains(&language_name.as_ref()) {
return self.parse_markdown_file(relative_path, &content);
}
let mut spans = self.parse_file(content, language)?;
for span in &mut spans {
let document_content = CODE_CONTEXT_TEMPLATE
.replace(
"<path>",
&relative_path.map_or(Cow::Borrowed("untitled"), |path| path.to_string_lossy()),
)
.replace("<language>", language_name.as_ref())
.replace("item", &span.content);
let model = self.embedding_provider.base_model();
let document_content = model.truncate(
&document_content,
model.capacity()?,
TruncationDirection::End,
)?;
let token_count = model.count_tokens(&document_content)?;
span.content = document_content;
span.token_count = token_count;
}
Ok(spans)
}
pub fn parse_file(&mut self, content: &str, language: Arc<Language>) -> Result<Vec<Span>> {
let grammar = language
.grammar()
.ok_or_else(|| anyhow!("no grammar for language"))?;
// Iterate through query matches
let matches = self.get_matches_in_file(content, grammar)?;
let language_scope = language.default_scope();
let placeholder = language_scope.collapsed_placeholder();
let mut spans = Vec::new();
let mut collapsed_ranges_within = Vec::new();
let mut parsed_name_ranges = HashSet::new();
for (i, context_match) in matches.iter().enumerate() {
// Items which are collapsible but not embeddable have no item range
let item_range = if let Some(item_range) = context_match.item_range.clone() {
item_range
} else {
continue;
};
// Checks for deduplication
let name;
if let Some(name_range) = context_match.name_range.clone() {
name = content
.get(name_range.clone())
.map_or(String::new(), |s| s.to_string());
if parsed_name_ranges.contains(&name_range) {
continue;
}
parsed_name_ranges.insert(name_range);
} else {
name = String::new();
}
collapsed_ranges_within.clear();
'outer: for remaining_match in &matches[(i + 1)..] {
for collapsed_range in &remaining_match.collapse_ranges {
if item_range.start <= collapsed_range.start
&& item_range.end >= collapsed_range.end
{
collapsed_ranges_within.push(collapsed_range.clone());
} else {
break 'outer;
}
}
}
collapsed_ranges_within.sort_by_key(|r| (r.start, Reverse(r.end)));
let mut span_content = String::new();
for context_range in &context_match.context_ranges {
add_content_from_range(
&mut span_content,
content,
context_range.clone(),
context_match.start_col,
);
span_content.push_str("\n");
}
let mut offset = item_range.start;
for collapsed_range in &collapsed_ranges_within {
if collapsed_range.start > offset {
add_content_from_range(
&mut span_content,
content,
offset..collapsed_range.start,
context_match.start_col,
);
offset = collapsed_range.start;
}
if collapsed_range.end > offset {
span_content.push_str(placeholder);
offset = collapsed_range.end;
}
}
if offset < item_range.end {
add_content_from_range(
&mut span_content,
content,
offset..item_range.end,
context_match.start_col,
);
}
let sha1 = SpanDigest::from(span_content.as_str());
spans.push(Span {
name,
content: span_content,
range: item_range.clone(),
embedding: None,
digest: sha1,
token_count: 0,
})
}
return Ok(spans);
}
}
pub(crate) fn subtract_ranges(
ranges: &[Range<usize>],
ranges_to_subtract: &[Range<usize>],
) -> Vec<Range<usize>> {
let mut result = Vec::new();
let mut ranges_to_subtract = ranges_to_subtract.iter().peekable();
for range in ranges {
let mut offset = range.start;
while offset < range.end {
if let Some(range_to_subtract) = ranges_to_subtract.peek() {
if offset < range_to_subtract.start {
let next_offset = cmp::min(range_to_subtract.start, range.end);
result.push(offset..next_offset);
offset = next_offset;
} else {
let next_offset = cmp::min(range_to_subtract.end, range.end);
offset = next_offset;
}
if offset >= range_to_subtract.end {
ranges_to_subtract.next();
}
} else {
result.push(offset..range.end);
offset = range.end;
}
}
}
result
}
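Two quick checks of `subtract_ranges`: a subtracted range in the middle splits the input, and one overlapping the start trims it.

```rust
assert_eq!(subtract_ranges(&[0..6], &[2..4]), vec![0..2, 4..6]);
assert_eq!(subtract_ranges(&[1..5], &[0..2]), vec![2..5]);
```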
fn add_content_from_range(
output: &mut String,
content: &str,
range: Range<usize>,
start_col: usize,
) {
for mut line in content.get(range.clone()).unwrap_or("").lines() {
for _ in 0..start_col {
if line.starts_with(' ') {
line = &line[1..];
} else {
break;
}
}
output.push_str(line);
output.push('\n');
}
output.pop();
}

File diff suppressed because it is too large


@@ -0,0 +1,28 @@
use anyhow;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use settings::Settings;
#[derive(Deserialize, Debug)]
pub struct SemanticIndexSettings {
pub enabled: bool,
}
#[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug)]
pub struct SemanticIndexSettingsContent {
pub enabled: Option<bool>,
}
impl Settings for SemanticIndexSettings {
const KEY: Option<&'static str> = Some("semantic_index");
type FileContent = SemanticIndexSettingsContent;
fn load(
default_value: &Self::FileContent,
user_values: &[&Self::FileContent],
_: &mut gpui::AppContext,
) -> anyhow::Result<Self> {
Self::load_via_json_merge(default_value, user_values)
}
}
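Given the `semantic_index` key above, enabling the index from a user settings file would look like this (a sketch of the merged JSON shape):

```json
{
  "semantic_index": {
    "enabled": true
  }
}
```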

File diff suppressed because it is too large


@@ -3942,8 +3942,6 @@ impl std::fmt::Debug for OpenPaths {
}
}
pub struct WorkspaceCreated(pub WeakView<Workspace>);
pub fn activate_workspace_for_project(
cx: &mut AppContext,
predicate: impl Fn(&Project, &AppContext) -> bool + Send + 'static,