mirror of https://github.com/zed-industries/zed.git
synced 2024-09-18 18:08:07 +03:00

Port semantic_index to gpui2

Co-Authored-By: Julia Risley <julia@zed.dev>

This commit is contained in:
parent d433da1e70
commit 09db455db2
51  Cargo.lock  generated
@@ -8232,6 +8232,57 @@ dependencies = [
  "workspace",
 ]
 
+[[package]]
+name = "semantic_index2"
+version = "0.1.0"
+dependencies = [
+ "ai2",
+ "anyhow",
+ "async-trait",
+ "client2",
+ "collections",
+ "ctor",
+ "env_logger 0.9.3",
+ "futures 0.3.28",
+ "globset",
+ "gpui2",
+ "language2",
+ "lazy_static",
+ "log",
+ "ndarray",
+ "node_runtime",
+ "ordered-float 2.10.0",
+ "parking_lot 0.11.2",
+ "postage",
+ "pretty_assertions",
+ "project2",
+ "rand 0.8.5",
+ "rpc2",
+ "rusqlite",
+ "rust-embed",
+ "schemars",
+ "serde",
+ "serde_json",
+ "settings2",
+ "sha1",
+ "smol",
+ "tempdir",
+ "tiktoken-rs",
+ "tree-sitter",
+ "tree-sitter-cpp",
+ "tree-sitter-elixir",
+ "tree-sitter-json 0.20.0",
+ "tree-sitter-lua",
+ "tree-sitter-php",
+ "tree-sitter-ruby",
+ "tree-sitter-rust",
+ "tree-sitter-toml",
+ "tree-sitter-typescript",
+ "unindent",
+ "util",
+ "workspace2",
+]
+
 [[package]]
 name = "semver"
 version = "1.0.18"
@@ -95,6 +95,8 @@ members = [
     "crates/rpc2",
     "crates/search",
     "crates/search2",
+    "crates/semantic_index",
+    "crates/semantic_index2",
     "crates/settings",
     "crates/settings2",
     "crates/snippet",
@@ -114,7 +116,6 @@ members = [
     "crates/theme_selector2",
     "crates/ui2",
     "crates/util",
-    "crates/semantic_index",
     "crates/story",
     "crates/vim",
     "crates/vcs_menu",
@@ -7,7 +7,7 @@ pub enum ProviderCredential {
     NotNeeded,
 }
 
-pub trait CredentialProvider {
+pub trait CredentialProvider: Send + Sync {
     fn has_credentials(&self) -> bool;
     fn retrieve_credentials(&self, cx: &mut AppContext) -> ProviderCredential;
     fn save_credentials(&self, cx: &mut AppContext, credential: ProviderCredential);
@@ -35,7 +35,7 @@ pub struct OpenAIEmbeddingProvider {
     model: OpenAILanguageModel,
     credential: Arc<RwLock<ProviderCredential>>,
     pub client: Arc<dyn HttpClient>,
-    pub executor: Arc<BackgroundExecutor>,
+    pub executor: BackgroundExecutor,
     rate_limit_count_rx: watch::Receiver<Option<Instant>>,
     rate_limit_count_tx: Arc<Mutex<watch::Sender<Option<Instant>>>>,
 }
@@ -66,7 +66,7 @@ struct OpenAIEmbeddingUsage {
 }
 
 impl OpenAIEmbeddingProvider {
-    pub fn new(client: Arc<dyn HttpClient>, executor: Arc<BackgroundExecutor>) -> Self {
+    pub fn new(client: Arc<dyn HttpClient>, executor: BackgroundExecutor) -> Self {
         let (rate_limit_count_tx, rate_limit_count_rx) = watch::channel_with(None);
         let rate_limit_count_tx = Arc::new(Mutex::new(rate_limit_count_tx));
 
@@ -482,10 +482,6 @@ impl<T: 'static> WeakModel<T> {
     /// Update the entity referenced by this model with the given function if
     /// the referenced entity still exists. Returns an error if the entity has
     /// been released.
-    ///
-    /// The update function receives a context appropriate for its environment.
-    /// When updating in an `AppContext`, it receives a `ModelContext`.
-    /// When updating in a `WindowContext`, it receives a `ViewContext`.
     pub fn update<C, R>(
         &self,
         cx: &mut C,
@@ -501,6 +497,21 @@ impl<T: 'static> WeakModel<T> {
                 .map(|this| cx.update_model(&this, update)),
         )
     }
+
+    /// Reads the entity referenced by this model with the given function if
+    /// the referenced entity still exists. Returns an error if the entity has
+    /// been released.
+    pub fn read_with<C, R>(&self, cx: &C, read: impl FnOnce(&T, &AppContext) -> R) -> Result<R>
+    where
+        C: Context,
+        Result<C::Result<R>>: crate::Flatten<R>,
+    {
+        crate::Flatten::flatten(
+            self.upgrade()
+                .ok_or_else(|| anyhow!("entity release"))
+                .map(|this| cx.read_model(&this, read)),
+        )
+    }
 }
 
 impl<T> Hash for WeakModel<T> {
69  crates/semantic_index2/Cargo.toml  Normal file
@@ -0,0 +1,69 @@
[package]
name = "semantic_index2"
version = "0.1.0"
edition = "2021"
publish = false

[lib]
path = "src/semantic_index.rs"
doctest = false

[dependencies]
ai = { package = "ai2", path = "../ai2" }
collections = { path = "../collections" }
gpui = { package = "gpui2", path = "../gpui2" }
language = { package = "language2", path = "../language2" }
project = { package = "project2", path = "../project2" }
workspace = { package = "workspace2", path = "../workspace2" }
util = { path = "../util" }
rpc = { package = "rpc2", path = "../rpc2" }
settings = { package = "settings2", path = "../settings2" }
anyhow.workspace = true
postage.workspace = true
futures.workspace = true
ordered-float.workspace = true
smol.workspace = true
rusqlite.workspace = true
log.workspace = true
tree-sitter.workspace = true
lazy_static.workspace = true
serde.workspace = true
serde_json.workspace = true
async-trait.workspace = true
tiktoken-rs.workspace = true
parking_lot.workspace = true
rand.workspace = true
schemars.workspace = true
globset.workspace = true
sha1 = "0.10.5"
ndarray = { version = "0.15.0" }

[dev-dependencies]
ai = { package = "ai2", path = "../ai2", features = ["test-support"] }
collections = { path = "../collections", features = ["test-support"] }
gpui = { package = "gpui2", path = "../gpui2", features = ["test-support"] }
language = { package = "language2", path = "../language2", features = ["test-support"] }
project = { package = "project2", path = "../project2", features = ["test-support"] }
rpc = { package = "rpc2", path = "../rpc2", features = ["test-support"] }
workspace = { package = "workspace2", path = "../workspace2", features = ["test-support"] }
settings = { package = "settings2", path = "../settings2", features = ["test-support"] }
rust-embed = { version = "8.0", features = ["include-exclude"] }
client = { package = "client2", path = "../client2" }
node_runtime = { path = "../node_runtime" }

pretty_assertions.workspace = true
rand.workspace = true
unindent.workspace = true
tempdir.workspace = true
ctor.workspace = true
env_logger.workspace = true

tree-sitter-typescript.workspace = true
tree-sitter-json.workspace = true
tree-sitter-rust.workspace = true
tree-sitter-toml.workspace = true
tree-sitter-cpp.workspace = true
tree-sitter-elixir.workspace = true
tree-sitter-lua.workspace = true
tree-sitter-ruby.workspace = true
tree-sitter-php.workspace = true
20  crates/semantic_index2/README.md  Normal file
@@ -0,0 +1,20 @@
# Semantic Index

## Evaluation

### Metrics

nDCG@k:
- "The value of NDCG is determined by comparing the relevance of the items returned by the search engine to the relevance of the item that a hypothetical 'ideal' search engine would return."
- "The relevance of a result is represented by a score (also known as a 'grade') that is assigned to the search query. The scores of these results are then discounted based on their position in the search results -- did they get recommended first or last?"

MRR@k:
- "Mean reciprocal rank quantifies the rank of the first relevant item found in the recommendation list."

MAP@k:
- "Mean average precision averages the precision@k metric at each relevant item position in the recommendation list."
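In symbols, these are the standard definitions (with $rel_i$ the relevance grade at rank $i$, $\mathrm{IDCG@k}$ the DCG of the ideal ordering, $\mathrm{rank}_q$ the position of the first relevant result for query $q$, $R_q$ the number of relevant results for $q$, and $P_q(i)$ the precision at cut-off $i$):

```latex
\mathrm{DCG@k}  = \sum_{i=1}^{k} \frac{rel_i}{\log_2(i + 1)}, \qquad
\mathrm{nDCG@k} = \frac{\mathrm{DCG@k}}{\mathrm{IDCG@k}}

\mathrm{MRR@k}  = \frac{1}{|Q|} \sum_{q \in Q} \frac{1}{\mathrm{rank}_q}, \qquad
\mathrm{MAP@k}  = \frac{1}{|Q|} \sum_{q \in Q} \frac{1}{R_q} \sum_{i=1}^{k} P_q(i) \, rel_q(i)
```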
Resources:
- [Evaluating recommendation metrics](https://www.shaped.ai/blog/evaluating-recommendation-systems-map-mmr-ndcg)
- [Math Walkthrough](https://towardsdatascience.com/demystifying-ndcg-bee3be58cfe0)
114  crates/semantic_index2/eval/gpt-engineer.json  Normal file
@@ -0,0 +1,114 @@
{
    "repo": "https://github.com/AntonOsika/gpt-engineer.git",
    "commit": "7735a6445bae3611c62f521e6464c67c957f87c2",
    "assertions": [
        {
            "query": "How do I contribute to this project?",
            "matches": [
                ".github/CONTRIBUTING.md:1",
                "ROADMAP.md:48"
            ]
        },
        {
            "query": "What version of the openai package is active?",
            "matches": [
                "pyproject.toml:14"
            ]
        },
        {
            "query": "Ask user for clarification",
            "matches": [
                "gpt_engineer/steps.py:69"
            ]
        },
        {
            "query": "generate tests for python code",
            "matches": [
                "gpt_engineer/steps.py:153"
            ]
        },
        {
            "query": "get item from database based on key",
            "matches": [
                "gpt_engineer/db.py:42",
                "gpt_engineer/db.py:68"
            ]
        },
        {
            "query": "prompt user to select files",
            "matches": [
                "gpt_engineer/file_selector.py:171",
                "gpt_engineer/file_selector.py:306",
                "gpt_engineer/file_selector.py:289",
                "gpt_engineer/file_selector.py:234"
            ]
        },
        {
            "query": "send to rudderstack",
            "matches": [
                "gpt_engineer/collect.py:11",
                "gpt_engineer/collect.py:38"
            ]
        },
        {
            "query": "parse code blocks from chat messages",
            "matches": [
                "gpt_engineer/chat_to_files.py:10",
                "docs/intro/chat_parsing.md:1"
            ]
        },
        {
            "query": "how do I use the docker cli?",
            "matches": [
                "docker/README.md:1"
            ]
        },
        {
            "query": "ask the user if the code ran successfully?",
            "matches": [
                "gpt_engineer/learning.py:54"
            ]
        },
        {
            "query": "how is consent granted by the user?",
            "matches": [
                "gpt_engineer/learning.py:107",
                "gpt_engineer/learning.py:130",
                "gpt_engineer/learning.py:152"
            ]
        },
        {
            "query": "what are all the different steps the agent can take?",
            "matches": [
                "docs/intro/steps_module.md:1",
                "gpt_engineer/steps.py:391"
            ]
        },
        {
            "query": "ask the user for clarification?",
            "matches": [
                "gpt_engineer/steps.py:69"
            ]
        },
        {
            "query": "what models are available?",
            "matches": [
                "gpt_engineer/ai.py:315",
                "gpt_engineer/ai.py:341",
                "docs/open-models.md:1"
            ]
        },
        {
            "query": "what is the current focus of the project?",
            "matches": [
                "ROADMAP.md:11"
            ]
        },
        {
            "query": "does the agent know how to fix code?",
            "matches": [
                "gpt_engineer/steps.py:367"
            ]
        }
    ]
}
104  crates/semantic_index2/eval/tree-sitter.json  Normal file
@@ -0,0 +1,104 @@
{
    "repo": "https://github.com/tree-sitter/tree-sitter.git",
    "commit": "46af27796a76c72d8466627d499f2bca4af958ee",
    "assertions": [
        {
            "query": "What attributes are available for the tags configuration struct?",
            "matches": [
                "tags/src/lib.rs:24"
            ]
        },
        {
            "query": "create a new tag configuration",
            "matches": [
                "tags/src/lib.rs:119"
            ]
        },
        {
            "query": "generate tags based on config",
            "matches": [
                "tags/src/lib.rs:261"
            ]
        },
        {
            "query": "match on ts quantifier in rust",
            "matches": [
                "lib/binding_rust/lib.rs:139"
            ]
        },
        {
            "query": "cli command to generate tags",
            "matches": [
                "cli/src/tags.rs:10"
            ]
        },
        {
            "query": "what version of the tree-sitter-tags package is active?",
            "matches": [
                "tags/Cargo.toml:4"
            ]
        },
        {
            "query": "Insert a new parse state",
            "matches": [
                "cli/src/generate/build_tables/build_parse_table.rs:153"
            ]
        },
        {
            "query": "Handle conflict when numerous actions occur on the same symbol",
            "matches": [
                "cli/src/generate/build_tables/build_parse_table.rs:363",
                "cli/src/generate/build_tables/build_parse_table.rs:442"
            ]
        },
        {
            "query": "Match based on associativity of actions",
            "matches": [
                "cli/src/generate/build_tables/build_parse_table.rs:542"
            ]
        },
        {
            "query": "Format token set display",
            "matches": [
                "cli/src/generate/build_tables/item.rs:246"
            ]
        },
        {
            "query": "extract choices from rule",
            "matches": [
                "cli/src/generate/prepare_grammar/flatten_grammar.rs:124"
            ]
        },
        {
            "query": "How do we identify if a symbol is being used?",
            "matches": [
                "cli/src/generate/prepare_grammar/flatten_grammar.rs:175"
            ]
        },
        {
            "query": "How do we launch the playground?",
            "matches": [
                "cli/src/playground.rs:46"
            ]
        },
        {
            "query": "How do we test treesitter query matches in rust?",
            "matches": [
                "cli/src/query_testing.rs:152",
                "cli/src/tests/query_test.rs:781",
                "cli/src/tests/query_test.rs:2163",
                "cli/src/tests/query_test.rs:3781",
                "cli/src/tests/query_test.rs:887"
            ]
        },
        {
            "query": "What does the CLI do?",
            "matches": [
                "cli/README.md:10",
                "cli/loader/README.md:3",
                "docs/section-5-implementation.md:14",
                "docs/section-5-implementation.md:18"
            ]
        }
    ]
}
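Both eval files share one shape: a pinned repo and commit, plus a list of queries, each paired with the "path:line" spans an ideal retrieval would surface. A minimal serde sketch of that format (struct and field names here are illustrative, not taken from the crate's eval harness):

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct EvalFile {
    /// Repository to clone and index.
    repo: String,
    /// Commit to check out, keeping results reproducible.
    commit: String,
    /// Queries paired with the spans an ideal search would return.
    assertions: Vec<Assertion>,
}

#[derive(Debug, Deserialize)]
struct Assertion {
    query: String,
    /// Expected matches, encoded as "relative/path:line".
    matches: Vec<String>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The path is an assumption; point this at either eval file.
    let raw = std::fs::read_to_string("crates/semantic_index2/eval/tree-sitter.json")?;
    let eval: EvalFile = serde_json::from_str(&raw)?;
    println!("{} assertions against {}", eval.assertions.len(), eval.repo);
    Ok(())
}
```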
603  crates/semantic_index2/src/db.rs  Normal file
@@ -0,0 +1,603 @@
use crate::{
    parsing::{Span, SpanDigest},
    SEMANTIC_INDEX_VERSION,
};
use ai::embedding::Embedding;
use anyhow::{anyhow, Context, Result};
use collections::HashMap;
use futures::channel::oneshot;
use gpui::BackgroundExecutor;
use ndarray::{Array1, Array2};
use ordered_float::OrderedFloat;
use project::Fs;
use rpc::proto::Timestamp;
use rusqlite::params;
use rusqlite::types::Value;
use std::{
    future::Future,
    ops::Range,
    path::{Path, PathBuf},
    rc::Rc,
    sync::Arc,
    time::SystemTime,
};
use util::{paths::PathMatcher, TryFutureExt};

/// Indices of `data` sorted so the largest value comes first,
/// e.g. argsort(&[3, 1, 2]) == [0, 2, 1].
pub fn argsort<T: Ord>(data: &[T]) -> Vec<usize> {
    let mut indices = (0..data.len()).collect::<Vec<_>>();
    indices.sort_by_key(|&i| &data[i]);
    indices.reverse();
    indices
}

#[derive(Debug)]
pub struct FileRecord {
    pub id: usize,
    pub relative_path: String,
    pub mtime: Timestamp,
}

#[derive(Clone)]
pub struct VectorDatabase {
    path: Arc<Path>,
    transactions:
        smol::channel::Sender<Box<dyn 'static + Send + FnOnce(&mut rusqlite::Connection)>>,
}

impl VectorDatabase {
    pub async fn new(
        fs: Arc<dyn Fs>,
        path: Arc<Path>,
        executor: BackgroundExecutor,
    ) -> Result<Self> {
        if let Some(db_directory) = path.parent() {
            fs.create_dir(db_directory).await?;
        }

        let (transactions_tx, transactions_rx) = smol::channel::unbounded::<
            Box<dyn 'static + Send + FnOnce(&mut rusqlite::Connection)>,
        >();
        executor
            .spawn({
                let path = path.clone();
                async move {
                    let mut connection = rusqlite::Connection::open(&path)?;

                    connection.pragma_update(None, "journal_mode", "wal")?;
                    connection.pragma_update(None, "synchronous", "normal")?;
                    connection.pragma_update(None, "cache_size", 1000000)?;
                    connection.pragma_update(None, "temp_store", "MEMORY")?;

                    while let Ok(transaction) = transactions_rx.recv().await {
                        transaction(&mut connection);
                    }

                    anyhow::Ok(())
                }
                .log_err()
            })
            .detach();
        let this = Self {
            transactions: transactions_tx,
            path,
        };
        this.initialize_database().await?;
        Ok(this)
    }

    pub fn path(&self) -> &Arc<Path> {
        &self.path
    }

    fn transact<F, T>(&self, f: F) -> impl Future<Output = Result<T>>
    where
        F: 'static + Send + FnOnce(&rusqlite::Transaction) -> Result<T>,
        T: 'static + Send,
    {
        let (tx, rx) = oneshot::channel();
        let transactions = self.transactions.clone();
        async move {
            if transactions
                .send(Box::new(|connection| {
                    let result = connection
                        .transaction()
                        .map_err(|err| anyhow!(err))
                        .and_then(|transaction| {
                            let result = f(&transaction)?;
                            transaction.commit()?;
                            Ok(result)
                        });
                    let _ = tx.send(result);
                }))
                .await
                .is_err()
            {
                return Err(anyhow!("connection was dropped"))?;
            }
            rx.await?
        }
    }

    fn initialize_database(&self) -> impl Future<Output = Result<()>> {
        self.transact(|db| {
            rusqlite::vtab::array::load_module(&db)?;

            // Delete existing tables, if SEMANTIC_INDEX_VERSION is bumped
            let version_query = db.prepare("SELECT version from semantic_index_config");
            let version = version_query
                .and_then(|mut query| query.query_row([], |row| Ok(row.get::<_, i64>(0)?)));
            if version.map_or(false, |version| version == SEMANTIC_INDEX_VERSION as i64) {
                log::trace!("vector database schema up to date");
                return Ok(());
            }

            log::trace!("vector database schema out of date. updating...");
            // We renamed the `documents` table to `spans`, so we want to drop
            // `documents` without recreating it if it exists.
            db.execute("DROP TABLE IF EXISTS documents", [])
                .context("failed to drop 'documents' table")?;
            db.execute("DROP TABLE IF EXISTS spans", [])
                .context("failed to drop 'spans' table")?;
            db.execute("DROP TABLE IF EXISTS files", [])
                .context("failed to drop 'files' table")?;
            db.execute("DROP TABLE IF EXISTS worktrees", [])
                .context("failed to drop 'worktrees' table")?;
            db.execute("DROP TABLE IF EXISTS semantic_index_config", [])
                .context("failed to drop 'semantic_index_config' table")?;

            // Initialize Vector Databasing Tables
            db.execute(
                "CREATE TABLE semantic_index_config (
                    version INTEGER NOT NULL
                )",
                [],
            )?;

            db.execute(
                "INSERT INTO semantic_index_config (version) VALUES (?1)",
                params![SEMANTIC_INDEX_VERSION],
            )?;

            db.execute(
                "CREATE TABLE worktrees (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    absolute_path VARCHAR NOT NULL
                );
                CREATE UNIQUE INDEX worktrees_absolute_path ON worktrees (absolute_path);
                ",
                [],
            )?;

            db.execute(
                "CREATE TABLE files (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    worktree_id INTEGER NOT NULL,
                    relative_path VARCHAR NOT NULL,
                    mtime_seconds INTEGER NOT NULL,
                    mtime_nanos INTEGER NOT NULL,
                    FOREIGN KEY(worktree_id) REFERENCES worktrees(id) ON DELETE CASCADE
                )",
                [],
            )?;

            db.execute(
                "CREATE UNIQUE INDEX files_worktree_id_and_relative_path ON files (worktree_id, relative_path)",
                [],
            )?;

            db.execute(
                "CREATE TABLE spans (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL,
                    start_byte INTEGER NOT NULL,
                    end_byte INTEGER NOT NULL,
                    name VARCHAR NOT NULL,
                    embedding BLOB NOT NULL,
                    digest BLOB NOT NULL,
                    FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
                )",
                [],
            )?;
            db.execute("CREATE INDEX spans_digest ON spans (digest)", [])?;

            log::trace!("vector database initialized with updated schema.");
            Ok(())
        })
    }

    pub fn delete_file(
        &self,
        worktree_id: i64,
        delete_path: Arc<Path>,
    ) -> impl Future<Output = Result<()>> {
        self.transact(move |db| {
            db.execute(
                "DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2",
                params![worktree_id, delete_path.to_str()],
            )?;
            Ok(())
        })
    }

    pub fn insert_file(
        &self,
        worktree_id: i64,
        path: Arc<Path>,
        mtime: SystemTime,
        spans: Vec<Span>,
    ) -> impl Future<Output = Result<()>> {
        self.transact(move |db| {
            // Return the existing ID, if both the file and mtime match
            let mtime = Timestamp::from(mtime);

            db.execute(
                "
                REPLACE INTO files
                (worktree_id, relative_path, mtime_seconds, mtime_nanos)
                VALUES (?1, ?2, ?3, ?4)
                ",
                params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
            )?;

            let file_id = db.last_insert_rowid();

            let mut query = db.prepare(
                "
                INSERT INTO spans
                (file_id, start_byte, end_byte, name, embedding, digest)
                VALUES (?1, ?2, ?3, ?4, ?5, ?6)
                ",
            )?;

            for span in spans {
                query.execute(params![
                    file_id,
                    span.range.start.to_string(),
                    span.range.end.to_string(),
                    span.name,
                    span.embedding,
                    span.digest
                ])?;
            }

            Ok(())
        })
    }

    pub fn worktree_previously_indexed(
        &self,
        worktree_root_path: &Path,
    ) -> impl Future<Output = Result<bool>> {
        let worktree_root_path = worktree_root_path.to_string_lossy().into_owned();
        self.transact(move |db| {
            let mut worktree_query =
                db.prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?;
            let worktree_id = worktree_query
                .query_row(params![worktree_root_path], |row| Ok(row.get::<_, i64>(0)?));

            if worktree_id.is_ok() {
                return Ok(true);
            } else {
                return Ok(false);
            }
        })
    }

    pub fn embeddings_for_digests(
        &self,
        digests: Vec<SpanDigest>,
    ) -> impl Future<Output = Result<HashMap<SpanDigest, Embedding>>> {
        self.transact(move |db| {
            let mut query = db.prepare(
                "
                SELECT digest, embedding
                FROM spans
                WHERE digest IN rarray(?)
                ",
            )?;
            let mut embeddings_by_digest = HashMap::default();
            let digests = Rc::new(
                digests
                    .into_iter()
                    .map(|p| Value::Blob(p.0.to_vec()))
                    .collect::<Vec<_>>(),
            );
            let rows = query.query_map(params![digests], |row| {
                Ok((row.get::<_, SpanDigest>(0)?, row.get::<_, Embedding>(1)?))
            })?;

            for row in rows {
                if let Ok(row) = row {
                    embeddings_by_digest.insert(row.0, row.1);
                }
            }

            Ok(embeddings_by_digest)
        })
    }

    pub fn embeddings_for_files(
        &self,
        worktree_id_file_paths: HashMap<i64, Vec<Arc<Path>>>,
    ) -> impl Future<Output = Result<HashMap<SpanDigest, Embedding>>> {
        self.transact(move |db| {
            let mut query = db.prepare(
                "
                SELECT digest, embedding
                FROM spans
                LEFT JOIN files ON files.id = spans.file_id
                WHERE files.worktree_id = ? AND files.relative_path IN rarray(?)
                ",
            )?;
            let mut embeddings_by_digest = HashMap::default();
            for (worktree_id, file_paths) in worktree_id_file_paths {
                let file_paths = Rc::new(
                    file_paths
                        .into_iter()
                        .map(|p| Value::Text(p.to_string_lossy().into_owned()))
                        .collect::<Vec<_>>(),
                );
                let rows = query.query_map(params![worktree_id, file_paths], |row| {
                    Ok((row.get::<_, SpanDigest>(0)?, row.get::<_, Embedding>(1)?))
                })?;

                for row in rows {
                    if let Ok(row) = row {
                        embeddings_by_digest.insert(row.0, row.1);
                    }
                }
            }

            Ok(embeddings_by_digest)
        })
    }

    pub fn find_or_create_worktree(
        &self,
        worktree_root_path: Arc<Path>,
    ) -> impl Future<Output = Result<i64>> {
        self.transact(move |db| {
            let mut worktree_query =
                db.prepare("SELECT id FROM worktrees WHERE absolute_path = ?1")?;
            let worktree_id = worktree_query
                .query_row(params![worktree_root_path.to_string_lossy()], |row| {
                    Ok(row.get::<_, i64>(0)?)
                });

            if worktree_id.is_ok() {
                return Ok(worktree_id?);
            }

            // If worktree_id is Err, insert new worktree
            db.execute(
                "INSERT into worktrees (absolute_path) VALUES (?1)",
                params![worktree_root_path.to_string_lossy()],
            )?;
            Ok(db.last_insert_rowid())
        })
    }

    pub fn get_file_mtimes(
        &self,
        worktree_id: i64,
    ) -> impl Future<Output = Result<HashMap<PathBuf, SystemTime>>> {
        self.transact(move |db| {
            let mut statement = db.prepare(
                "
                SELECT relative_path, mtime_seconds, mtime_nanos
                FROM files
                WHERE worktree_id = ?1
                ORDER BY relative_path",
            )?;
            let mut result: HashMap<PathBuf, SystemTime> = HashMap::default();
            for row in statement.query_map(params![worktree_id], |row| {
                Ok((
                    row.get::<_, String>(0)?.into(),
                    Timestamp {
                        seconds: row.get(1)?,
                        nanos: row.get(2)?,
                    }
                    .into(),
                ))
            })? {
                let row = row?;
                result.insert(row.0, row.1);
            }
            Ok(result)
        })
    }

    pub fn top_k_search(
        &self,
        query_embedding: &Embedding,
        limit: usize,
        file_ids: &[i64],
    ) -> impl Future<Output = Result<Vec<(i64, OrderedFloat<f32>)>>> {
        let file_ids = file_ids.to_vec();
        let query = query_embedding.clone().0;
        let query = Array1::from_vec(query);
        self.transact(move |db| {
            let mut query_statement = db.prepare(
                "
                SELECT
                    id, embedding
                FROM
                    spans
                WHERE
                    file_id IN rarray(?)
                ",
            )?;

            let deserialized_rows = query_statement
                .query_map(params![ids_to_sql(&file_ids)], |row| {
                    Ok((row.get::<_, usize>(0)?, row.get::<_, Embedding>(1)?))
                })?
                .filter_map(|row| row.ok())
                .collect::<Vec<(usize, Embedding)>>();

            if deserialized_rows.len() == 0 {
                return Ok(Vec::new());
            }

            // Get Length of Embeddings Returned
            let embedding_len = deserialized_rows[0].1 .0.len();

            let batch_n = 1000;
            let mut batches = Vec::new();
            let mut batch_ids = Vec::new();
            let mut batch_embeddings: Vec<f32> = Vec::new();
            deserialized_rows.iter().for_each(|(id, embedding)| {
                batch_ids.push(id);
                batch_embeddings.extend(&embedding.0);

                if batch_ids.len() == batch_n {
                    let embeddings = std::mem::take(&mut batch_embeddings);
                    let ids = std::mem::take(&mut batch_ids);
                    let array =
                        Array2::from_shape_vec((ids.len(), embedding_len.clone()), embeddings);
                    match array {
                        Ok(array) => {
                            batches.push((ids, array));
                        }
                        Err(err) => log::error!("Failed to deserialize to ndarray: {:?}", err),
                    }
                }
            });

            if batch_ids.len() > 0 {
                let array = Array2::from_shape_vec(
                    (batch_ids.len(), embedding_len),
                    batch_embeddings.clone(),
                );
                match array {
                    Ok(array) => {
                        batches.push((batch_ids.clone(), array));
                    }
                    Err(err) => log::error!("Failed to deserialize to ndarray: {:?}", err),
                }
            }

            let mut ids: Vec<usize> = Vec::new();
            let mut results = Vec::new();
            for (batch_ids, array) in batches {
                // Dot product of each stored embedding against the query; for
                // unit-length embeddings (as OpenAI's are) this is cosine similarity.
                let scores = array
                    .dot(&query.t())
                    .to_vec()
                    .iter()
                    .map(|score| OrderedFloat(*score))
                    .collect::<Vec<OrderedFloat<f32>>>();
                results.extend(scores);
                ids.extend(batch_ids);
            }

            let sorted_idx = argsort(&results);
            let mut sorted_results = Vec::new();
            let last_idx = limit.min(sorted_idx.len());
            for idx in &sorted_idx[0..last_idx] {
                sorted_results.push((ids[*idx] as i64, results[*idx]))
            }

            Ok(sorted_results)
        })
    }

    pub fn retrieve_included_file_ids(
        &self,
        worktree_ids: &[i64],
        includes: &[PathMatcher],
        excludes: &[PathMatcher],
    ) -> impl Future<Output = Result<Vec<i64>>> {
        let worktree_ids = worktree_ids.to_vec();
        let includes = includes.to_vec();
        let excludes = excludes.to_vec();
        self.transact(move |db| {
            let mut file_query = db.prepare(
                "
                SELECT
                    id, relative_path
                FROM
                    files
                WHERE
                    worktree_id IN rarray(?)
                ",
            )?;

            let mut file_ids = Vec::<i64>::new();
            let mut rows = file_query.query([ids_to_sql(&worktree_ids)])?;

            while let Some(row) = rows.next()? {
                let file_id = row.get(0)?;
                let relative_path = row.get_ref(1)?.as_str()?;
                let included =
                    includes.is_empty() || includes.iter().any(|glob| glob.is_match(relative_path));
                let excluded = excludes.iter().any(|glob| glob.is_match(relative_path));
                if included && !excluded {
                    file_ids.push(file_id);
                }
            }

            anyhow::Ok(file_ids)
        })
    }

    pub fn spans_for_ids(
        &self,
        ids: &[i64],
    ) -> impl Future<Output = Result<Vec<(i64, PathBuf, Range<usize>)>>> {
        let ids = ids.to_vec();
        self.transact(move |db| {
            let mut statement = db.prepare(
                "
                SELECT
                    spans.id,
                    files.worktree_id,
                    files.relative_path,
                    spans.start_byte,
                    spans.end_byte
                FROM
                    spans, files
                WHERE
                    spans.file_id = files.id AND
                    spans.id in rarray(?)
                ",
            )?;

            let result_iter = statement.query_map(params![ids_to_sql(&ids)], |row| {
                Ok((
                    row.get::<_, i64>(0)?,
                    row.get::<_, i64>(1)?,
                    row.get::<_, String>(2)?.into(),
                    row.get(3)?..row.get(4)?,
                ))
            })?;

            let mut values_by_id = HashMap::<i64, (i64, PathBuf, Range<usize>)>::default();
            for row in result_iter {
                let (id, worktree_id, path, range) = row?;
                values_by_id.insert(id, (worktree_id, path, range));
            }

            let mut results = Vec::with_capacity(ids.len());
            for id in &ids {
                let value = values_by_id
                    .remove(id)
                    .ok_or(anyhow!("missing span id {}", id))?;
                results.push(value);
            }

            Ok(results)
        })
    }
}

fn ids_to_sql(ids: &[i64]) -> Rc<Vec<rusqlite::types::Value>> {
    Rc::new(
        ids.iter()
            .copied()
            .map(|v| rusqlite::types::Value::from(v))
            .collect::<Vec<_>>(),
    )
}
169  crates/semantic_index2/src/embedding_queue.rs  Normal file
@@ -0,0 +1,169 @@
use crate::{parsing::Span, JobHandle};
use ai::embedding::EmbeddingProvider;
use gpui::BackgroundExecutor;
use parking_lot::Mutex;
use smol::channel;
use std::{mem, ops::Range, path::Path, sync::Arc, time::SystemTime};

#[derive(Clone)]
pub struct FileToEmbed {
    pub worktree_id: i64,
    pub path: Arc<Path>,
    pub mtime: SystemTime,
    pub spans: Vec<Span>,
    pub job_handle: JobHandle,
}

impl std::fmt::Debug for FileToEmbed {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("FileToEmbed")
            .field("worktree_id", &self.worktree_id)
            .field("path", &self.path)
            .field("mtime", &self.mtime)
            .field("spans", &self.spans)
            .finish_non_exhaustive()
    }
}

impl PartialEq for FileToEmbed {
    fn eq(&self, other: &Self) -> bool {
        self.worktree_id == other.worktree_id
            && self.path == other.path
            && self.mtime == other.mtime
            && self.spans == other.spans
    }
}

pub struct EmbeddingQueue {
    embedding_provider: Arc<dyn EmbeddingProvider>,
    pending_batch: Vec<FileFragmentToEmbed>,
    executor: BackgroundExecutor,
    pending_batch_token_count: usize,
    finished_files_tx: channel::Sender<FileToEmbed>,
    finished_files_rx: channel::Receiver<FileToEmbed>,
}

#[derive(Clone)]
pub struct FileFragmentToEmbed {
    file: Arc<Mutex<FileToEmbed>>,
    span_range: Range<usize>,
}

impl EmbeddingQueue {
    pub fn new(
        embedding_provider: Arc<dyn EmbeddingProvider>,
        executor: BackgroundExecutor,
    ) -> Self {
        let (finished_files_tx, finished_files_rx) = channel::unbounded();
        Self {
            embedding_provider,
            executor,
            pending_batch: Vec::new(),
            pending_batch_token_count: 0,
            finished_files_tx,
            finished_files_rx,
        }
    }

    pub fn push(&mut self, file: FileToEmbed) {
        if file.spans.is_empty() {
            self.finished_files_tx.try_send(file).unwrap();
            return;
        }

        let file = Arc::new(Mutex::new(file));

        self.pending_batch.push(FileFragmentToEmbed {
            file: file.clone(),
            span_range: 0..0,
        });

        let mut fragment_range = &mut self.pending_batch.last_mut().unwrap().span_range;
        for (ix, span) in file.lock().spans.iter().enumerate() {
            // Spans that already carry an embedding cost nothing against the batch budget.
            let span_token_count = if span.embedding.is_none() {
                span.token_count
            } else {
                0
            };

            let next_token_count = self.pending_batch_token_count + span_token_count;
            if next_token_count > self.embedding_provider.max_tokens_per_batch() {
                let range_end = fragment_range.end;
                self.flush();
                self.pending_batch.push(FileFragmentToEmbed {
                    file: file.clone(),
                    span_range: range_end..range_end,
                });
                fragment_range = &mut self.pending_batch.last_mut().unwrap().span_range;
            }

            fragment_range.end = ix + 1;
            self.pending_batch_token_count += span_token_count;
        }
    }

    pub fn flush(&mut self) {
        let batch = mem::take(&mut self.pending_batch);
        self.pending_batch_token_count = 0;
        if batch.is_empty() {
            return;
        }

        let finished_files_tx = self.finished_files_tx.clone();
        let embedding_provider = self.embedding_provider.clone();

        self.executor
            .spawn(async move {
                let mut spans = Vec::new();
                for fragment in &batch {
                    let file = fragment.file.lock();
                    spans.extend(
                        file.spans[fragment.span_range.clone()]
                            .iter()
                            .filter(|d| d.embedding.is_none())
                            .map(|d| d.content.clone()),
                    );
                }

                // If there are no spans to embed, just send each file to the finished
                // files channel once its last fragment is dropped.
                if spans.is_empty() {
                    for fragment in batch.clone() {
                        if let Some(file) = Arc::into_inner(fragment.file) {
                            finished_files_tx.try_send(file.into_inner()).unwrap();
                        }
                    }
                    return;
                }

                match embedding_provider.embed_batch(spans).await {
                    Ok(embeddings) => {
                        let mut embeddings = embeddings.into_iter();
                        for fragment in batch {
                            for span in &mut fragment.file.lock().spans[fragment.span_range.clone()]
                                .iter_mut()
                                .filter(|d| d.embedding.is_none())
                            {
                                if let Some(embedding) = embeddings.next() {
                                    span.embedding = Some(embedding);
                                } else {
                                    log::error!("number of embeddings != number of documents");
                                }
                            }

                            if let Some(file) = Arc::into_inner(fragment.file) {
                                finished_files_tx.try_send(file.into_inner()).unwrap();
                            }
                        }
                    }
                    Err(error) => {
                        log::error!("{:?}", error);
                    }
                }
            })
            .detach();
    }

    pub fn finished_files(&self) -> channel::Receiver<FileToEmbed> {
        self.finished_files_rx.clone()
    }
}
414  crates/semantic_index2/src/parsing.rs  Normal file
@@ -0,0 +1,414 @@
use ai::{
    embedding::{Embedding, EmbeddingProvider},
    models::TruncationDirection,
};
use anyhow::{anyhow, Result};
use language::{Grammar, Language};
use rusqlite::{
    types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
    ToSql,
};
use sha1::{Digest, Sha1};
use std::{
    borrow::Cow,
    cmp::{self, Reverse},
    collections::HashSet,
    ops::Range,
    path::Path,
    sync::Arc,
};
use tree_sitter::{Parser, QueryCursor};

#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub struct SpanDigest(pub [u8; 20]);

impl FromSql for SpanDigest {
    fn column_result(value: ValueRef) -> FromSqlResult<Self> {
        let blob = value.as_blob()?;
        let bytes =
            blob.try_into()
                .map_err(|_| rusqlite::types::FromSqlError::InvalidBlobSize {
                    expected_size: 20,
                    blob_size: blob.len(),
                })?;
        return Ok(SpanDigest(bytes));
    }
}

impl ToSql for SpanDigest {
    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
        self.0.to_sql()
    }
}

impl From<&'_ str> for SpanDigest {
    fn from(value: &'_ str) -> Self {
        let mut sha1 = Sha1::new();
        sha1.update(value);
        Self(sha1.finalize().into())
    }
}

#[derive(Debug, PartialEq, Clone)]
pub struct Span {
    pub name: String,
    pub range: Range<usize>,
    pub content: String,
    pub embedding: Option<Embedding>,
    pub digest: SpanDigest,
    pub token_count: usize,
}

const CODE_CONTEXT_TEMPLATE: &str =
    "The below code snippet is from file '<path>'\n\n```<language>\n<item>\n```";
const ENTIRE_FILE_TEMPLATE: &str =
    "The below snippet is from file '<path>'\n\n```<language>\n<item>\n```";
const MARKDOWN_CONTEXT_TEMPLATE: &str = "The below file contents is from file '<path>'\n\n<item>";
pub const PARSEABLE_ENTIRE_FILE_TYPES: &[&str] = &[
    "TOML", "YAML", "CSS", "HEEX", "ERB", "SVELTE", "HTML", "Scheme",
];

pub struct CodeContextRetriever {
    pub parser: Parser,
    pub cursor: QueryCursor,
    pub embedding_provider: Arc<dyn EmbeddingProvider>,
}

// Every match has an item; this represents the fundamental tree-sitter symbol and anchors the search.
// Every match has one or more 'name' captures. These indicate the display range of the item for deduplication.
// If there are preceding comments, we track this with a context capture.
// If there is a piece that should be collapsed in hierarchical queries, we capture it with a collapse capture.
// If there is a piece that should be kept inside a collapsed node, we capture it with a keep capture.
#[derive(Debug, Clone)]
pub struct CodeContextMatch {
    pub start_col: usize,
    pub item_range: Option<Range<usize>>,
    pub name_range: Option<Range<usize>>,
    pub context_ranges: Vec<Range<usize>>,
    pub collapse_ranges: Vec<Range<usize>>,
}

impl CodeContextRetriever {
    pub fn new(embedding_provider: Arc<dyn EmbeddingProvider>) -> Self {
        Self {
            parser: Parser::new(),
            cursor: QueryCursor::new(),
            embedding_provider,
        }
    }

    fn parse_entire_file(
        &self,
        relative_path: Option<&Path>,
        language_name: Arc<str>,
        content: &str,
    ) -> Result<Vec<Span>> {
        let document_span = ENTIRE_FILE_TEMPLATE
            .replace(
                "<path>",
                &relative_path.map_or(Cow::Borrowed("untitled"), |path| path.to_string_lossy()),
            )
            .replace("<language>", language_name.as_ref())
            .replace("<item>", &content);
        let digest = SpanDigest::from(document_span.as_str());
        let model = self.embedding_provider.base_model();
        let document_span = model.truncate(
            &document_span,
            model.capacity()?,
            ai::models::TruncationDirection::End,
        )?;
        let token_count = model.count_tokens(&document_span)?;

        Ok(vec![Span {
            range: 0..content.len(),
            content: document_span,
            embedding: Default::default(),
            name: language_name.to_string(),
            digest,
            token_count,
        }])
    }

    fn parse_markdown_file(
        &self,
        relative_path: Option<&Path>,
        content: &str,
    ) -> Result<Vec<Span>> {
        let document_span = MARKDOWN_CONTEXT_TEMPLATE
            .replace(
                "<path>",
                &relative_path.map_or(Cow::Borrowed("untitled"), |path| path.to_string_lossy()),
            )
            .replace("<item>", &content);
        let digest = SpanDigest::from(document_span.as_str());

        let model = self.embedding_provider.base_model();
        let document_span = model.truncate(
            &document_span,
            model.capacity()?,
            ai::models::TruncationDirection::End,
        )?;
        let token_count = model.count_tokens(&document_span)?;

        Ok(vec![Span {
            range: 0..content.len(),
            content: document_span,
            embedding: None,
            name: "Markdown".to_string(),
            digest,
            token_count,
        }])
    }

    fn get_matches_in_file(
        &mut self,
        content: &str,
        grammar: &Arc<Grammar>,
    ) -> Result<Vec<CodeContextMatch>> {
        let embedding_config = grammar
            .embedding_config
            .as_ref()
            .ok_or_else(|| anyhow!("no embedding queries"))?;
        self.parser.set_language(grammar.ts_language).unwrap();

        let tree = self
            .parser
            .parse(&content, None)
            .ok_or_else(|| anyhow!("parsing failed"))?;

        let mut captures: Vec<CodeContextMatch> = Vec::new();
        let mut collapse_ranges: Vec<Range<usize>> = Vec::new();
        let mut keep_ranges: Vec<Range<usize>> = Vec::new();
        for mat in self.cursor.matches(
            &embedding_config.query,
            tree.root_node(),
            content.as_bytes(),
        ) {
            let mut start_col = 0;
            let mut item_range: Option<Range<usize>> = None;
            let mut name_range: Option<Range<usize>> = None;
            let mut context_ranges: Vec<Range<usize>> = Vec::new();
            collapse_ranges.clear();
            keep_ranges.clear();
            for capture in mat.captures {
                if capture.index == embedding_config.item_capture_ix {
                    item_range = Some(capture.node.byte_range());
                    start_col = capture.node.start_position().column;
                } else if Some(capture.index) == embedding_config.name_capture_ix {
                    name_range = Some(capture.node.byte_range());
                } else if Some(capture.index) == embedding_config.context_capture_ix {
                    context_ranges.push(capture.node.byte_range());
                } else if Some(capture.index) == embedding_config.collapse_capture_ix {
                    collapse_ranges.push(capture.node.byte_range());
                } else if Some(capture.index) == embedding_config.keep_capture_ix {
                    keep_ranges.push(capture.node.byte_range());
                }
            }

            captures.push(CodeContextMatch {
                start_col,
                item_range,
                name_range,
                context_ranges,
                collapse_ranges: subtract_ranges(&collapse_ranges, &keep_ranges),
            });
        }
        Ok(captures)
    }

    pub fn parse_file_with_template(
        &mut self,
        relative_path: Option<&Path>,
        content: &str,
        language: Arc<Language>,
    ) -> Result<Vec<Span>> {
        let language_name = language.name();

        if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language_name.as_ref()) {
            return self.parse_entire_file(relative_path, language_name, &content);
        } else if ["Markdown", "Plain Text"].contains(&language_name.as_ref()) {
            return self.parse_markdown_file(relative_path, &content);
        }

        let mut spans = self.parse_file(content, language)?;
        for span in &mut spans {
            let document_content = CODE_CONTEXT_TEMPLATE
                .replace(
                    "<path>",
                    &relative_path.map_or(Cow::Borrowed("untitled"), |path| path.to_string_lossy()),
                )
                .replace("<language>", language_name.as_ref())
                .replace("<item>", &span.content);

            let model = self.embedding_provider.base_model();
            let document_content = model.truncate(
                &document_content,
                model.capacity()?,
                TruncationDirection::End,
            )?;
            let token_count = model.count_tokens(&document_content)?;

            span.content = document_content;
            span.token_count = token_count;
        }
        Ok(spans)
    }

    pub fn parse_file(&mut self, content: &str, language: Arc<Language>) -> Result<Vec<Span>> {
        let grammar = language
            .grammar()
            .ok_or_else(|| anyhow!("no grammar for language"))?;

        // Iterate through query matches
        let matches = self.get_matches_in_file(content, grammar)?;

        let language_scope = language.default_scope();
        let placeholder = language_scope.collapsed_placeholder();

        let mut spans = Vec::new();
        let mut collapsed_ranges_within = Vec::new();
        let mut parsed_name_ranges = HashSet::new();
        for (i, context_match) in matches.iter().enumerate() {
            // Items which are collapsible but not embeddable have no item range
            let item_range = if let Some(item_range) = context_match.item_range.clone() {
                item_range
            } else {
                continue;
            };

            // Checks for deduplication
            let name;
            if let Some(name_range) = context_match.name_range.clone() {
                name = content
                    .get(name_range.clone())
                    .map_or(String::new(), |s| s.to_string());
                if parsed_name_ranges.contains(&name_range) {
                    continue;
                }
                parsed_name_ranges.insert(name_range);
            } else {
                name = String::new();
            }

            collapsed_ranges_within.clear();
            'outer: for remaining_match in &matches[(i + 1)..] {
                for collapsed_range in &remaining_match.collapse_ranges {
                    if item_range.start <= collapsed_range.start
                        && item_range.end >= collapsed_range.end
                    {
                        collapsed_ranges_within.push(collapsed_range.clone());
                    } else {
                        break 'outer;
                    }
                }
            }

            collapsed_ranges_within.sort_by_key(|r| (r.start, Reverse(r.end)));

            let mut span_content = String::new();
            for context_range in &context_match.context_ranges {
                add_content_from_range(
                    &mut span_content,
                    content,
                    context_range.clone(),
                    context_match.start_col,
                );
                span_content.push_str("\n");
            }

            let mut offset = item_range.start;
            for collapsed_range in &collapsed_ranges_within {
                if collapsed_range.start > offset {
                    add_content_from_range(
                        &mut span_content,
                        content,
                        offset..collapsed_range.start,
                        context_match.start_col,
                    );
                    offset = collapsed_range.start;
                }

                if collapsed_range.end > offset {
                    span_content.push_str(placeholder);
                    offset = collapsed_range.end;
                }
            }

            if offset < item_range.end {
                add_content_from_range(
                    &mut span_content,
                    content,
                    offset..item_range.end,
                    context_match.start_col,
                );
            }

            let sha1 = SpanDigest::from(span_content.as_str());
            spans.push(Span {
                name,
                content: span_content,
                range: item_range.clone(),
                embedding: None,
                digest: sha1,
                token_count: 0,
            })
        }

        return Ok(spans);
    }
}

pub(crate) fn subtract_ranges(
    ranges: &[Range<usize>],
    ranges_to_subtract: &[Range<usize>],
) -> Vec<Range<usize>> {
    let mut result = Vec::new();

    let mut ranges_to_subtract = ranges_to_subtract.iter().peekable();

    for range in ranges {
        let mut offset = range.start;

        while offset < range.end {
            if let Some(range_to_subtract) = ranges_to_subtract.peek() {
                if offset < range_to_subtract.start {
                    let next_offset = cmp::min(range_to_subtract.start, range.end);
                    result.push(offset..next_offset);
                    offset = next_offset;
                } else {
                    let next_offset = cmp::min(range_to_subtract.end, range.end);
                    offset = next_offset;
                }

                if offset >= range_to_subtract.end {
                    ranges_to_subtract.next();
                }
            } else {
                result.push(offset..range.end);
                offset = range.end;
            }
        }
    }

    result
}
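// A test sketch pinning down `subtract_ranges` (not part of this commit; the
// iterator walk above assumes both slices are sorted and non-overlapping):
#[cfg(test)]
mod subtract_ranges_tests {
    use super::subtract_ranges;

    #[test]
    fn keeps_uncovered_portions() {
        // Carving [3, 5) out of [0, 10) leaves the two flanking pieces.
        assert_eq!(subtract_ranges(&[0..10], &[3..5]), vec![0..3, 5..10]);
        // A subtracted range that misses the input leaves the range intact.
        assert_eq!(subtract_ranges(&[0..4], &[10..12]), vec![0..4]);
    }
}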
fn add_content_from_range(
    output: &mut String,
    content: &str,
    range: Range<usize>,
    start_col: usize,
) {
    for mut line in content.get(range.clone()).unwrap_or("").lines() {
        // Strip up to `start_col` columns of leading indentation from each line.
        for _ in 0..start_col {
            if line.starts_with(' ') {
                line = &line[1..];
            } else {
                break;
            }
        }
        output.push_str(line);
        output.push('\n');
    }
    output.pop();
}
1280  crates/semantic_index2/src/semantic_index.rs  Normal file
File diff suppressed because it is too large
28  crates/semantic_index2/src/semantic_index_settings.rs  Normal file
@@ -0,0 +1,28 @@
use anyhow;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use settings::Settings;

#[derive(Deserialize, Debug)]
pub struct SemanticIndexSettings {
    pub enabled: bool,
}

#[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug)]
pub struct SemanticIndexSettingsContent {
    pub enabled: Option<bool>,
}

impl Settings for SemanticIndexSettings {
    const KEY: Option<&'static str> = Some("semantic_index");

    type FileContent = SemanticIndexSettingsContent;

    fn load(
        default_value: &Self::FileContent,
        user_values: &[&Self::FileContent],
        _: &mut gpui::AppContext,
    ) -> anyhow::Result<Self> {
        Self::load_via_json_merge(default_value, user_values)
    }
}
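Given `KEY` above, user configuration for this crate nests under a `semantic_index` object; the corresponding entry in a user's settings JSON would plausibly look like this (a sketch; defaults are merged in via `load_via_json_merge`):

```json
{
  "semantic_index": {
    "enabled": true
  }
}
```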
1697  crates/semantic_index2/src/semantic_index_tests.rs  Normal file
File diff suppressed because it is too large
@@ -3942,8 +3942,6 @@ impl std::fmt::Debug for OpenPaths {
    }
}

pub struct WorkspaceCreated(pub WeakView<Workspace>);

pub fn activate_workspace_for_project(
    cx: &mut AppContext,
    predicate: impl Fn(&Project, &AppContext) -> bool + Send + 'static,