From 43ad470e587c1c4c3b373200b2fa07e0d3080ee5 Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Thu, 2 May 2024 12:28:21 -0700 Subject: [PATCH] Use outline queries to chunk files syntactically (#11283) This chunking strategy uses the existing `outline` query to chunk files. We try to find chunk boundaries that are: * at starts or ends of lines * nested within as few outline items as possible Release Notes: - N/A --- Cargo.lock | 2 + crates/language/src/language.rs | 13 +- crates/language/src/syntax_map.rs | 4 +- crates/semantic_index/Cargo.toml | 2 + crates/semantic_index/src/chunking.rs | 654 ++++++++++++-------------- 5 files changed, 322 insertions(+), 353 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e88e4f9495..cdc8e54e28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8704,6 +8704,8 @@ dependencies = [ "sha2 0.10.7", "smol", "tempfile", + "tree-sitter", + "unindent", "util", "worktree", ] diff --git a/crates/language/src/language.rs b/crates/language/src/language.rs index 02e864964d..cd8c18c85e 100644 --- a/crates/language/src/language.rs +++ b/crates/language/src/language.rs @@ -55,10 +55,10 @@ use std::{ Arc, }, }; -use syntax_map::SyntaxSnapshot; +use syntax_map::{QueryCursorHandle, SyntaxSnapshot}; pub use task_context::{BasicContextProvider, ContextProvider, ContextProviderWithTasks}; use theme::SyntaxTheme; -use tree_sitter::{self, wasmtime, Query, WasmStore}; +use tree_sitter::{self, wasmtime, Query, QueryCursor, WasmStore}; use util::http::HttpClient; pub use buffer::Operation; @@ -101,6 +101,15 @@ where }) } +pub fn with_query_cursor(func: F) -> R +where + F: FnOnce(&mut QueryCursor) -> R, +{ + use std::ops::DerefMut; + let mut cursor = QueryCursorHandle::new(); + func(cursor.deref_mut()) +} + lazy_static! { static ref NEXT_LANGUAGE_ID: AtomicUsize = Default::default(); static ref NEXT_GRAMMAR_ID: AtomicUsize = Default::default(); diff --git a/crates/language/src/syntax_map.rs b/crates/language/src/syntax_map.rs index 47c871f9f9..35d6fbd460 100644 --- a/crates/language/src/syntax_map.rs +++ b/crates/language/src/syntax_map.rs @@ -211,7 +211,7 @@ struct TextProvider<'a>(&'a Rope); struct ByteChunks<'a>(text::Chunks<'a>); -struct QueryCursorHandle(Option); +pub(crate) struct QueryCursorHandle(Option); impl SyntaxMap { pub fn new() -> Self { @@ -1739,7 +1739,7 @@ impl<'a> Iterator for ByteChunks<'a> { } impl QueryCursorHandle { - pub(crate) fn new() -> Self { + pub fn new() -> Self { let mut cursor = QUERY_CURSORS.lock().pop().unwrap_or_else(QueryCursor::new); cursor.set_match_limit(64); QueryCursorHandle(Some(cursor)) diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index a23f7853de..208204e76e 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -37,7 +37,9 @@ serde.workspace = true serde_json.workspace = true sha2.workspace = true smol.workspace = true +tree-sitter.workspace = true util. workspace = true +unindent.workspace = true worktree.workspace = true [dev-dependencies] diff --git a/crates/semantic_index/src/chunking.rs b/crates/semantic_index/src/chunking.rs index 9918bb1d2c..ec29b0bcab 100644 --- a/crates/semantic_index/src/chunking.rs +++ b/crates/semantic_index/src/chunking.rs @@ -1,9 +1,24 @@ -use language::{with_parser, Grammar, Tree}; +use language::{with_parser, with_query_cursor, Grammar}; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use std::{cmp, ops::Range, sync::Arc}; +use std::{ + cmp::{self, Reverse}, + ops::Range, + sync::Arc, +}; +use tree_sitter::QueryCapture; +use util::ResultExt as _; -const CHUNK_THRESHOLD: usize = 1500; +#[derive(Copy, Clone)] +struct ChunkSizeRange { + min: usize, + max: usize, +} + +const CHUNK_SIZE_RANGE: ChunkSizeRange = ChunkSizeRange { + min: 1024, + max: 8192, +}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Chunk { @@ -12,193 +27,318 @@ pub struct Chunk { } pub fn chunk_text(text: &str, grammar: Option<&Arc>) -> Vec { + chunk_text_with_size_range(text, grammar, CHUNK_SIZE_RANGE) +} + +fn chunk_text_with_size_range( + text: &str, + grammar: Option<&Arc>, + size_config: ChunkSizeRange, +) -> Vec { + let mut syntactic_ranges = Vec::new(); + if let Some(grammar) = grammar { - let tree = with_parser(|parser| { - parser - .set_language(&grammar.ts_language) - .expect("incompatible grammar"); - parser.parse(&text, None).expect("invalid language") - }); + if let Some(outline) = grammar.outline_config.as_ref() { + let tree = with_parser(|parser| { + parser.set_language(&grammar.ts_language).log_err()?; + parser.parse(&text, None) + }); - chunk_parse_tree(tree, &text, CHUNK_THRESHOLD) - } else { - chunk_lines(&text) - } -} - -fn chunk_parse_tree(tree: Tree, text: &str, chunk_threshold: usize) -> Vec { - let mut chunk_ranges = Vec::new(); - let mut cursor = tree.walk(); - - let mut range = 0..0; - loop { - let node = cursor.node(); - - // If adding the node to the current chunk exceeds the threshold - if node.end_byte() - range.start > chunk_threshold { - // Try to descend into its first child. If we can't, flush the current - // range and try again. - if cursor.goto_first_child() { - continue; - } else if !range.is_empty() { - chunk_ranges.push(range.clone()); - range.start = range.end; - continue; - } - - // If we get here, the node itself has no children but is larger than the threshold. - // Break its text into arbitrary chunks. - split_text(text, range.clone(), node.end_byte(), &mut chunk_ranges); - } - range.end = node.end_byte(); - - // If we get here, we consumed the node. Advance to the next child, ascending if there isn't one. - while !cursor.goto_next_sibling() { - if !cursor.goto_parent() { - if !range.is_empty() { - chunk_ranges.push(range); - } - - return chunk_ranges - .into_iter() - .map(|range| { - let digest = Sha256::digest(&text[range.clone()]).into(); - Chunk { range, digest } - }) - .collect(); + if let Some(tree) = tree { + with_query_cursor(|cursor| { + // Retrieve a list of ranges of outline items (types, functions, etc) in the document. + // Omit single-line outline items (e.g. struct fields, constant declarations), because + // we'll already be attempting to split on lines. + syntactic_ranges = cursor + .matches(&outline.query, tree.root_node(), text.as_bytes()) + .filter_map(|mat| { + mat.captures + .iter() + .find_map(|QueryCapture { node, index }| { + if *index == outline.item_capture_ix { + if node.end_position().row > node.start_position().row { + return Some(node.byte_range()); + } + } + None + }) + }) + .collect::>(); + syntactic_ranges + .sort_unstable_by_key(|range| (range.start, Reverse(range.end))); + }); } } } + + chunk_text_with_syntactic_ranges(text, &syntactic_ranges, size_config) } -fn chunk_lines(text: &str) -> Vec { - let mut chunk_ranges = Vec::new(); +fn chunk_text_with_syntactic_ranges( + text: &str, + mut syntactic_ranges: &[Range], + size_config: ChunkSizeRange, +) -> Vec { + let mut chunks = Vec::new(); let mut range = 0..0; + let mut range_end_nesting_depth = 0; - let mut newlines = text.match_indices('\n').peekable(); - while let Some((newline_ix, _)) = newlines.peek() { - let newline_ix = newline_ix + 1; - if newline_ix - range.start <= CHUNK_THRESHOLD { - range.end = newline_ix; - newlines.next(); + // Try to split the text at line boundaries. + let mut line_ixs = text + .match_indices('\n') + .map(|(ix, _)| ix + 1) + .chain(if text.ends_with('\n') { + None } else { + Some(text.len()) + }) + .peekable(); + + while let Some(&line_ix) = line_ixs.peek() { + // If the current position is beyond the maximum chunk size, then + // start a new chunk. + if line_ix - range.start > size_config.max { if range.is_empty() { - split_text(text, range, newline_ix, &mut chunk_ranges); - range = newline_ix..newline_ix; + range.end = cmp::min(range.start + size_config.max, line_ix); + while !text.is_char_boundary(range.end) { + range.end -= 1; + } + } + + chunks.push(Chunk { + range: range.clone(), + digest: Sha256::digest(&text[range.clone()]).into(), + }); + range_end_nesting_depth = 0; + range.start = range.end; + continue; + } + + // Discard any syntactic ranges that end before the current position. + while let Some(first_item) = syntactic_ranges.first() { + if first_item.end < line_ix { + syntactic_ranges = &syntactic_ranges[1..]; + continue; } else { - chunk_ranges.push(range.clone()); - range.start = range.end; + break; } } + + // Count how many syntactic ranges contain the current position. + let mut nesting_depth = 0; + for range in syntactic_ranges { + if range.start > line_ix { + break; + } + if range.start < line_ix && range.end > line_ix { + nesting_depth += 1; + } + } + + // Extend the current range to this position, unless an earlier candidate + // end position was less nested syntactically. + if range.len() < size_config.min || nesting_depth <= range_end_nesting_depth { + range.end = line_ix; + range_end_nesting_depth = nesting_depth; + } + + line_ixs.next(); } if !range.is_empty() { - chunk_ranges.push(range); - } - - chunk_ranges - .into_iter() - .map(|range| Chunk { + chunks.push(Chunk { + range: range.clone(), digest: Sha256::digest(&text[range.clone()]).into(), - range, - }) - .collect() -} - -fn split_text( - text: &str, - mut range: Range, - max_end: usize, - chunk_ranges: &mut Vec>, -) { - while range.start < max_end { - range.end = cmp::min(range.start + CHUNK_THRESHOLD, max_end); - while !text.is_char_boundary(range.end) { - range.end -= 1; - } - chunk_ranges.push(range.clone()); - range.start = range.end; + }); } + + chunks } #[cfg(test)] mod tests { use super::*; use language::{tree_sitter_rust, Language, LanguageConfig, LanguageMatcher}; + use unindent::Unindent as _; - // This example comes from crates/gpui/examples/window_positioning.rs which - // has the property of being CHUNK_THRESHOLD < TEXT.len() < 2*CHUNK_THRESHOLD - static TEXT: &str = r#" - use gpui::*; + #[test] + fn test_chunk_text_with_syntax() { + let language = rust_language(); - struct WindowContent { - text: SharedString, + let text = " + struct Person { + first_name: String, + last_name: String, + age: u32, + } + + impl Person { + fn new(first_name: String, last_name: String, age: u32) -> Self { + Self { first_name, last_name, age } + } + + fn first_name(&self) -> &str { + &self.first_name + } + + fn last_name(&self) -> &str { + &self.last_name + } + + fn age(&self) -> usize { + self.ages + } + } + " + .unindent(); + + let chunks = chunk_text_with_size_range( + &text, + language.grammar(), + ChunkSizeRange { + min: text.find('}').unwrap(), + max: text.find("Self {").unwrap(), + }, + ); + + // The entire impl cannot fit in a chunk, so it is split. + // Within the impl, two methods can fit in a chunk. + assert_chunks( + &text, + &chunks, + &[ + "struct Person {", // ... + "impl Person {", + " fn first_name", + " fn age", + ], + ); + + let text = " + struct T {} + struct U {} + struct V {} + struct W { + a: T, + b: U, + } + " + .unindent(); + + let chunks = chunk_text_with_size_range( + &text, + language.grammar(), + ChunkSizeRange { + min: text.find('{').unwrap(), + max: text.find('V').unwrap(), + }, + ); + + // Two single-line structs can fit in a chunk. + // The last struct cannot fit in a chunk together + // with the previous single-line struct. + assert_chunks( + &text, + &chunks, + &[ + "struct T", // ... + "struct V", // ... + "struct W", // ... + "}", + ], + ); } - impl Render for WindowContent { - fn render(&mut self, _cx: &mut ViewContext) -> impl IntoElement { - div() - .flex() - .bg(rgb(0x1e2025)) - .size_full() - .justify_center() - .items_center() - .text_xl() - .text_color(rgb(0xffffff)) - .child(self.text.clone()) + #[test] + fn test_chunk_with_long_lines() { + let language = rust_language(); + + let text = " + struct S { a: u32 } + struct T { a: u64 } + struct U { a: u64, b: u64, c: u64, d: u64, e: u64, f: u64, g: u64, h: u64, i: u64, j: u64 } + struct W { a: u64, b: u64, c: u64, d: u64, e: u64, f: u64, g: u64, h: u64, i: u64, j: u64 } + " + .unindent(); + + let chunks = chunk_text_with_size_range( + &text, + language.grammar(), + ChunkSizeRange { min: 32, max: 64 }, + ); + + // The line is too long to fit in one chunk + assert_chunks( + &text, + &chunks, + &[ + "struct S {", // ... + "struct U", + "4, h: u64, i: u64", // ... + "struct W", + "4, h: u64, i: u64", // ... + ], + ); + } + + #[track_caller] + fn assert_chunks(text: &str, chunks: &[Chunk], expected_chunk_text_prefixes: &[&str]) { + check_chunk_invariants(text, chunks); + + assert_eq!( + chunks.len(), + expected_chunk_text_prefixes.len(), + "unexpected number of chunks: {chunks:?}", + ); + + let mut prev_chunk_end = 0; + for (ix, chunk) in chunks.iter().enumerate() { + let expected_prefix = expected_chunk_text_prefixes[ix]; + let chunk_text = &text[chunk.range.clone()]; + if !chunk_text.starts_with(expected_prefix) { + let chunk_prefix_offset = text[prev_chunk_end..].find(expected_prefix); + if let Some(chunk_prefix_offset) = chunk_prefix_offset { + panic!( + "chunk {ix} starts at unexpected offset {}. expected {}", + chunk.range.start, + chunk_prefix_offset + prev_chunk_end + ); + } else { + panic!("invalid expected chunk prefix {ix}: {expected_prefix:?}"); + } + } + prev_chunk_end = chunk.range.end; } } - fn main() { - App::new().run(|cx: &mut AppContext| { - // Create several new windows, positioned in the top right corner of each screen - - for screen in cx.displays() { - let options = { - let popup_margin_width = DevicePixels::from(16); - let popup_margin_height = DevicePixels::from(-0) - DevicePixels::from(48); - - let window_size = Size { - width: px(400.), - height: px(72.), - }; - - let screen_bounds = screen.bounds(); - let size: Size = window_size.into(); - - let bounds = gpui::Bounds:: { - origin: screen_bounds.upper_right() - - point(size.width + popup_margin_width, popup_margin_height), - size: window_size.into(), - }; - - WindowOptions { - // Set the bounds of the window in screen coordinates - bounds: Some(bounds), - // Specify the display_id to ensure the window is created on the correct screen - display_id: Some(screen.id()), - - titlebar: None, - window_background: WindowBackgroundAppearance::default(), - focus: false, - show: true, - kind: WindowKind::PopUp, - is_movable: false, - fullscreen: false, - app_id: None, - } - }; - - cx.open_window(options, |cx| { - cx.new_view(|_| WindowContent { - text: format!("{:?}", screen.id()).into(), - }) - }); + #[track_caller] + fn check_chunk_invariants(text: &str, chunks: &[Chunk]) { + for (ix, chunk) in chunks.iter().enumerate() { + if ix > 0 && chunk.range.start != chunks[ix - 1].range.end { + panic!("chunk ranges are not contiguous: {:?}", chunks); } - }); - }"#; + } - fn setup_rust_language() -> Language { + if text.is_empty() { + assert!(chunks.is_empty()) + } else if chunks.first().unwrap().range.start != 0 + || chunks.last().unwrap().range.end != text.len() + { + panic!("chunks don't cover entire text {:?}", chunks); + } + } + + #[test] + fn test_chunk_text() { + let text = "a\n".repeat(1000); + let chunks = chunk_text(&text, None); + assert_eq!( + chunks.len(), + ((2000_f64) / (CHUNK_SIZE_RANGE.max as f64)).ceil() as usize + ); + } + + fn rust_language() -> Language { Language::new( LanguageConfig { name: "Rust".into(), @@ -210,198 +350,14 @@ mod tests { }, Some(tree_sitter_rust::language()), ) - } - - #[test] - fn test_chunk_text() { - let text = "a\n".repeat(1000); - let chunks = chunk_text(&text, None); - assert_eq!( - chunks.len(), - ((2000_f64) / (CHUNK_THRESHOLD as f64)).ceil() as usize - ); - } - - #[test] - fn test_chunk_text_grammar() { - // Let's set up a big text with some known segments - // We'll then chunk it and verify that the chunks are correct - - let language = setup_rust_language(); - - let chunks = chunk_text(TEXT, language.grammar()); - assert_eq!(chunks.len(), 2); - - assert_eq!(chunks[0].range.start, 0); - assert_eq!(chunks[0].range.end, 1498); - // The break between chunks is right before the "Specify the display_id" comment - - assert_eq!(chunks[1].range.start, 1498); - assert_eq!(chunks[1].range.end, 2434); - } - - #[test] - fn test_chunk_parse_tree() { - let language = setup_rust_language(); - let grammar = language.grammar().unwrap(); - - let tree = with_parser(|parser| { - parser - .set_language(&grammar.ts_language) - .expect("incompatible grammar"); - parser.parse(TEXT, None).expect("invalid language") - }); - - let chunks = chunk_parse_tree(tree, TEXT, 250); - assert_eq!(chunks.len(), 11); - } - - #[test] - fn test_chunk_unparsable() { - // Even if a chunk is unparsable, we should still be able to chunk it - let language = setup_rust_language(); - let grammar = language.grammar().unwrap(); - - let text = r#"fn main() {"#; - let tree = with_parser(|parser| { - parser - .set_language(&grammar.ts_language) - .expect("incompatible grammar"); - parser.parse(text, None).expect("invalid language") - }); - - let chunks = chunk_parse_tree(tree, text, 250); - assert_eq!(chunks.len(), 1); - - assert_eq!(chunks[0].range.start, 0); - assert_eq!(chunks[0].range.end, 11); - } - - #[test] - fn test_empty_text() { - let language = setup_rust_language(); - let grammar = language.grammar().unwrap(); - - let tree = with_parser(|parser| { - parser - .set_language(&grammar.ts_language) - .expect("incompatible grammar"); - parser.parse("", None).expect("invalid language") - }); - - let chunks = chunk_parse_tree(tree, "", CHUNK_THRESHOLD); - assert!(chunks.is_empty(), "Chunks should be empty for empty text"); - } - - #[test] - fn test_single_large_node() { - let large_text = "static ".to_owned() + "a".repeat(CHUNK_THRESHOLD - 1).as_str() + " = 2"; - - let language = setup_rust_language(); - let grammar = language.grammar().unwrap(); - - let tree = with_parser(|parser| { - parser - .set_language(&grammar.ts_language) - .expect("incompatible grammar"); - parser.parse(&large_text, None).expect("invalid language") - }); - - let chunks = chunk_parse_tree(tree, &large_text, CHUNK_THRESHOLD); - - assert_eq!( - chunks.len(), - 3, - "Large chunks are broken up according to grammar as best as possible" - ); - - // Expect chunks to be static, aaaaaa..., and = 2 - assert_eq!(chunks[0].range.start, 0); - assert_eq!(chunks[0].range.end, "static".len()); - - assert_eq!(chunks[1].range.start, "static".len()); - assert_eq!(chunks[1].range.end, "static".len() + CHUNK_THRESHOLD); - - assert_eq!(chunks[2].range.start, "static".len() + CHUNK_THRESHOLD); - assert_eq!(chunks[2].range.end, large_text.len()); - } - - #[test] - fn test_multiple_small_nodes() { - let small_text = "a b c d e f g h i j k l m n o p q r s t u v w x y z"; - let language = setup_rust_language(); - let grammar = language.grammar().unwrap(); - - let tree = with_parser(|parser| { - parser - .set_language(&grammar.ts_language) - .expect("incompatible grammar"); - parser.parse(small_text, None).expect("invalid language") - }); - - let chunks = chunk_parse_tree(tree, small_text, 5); - assert!( - chunks.len() > 1, - "Should have multiple chunks for multiple small nodes" - ); - } - - #[test] - fn test_node_with_children() { - let nested_text = "fn main() { let a = 1; let b = 2; }"; - let language = setup_rust_language(); - let grammar = language.grammar().unwrap(); - - let tree = with_parser(|parser| { - parser - .set_language(&grammar.ts_language) - .expect("incompatible grammar"); - parser.parse(nested_text, None).expect("invalid language") - }); - - let chunks = chunk_parse_tree(tree, nested_text, 10); - assert!( - chunks.len() > 1, - "Should have multiple chunks for a node with children" - ); - } - - #[test] - fn test_text_with_unparsable_sections() { - // This test uses purposefully hit-or-miss sizing of 11 characters per likely chunk - let mixed_text = "fn main() { let a = 1; let b = 2; } unparsable bits here"; - let language = setup_rust_language(); - let grammar = language.grammar().unwrap(); - - let tree = with_parser(|parser| { - parser - .set_language(&grammar.ts_language) - .expect("incompatible grammar"); - parser.parse(mixed_text, None).expect("invalid language") - }); - - let chunks = chunk_parse_tree(tree, mixed_text, 11); - assert!( - chunks.len() > 1, - "Should handle both parsable and unparsable sections correctly" - ); - - let expected_chunks = [ - "fn main() {", - " let a = 1;", - " let b = 2;", - " }", - " unparsable", - " bits here", - ]; - - for (i, chunk) in chunks.iter().enumerate() { - assert_eq!( - &mixed_text[chunk.range.clone()], - expected_chunks[i], - "Chunk {} should match", - i - ); - } + .with_outline_query( + " + (function_item name: (_) @name) @item + (impl_item type: (_) @name) @item + (struct_item name: (_) @name) @item + (field_declaration name: (_) @name) @item + ", + ) + .unwrap() } }