Fix git link issue

This commit is contained in:
SilasMarvin 2024-06-22 10:14:06 -07:00
parent 2be85968e4
commit 11e8cf819e
10 changed files with 514 additions and 2 deletions

@ -1 +0,0 @@
Subproject commit 37a2e98cce5a1b39f07aec7e5b3bc75eebb41ac2

View File

@ -0,0 +1,16 @@
[package]
name = "splitter-tree-sitter"
version = "0.1.0"
edition.workspace = true
[dependencies]
thiserror = "1.0.61"
tree-sitter = "0.22"
[dev-dependencies]
tree-sitter-rust = "0.21"
tree-sitter-zig = { git = "https://github.com/maxxnino/tree-sitter-zig" }
[build-dependencies]
cc="*"

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Silas Marvin
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,3 @@
# splitter-tree-sitter
This is a code splitter that utilizes Tree-sitter to split code.

View File

@ -0,0 +1,326 @@
use thiserror::Error;
use tree_sitter::{Tree, TreeCursor};
/// Error returned by `TreeSitterCodeSplitter::new` when the requested
/// configuration is rejected.
#[derive(Error, Debug)]
pub enum NewError {
/// The requested `chunk_overlap` is too large relative to `chunk_size`.
#[error("chunk_size must be greater than chunk_overlap")]
SizeOverlapError,
}
/// Error returned while splitting source text into chunks.
#[derive(Error, Debug)]
pub enum SplitError {
/// A byte range of the input could not be decoded as UTF-8.
/// Converted automatically from `core::str::Utf8Error` by the `?` operator.
#[error("converting utf8 to str")]
Utf8Error(#[from] core::str::Utf8Error),
}
/// Splits source code into chunks, guided by a Tree-sitter syntax tree.
pub struct TreeSitterCodeSplitter {
/// Upper bound used when deciding whether a piece of code fits in one chunk.
chunk_size: usize,
/// How far consecutive pieces overlap when a single childless node has to
/// be cut into several chunks.
chunk_overlap: usize,
}
/// A half-open byte range (`start_byte..end_byte`) into the source buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ByteRange {
    /// Offset of the first byte covered by the range.
    pub start_byte: usize,
    /// Offset one past the last byte covered by the range.
    pub end_byte: usize,
}

impl ByteRange {
    /// Builds a range covering `start_byte..end_byte`.
    fn new(start_byte: usize, end_byte: usize) -> Self {
        Self {
            start_byte,
            end_byte,
        }
    }
}

/// A slice of the source text together with the byte range it was taken from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk<'a> {
    /// The chunk's text, borrowed from the source buffer.
    pub text: &'a str,
    /// Where `text` sits inside the source buffer.
    pub range: ByteRange,
}

impl<'a> Chunk<'a> {
    /// Pairs a text slice with the byte range it occupies in the source.
    fn new(text: &'a str, range: ByteRange) -> Self {
        Self { text, range }
    }
}
impl TreeSitterCodeSplitter {
    /// Creates a splitter that aims for chunks of at most `chunk_size` and
    /// lets hard-split pieces overlap by `chunk_overlap` bytes.
    ///
    /// # Errors
    ///
    /// Returns [`NewError::SizeOverlapError`] unless `chunk_size` is strictly
    /// greater than `chunk_overlap`.
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self, NewError> {
        // `>=`, not `>`: hard-splitting advances by `chunk_size - chunk_overlap`
        // bytes per step, so equal values would give a stride of zero and a
        // loop that never terminates.
        if chunk_overlap >= chunk_size {
            return Err(NewError::SizeOverlapError);
        }
        Ok(Self {
            chunk_size,
            chunk_overlap,
        })
    }

    /// Splits `utf8` into chunks that follow the structure of `tree`.
    ///
    /// The tree is first broken into pieces that fit `chunk_size`; neighbouring
    /// pieces are then merged again, walking from the end of the file towards
    /// the start, while the sum of their text lengths stays below `chunk_size`.
    /// A merged chunk spans from the start of the earlier piece to the end of
    /// the later one, so it also contains whatever bytes lie between them.
    ///
    /// `utf8` is expected to be the exact buffer `tree` was parsed from, since
    /// node byte offsets are used to index into it.
    ///
    /// # Errors
    ///
    /// Returns [`SplitError::Utf8Error`] if a node's byte range is not valid
    /// UTF-8.
    pub fn split<'c>(&self, tree: &Tree, utf8: &'c [u8]) -> Result<Vec<Chunk<'c>>, SplitError> {
        let pieces = self.split_recursive(tree.walk(), utf8)?;

        // Built back-to-front: `merged.last()` is always the chunk that comes
        // right after `current` in the source.
        let mut merged: Vec<Chunk<'c>> = Vec::new();
        for current in pieces.into_iter().rev() {
            match merged.pop() {
                Some(last) if last.text.len() + current.text.len() < self.chunk_size => {
                    let start_byte = current.range.start_byte;
                    let end_byte = last.range.end_byte;
                    let text = std::str::from_utf8(&utf8[start_byte..end_byte])?;
                    merged.push(Chunk::new(text, ByteRange::new(start_byte, end_byte)));
                }
                Some(last) => {
                    merged.push(last);
                    merged.push(current);
                }
                None => merged.push(current),
            }
        }

        // Restore source order.
        merged.reverse();
        Ok(merged)
    }

    /// Chunks the node under `cursor` and every sibling that follows it.
    ///
    /// For each node there are three cases:
    /// 1. its text has at most `chunk_size` characters: emit it whole;
    /// 2. otherwise, if it has children: recurse into them;
    /// 3. otherwise: cut the node's text into overlapping pieces.
    ///
    /// Siblings are visited with a loop rather than by recursion so that the
    /// call depth follows the depth of the tree, not the number of siblings.
    fn split_recursive<'c>(
        &self,
        mut cursor: TreeCursor<'_>,
        utf8: &'c [u8],
    ) -> Result<Vec<Chunk<'c>>, SplitError> {
        let mut out = Vec::new();
        loop {
            let node = cursor.node();
            let text = node.utf8_text(utf8)?;
            if text.chars().count() <= self.chunk_size {
                let range = node.range();
                out.push(Chunk::new(
                    text,
                    ByteRange::new(range.start_byte, range.end_byte),
                ));
            } else {
                // Descend on a copy so `cursor` stays on the current node and
                // can still move on to the next sibling afterwards.
                let mut child_cursor = cursor.clone();
                if child_cursor.goto_first_child() {
                    out.extend(self.split_recursive(child_cursor, utf8)?);
                } else {
                    out.extend(self.split_leaf(text, node.range().start_byte));
                }
            }
            if !cursor.goto_next_sibling() {
                break;
            }
        }
        Ok(out)
    }

    /// Cuts the text of a childless node into pieces of at most `chunk_size`
    /// bytes, each starting `chunk_size - chunk_overlap` bytes after the
    /// previous one. `start_byte` is the offset of `text` in the source.
    ///
    /// Cut points are moved back to the nearest character boundary so that
    /// multi-byte UTF-8 text cannot cause a slicing panic. Every step still
    /// advances by at least one whole character.
    fn split_leaf<'c>(&self, text: &'c str, start_byte: usize) -> Vec<Chunk<'c>> {
        // At least 1, because `new` rejects `chunk_overlap >= chunk_size`.
        let stride = self.chunk_size - self.chunk_overlap;

        let mut chunks = Vec::new();
        let mut rest = text;
        let mut rest_start = start_byte;
        loop {
            // `<=`: a remainder of exactly `chunk_size` bytes is the final
            // piece; going round again would only emit a redundant tail.
            if rest.len() <= self.chunk_size {
                chunks.push(Chunk::new(
                    rest,
                    ByteRange::new(rest_start, rest_start + rest.len()),
                ));
                break;
            }

            // `rest` is non-empty here, so a first character always exists.
            let first_char_len = rest.chars().next().map_or(1, char::len_utf8);
            let take = Self::char_floor(rest, self.chunk_size).max(first_char_len);
            let advance = Self::char_floor(rest, stride).max(first_char_len);

            chunks.push(Chunk::new(
                &rest[..take],
                ByteRange::new(rest_start, rest_start + take),
            ));
            rest = &rest[advance..];
            rest_start += advance;
        }
        chunks
    }

    /// Returns the largest character boundary in `text` that is `<= index`.
    fn char_floor(text: &str, index: usize) -> usize {
        let mut index = index.min(text.len());
        // Offset 0 is always a boundary, so this terminates.
        while !text.is_char_boundary(index) {
            index -= 1;
        }
        index
    }
}
// Unit tests: parse a small source file with Tree-sitter, run the splitter
// over it and compare the resulting chunks with the expected text.
#[cfg(test)]
mod tests {
use super::*;
use tree_sitter::Parser;
// Splits a small Rust file with chunk_size 128 and no overlap.
#[test]
fn test_split_rust() {
let splitter = TreeSitterCodeSplitter::new(128, 0).unwrap();
let mut parser = Parser::new();
parser
.set_language(&tree_sitter_rust::language())
.expect("Error loading Rust grammar");
let source_code = r#"
#[derive(Debug)]
struct Rectangle {
width: u32,
height: u32,
}
impl Rectangle {
fn area(&self) -> u32 {
self.width * self.height
}
}
fn main() {
let rect1 = Rectangle {
width: 30,
height: 50,
};
println!(
"The area of the rectangle is {} square pixels.",
rect1.area()
);
}
"#;
let tree = parser.parse(source_code, None).unwrap();
let chunks = splitter.split(&tree, source_code.as_bytes()).unwrap();
// The struct and the impl block each come out as one chunk.
assert_eq!(
chunks[0].text,
r#"#[derive(Debug)]
struct Rectangle {
width: u32,
height: u32,
}"#
);
assert_eq!(
chunks[1].text,
r#"impl Rectangle {
fn area(&self) -> u32 {
self.width * self.height
}
}"#
);
// `main` is expected to be spread over two chunks.
assert_eq!(
chunks[2].text,
r#"fn main() {
let rect1 = Rectangle {
width: 30,
height: 50,
};"#
);
assert_eq!(
chunks[3].text,
r#"println!(
"The area of the rectangle is {} square pixels.",
rect1.area()
);
}"#
);
}
// Splits a Zig source file with chunk_size 128 and an overlap of 10.
// NOTE(review): the source below is Zig, but the parser is configured with
// the Rust grammar (`tree_sitter_rust::language()`) and the `expect` message
// also says "Rust". The expected chunks presumably reflect that parse;
// confirm whether the `tree-sitter-zig` dev-dependency was meant to be used.
#[test]
fn test_split_zig() {
let splitter = TreeSitterCodeSplitter::new(128, 10).unwrap();
let mut parser = Parser::new();
parser
.set_language(&tree_sitter_rust::language())
.expect("Error loading Rust grammar");
let source_code = r#"
const std = @import("std");
const parseInt = std.fmt.parseInt;
std.debug.print("Here is a long string 1 ... Here is a long string 2 ... Here is a long string 3 ... Here is a long string 4 ... Here is a long string 5 ... Here is a long string 6 ... Here is a long string 7 ... Here is a long string 8 ... Here is a long string 9 ...", .{});
test "parse integers" {
const input = "123 67 89,99";
const ally = std.testing.allocator;
var list = std.ArrayList(u32).init(ally);
// Ensure the list is freed at scope exit.
// Try commenting out this line!
defer list.deinit();
var it = std.mem.tokenizeAny(u8, input, " ,");
while (it.next()) |num| {
const n = try parseInt(u32, num, 10);
try list.append(n);
}
const expected = [_]u32{ 123, 67, 89, 99 };
for (expected, list.items) |exp, actual| {
try std.testing.expectEqual(exp, actual);
}
}
"#;
let tree = parser.parse(source_code, None).unwrap();
let chunks = splitter.split(&tree, source_code.as_bytes()).unwrap();
assert_eq!(
chunks[0].text,
r#"const std = @import("std");
const parseInt = std.fmt.parseInt;
std.debug.print(""#
);
// Chunks 1 to 3 cover the long string contents; each one begins with the
// last 10 characters of the chunk before it.
assert_eq!(
chunks[1].text,
r#"Here is a long string 1 ... Here is a long string 2 ... Here is a long string 3 ... Here is a long string 4 ... Here is a long s"#
);
assert_eq!(
chunks[2].text,
r#"s a long string 5 ... Here is a long string 6 ... Here is a long string 7 ... Here is a long string 8 ... Here is a long string "#
);
assert_eq!(chunks[3].text, r#"ng string 9 ...", .{});"#);
// The remaining chunks cover the `test "parse integers"` block.
assert_eq!(
chunks[4].text,
r#"test "parse integers" {
const input = "123 67 89,99";
const ally = std.testing.allocator;
var list = std.ArrayList"#
);
assert_eq!(
chunks[5].text,
r#"(u32).init(ally);
// Ensure the list is freed at scope exit.
// Try commenting out this line!"#
);
assert_eq!(
chunks[6].text,
r#"defer list.deinit();
var it = std.mem.tokenizeAny(u8, input, " ,");
while (it.next()) |num"#
);
assert_eq!(
chunks[7].text,
r#"| {
const n = try parseInt(u32, num, 10);
try list.append(n);
}
const expected = [_]u32{ 123, 67, 89,"#
);
assert_eq!(
chunks[8].text,
r#"99 };
for (expected, list.items) |exp, actual| {
try std.testing.expectEqual(exp, actual);
}
}"#
);
}
}

@ -1 +0,0 @@
Subproject commit a38e7143bcab2412348fd92904cc5105117896a1

View File

@ -0,0 +1,34 @@
[package]
name = "utils-tree-sitter"
version = "0.1.0"
edition.workspace = true
[dependencies]
thiserror = "1.0.61"
tree-sitter = "0.22"
tree-sitter-bash = { version = "0.21", optional = true }
tree-sitter-c = { version = "0.21", optional = true }
tree-sitter-cpp = { version = "0.22", optional = true }
tree-sitter-c-sharp = { version = "0.21", optional = true }
tree-sitter-css = { version = "0.21", optional = true }
tree-sitter-elixir = { version = "0.2", optional = true }
tree-sitter-erlang = { version = "0.6", optional = true }
tree-sitter-go = { version = "0.21", optional = true }
tree-sitter-html = { version = "0.20", optional = true }
tree-sitter-java = { version = "0.21", optional = true }
tree-sitter-javascript = { version = "0.21", optional = true }
tree-sitter-json = { version = "0.21", optional = true }
tree-sitter-haskell = { version = "0.21", optional = true }
tree-sitter-lua = { version = "0.1.0", optional = true }
tree-sitter-ocaml = { version = "0.22.0", optional = true }
tree-sitter-python = { version = "0.21", optional = true }
tree-sitter-rust = { version = "0.21", optional = true }
tree-sitter-zig = { git = "https://github.com/maxxnino/tree-sitter-zig", optional = true }
[build-dependencies]
cc="*"
[features]
default = []
# One feature per grammar. The library gates each parser on
# `feature = "<name>"` (or `all`), and because the optional dependencies are
# referenced with the `dep:` prefix Cargo creates no implicit features for
# them, so each name has to be declared here to be selectable on its own.
bash = ["dep:tree-sitter-bash"]
c = ["dep:tree-sitter-c"]
cpp = ["dep:tree-sitter-cpp"]
c-sharp = ["dep:tree-sitter-c-sharp"]
css = ["dep:tree-sitter-css"]
elixir = ["dep:tree-sitter-elixir"]
erlang = ["dep:tree-sitter-erlang"]
go = ["dep:tree-sitter-go"]
haskell = ["dep:tree-sitter-haskell"]
html = ["dep:tree-sitter-html"]
java = ["dep:tree-sitter-java"]
javascript = ["dep:tree-sitter-javascript"]
json = ["dep:tree-sitter-json"]
lua = ["dep:tree-sitter-lua"]
ocaml = ["dep:tree-sitter-ocaml"]
python = ["dep:tree-sitter-python"]
rust = ["dep:tree-sitter-rust"]
zig = ["dep:tree-sitter-zig"]
# Enables every grammar at once.
all = ["bash", "c", "cpp", "c-sharp", "css", "elixir", "erlang", "go", "haskell", "html", "java", "javascript", "json", "lua", "ocaml", "python", "rust", "zig"]

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Silas Marvin
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,3 @@
# utils-tree-sitter
Utilities for working with Tree-sitter, such as obtaining a parser for a given file extension.

View File

@ -0,0 +1,90 @@
use thiserror::Error;
use tree_sitter::{LanguageError, Parser};
#[derive(Error, Debug)]
pub enum GetParserError {
#[error("no parser found for extension")]
NoParserFoundForExtension(String),
#[error("no parser found for extension")]
NoLanguageFoundForExtension(String),
#[error("loading grammer")]
LoadingGrammer(#[from] LanguageError),
}
/// Maps a file extension (without the leading dot) to the name of the
/// language it belongs to.
///
/// Despite its name, the lookup goes from extension to language.
///
/// # Errors
///
/// Returns [`GetParserError::NoLanguageFoundForExtension`], carrying the
/// extension, when the extension is not in the table.
fn get_extension_for_language(extension: &str) -> Result<String, GetParserError> {
    let language_name = match extension {
        "py" => "Python",
        "rs" => "Rust",
        "zig" => "Zig",
        "sh" => "Bash",
        "c" => "C",
        "cpp" => "C++",
        "cs" => "C#",
        "css" => "CSS",
        "ex" => "Elixir",
        "erl" => "Erlang",
        "go" => "Go",
        "html" => "HTML",
        "java" => "Java",
        "js" => "JavaScript",
        "json" => "JSON",
        "hs" => "Haskell",
        "lua" => "Lua",
        "ml" => "OCaml",
        unknown => {
            let err = GetParserError::NoLanguageFoundForExtension(unknown.to_owned());
            return Err(err);
        }
    };
    Ok(String::from(language_name))
}
pub fn get_parser_for_extension(extension: &str) -> Result<Parser, GetParserError> {
let language = get_extension_for_language(extension)?;
let mut parser = Parser::new();
match language.as_str() {
#[cfg(any(feature = "all", feature = "python"))]
"Python" => parser.set_language(&tree_sitter_python::language())?,
#[cfg(any(feature = "all", feature = "rust"))]
"Rust" => parser.set_language(&tree_sitter_rust::language())?,
#[cfg(any(feature = "all", feature = "zig"))]
"Zig" => parser.set_language(&tree_sitter_zig::language())?,
#[cfg(any(feature = "all", feature = "bash"))]
"Bash" => parser.set_language(&tree_sitter_bash::language())?,
#[cfg(any(feature = "all", feature = "c"))]
"C" => parser.set_language(&tree_sitter_c::language())?,
#[cfg(any(feature = "all", feature = "cpp"))]
"C++" => parser.set_language(&tree_sitter_cpp::language())?,
#[cfg(any(feature = "all", feature = "c-sharp"))]
"C#" => parser.set_language(&tree_sitter_c_sharp::language())?,
#[cfg(any(feature = "all", feature = "css"))]
"CSS" => parser.set_language(&tree_sitter_css::language())?,
#[cfg(any(feature = "all", feature = "elixir"))]
"Elixir" => parser.set_language(&tree_sitter_elixir::language())?,
#[cfg(any(feature = "all", feature = "erlang"))]
"Erlang" => parser.set_language(&tree_sitter_erlang::language())?,
#[cfg(any(feature = "all", feature = "go"))]
"Go" => parser.set_language(&tree_sitter_go::language())?,
#[cfg(any(feature = "all", feature = "html"))]
"HTML" => parser.set_language(&tree_sitter_html::language())?,
#[cfg(any(feature = "all", feature = "java"))]
"Java" => parser.set_language(&tree_sitter_java::language())?,
#[cfg(any(feature = "all", feature = "javascript"))]
"JavaScript" => parser.set_language(&tree_sitter_javascript::language())?,
#[cfg(any(feature = "all", feature = "json"))]
"JSON" => parser.set_language(&tree_sitter_json::language())?,
#[cfg(any(feature = "all", feature = "haskell"))]
"Haskell" => parser.set_language(&tree_sitter_haskell::language())?,
#[cfg(any(feature = "all", feature = "lua"))]
"Lua" => parser.set_language(&tree_sitter_lua::language())?,
#[cfg(any(feature = "all", feature = "ocaml"))]
"OCaml" => parser.set_language(&tree_sitter_ocaml::language_ocaml())?,
_ => {
return Err(GetParserError::NoParserFoundForExtension(
language.to_string(),
))
}
}
Ok(parser)
}