added openai language model tokenizer and LanguageModel trait

2024-12-28 21:43:45 +03:00 · 2023-10-17 16:21:03 -04:00 · 2023-10-17 16:21:03 -04:00 · a874a09b7e
commit a874a09b7e
parent ad92fe49c7
4 changed files with 102 additions and 44 deletions
--- a/crates/ai/src/ai.rs
+++ b/crates/ai/src/ai.rs
@ -1,3 +1,4 @@
 pub mod completion;
 pub mod embedding;
+pub mod models;
 pub mod templates;
--- a/crates/ai/src/models.rs
+++ b/crates/ai/src/models.rs
@ -0,0 +1,49 @@
+use anyhow::anyhow;
+use tiktoken_rs::CoreBPE;
+use util::ResultExt;
+
+pub trait LanguageModel {
+    fn name(&self) -> String;
+    fn count_tokens(&self, content: &str) -> anyhow::Result<usize>;
+    fn truncate(&self, content: &str, length: usize) -> anyhow::Result<String>;
+    fn capacity(&self) -> anyhow::Result<usize>;
+}
+
+struct OpenAILanguageModel {
+    name: String,
+    bpe: Option<CoreBPE>,
+}
+
+impl OpenAILanguageModel {
+    pub fn load(model_name: String) -> Self {
+        let bpe = tiktoken_rs::get_bpe_from_model(&model_name).log_err();
+        OpenAILanguageModel {
+            name: model_name,
+            bpe,
+        }
+    }
+}
+
+impl LanguageModel for OpenAILanguageModel {
+    fn name(&self) -> String {
+        self.name.clone()
+    }
+    fn count_tokens(&self, content: &str) -> anyhow::Result<usize> {
+        if let Some(bpe) = &self.bpe {
+            anyhow::Ok(bpe.encode_with_special_tokens(content).len())
+        } else {
+            Err(anyhow!("bpe for open ai model was not retrieved"))
+        }
+    }
+    fn truncate(&self, content: &str, length: usize) -> anyhow::Result<String> {
+        if let Some(bpe) = &self.bpe {
+            let tokens = bpe.encode_with_special_tokens(content);
+            bpe.decode(tokens[..length].to_vec())
+        } else {
+            Err(anyhow!("bpe for open ai model was not retrieved"))
+        }
+    }
+    fn capacity(&self) -> anyhow::Result<usize> {
+        anyhow::Ok(tiktoken_rs::model::get_context_size(&self.name))
+    }
+}
--- a/crates/ai/src/templates/base.rs
+++ b/crates/ai/src/templates/base.rs
@ -1,17 +1,11 @@
-use std::fmt::Write;
-use std::{cmp::Reverse, sync::Arc};
+use std::cmp::Reverse;
+use std::sync::Arc;

 use util::ResultExt;

+use crate::models::LanguageModel;
 use crate::templates::repository_context::PromptCodeSnippet;

-pub trait LanguageModel {
-    fn name(&self) -> String;
-    fn count_tokens(&self, content: &str) -> usize;
-    fn truncate(&self, content: &str, length: usize) -> String;
-    fn capacity(&self) -> usize;
-}
-
 pub(crate) enum PromptFileType {
    Text,
    Code,
@ -73,7 +67,7 @@ impl PromptChain {
    pub fn generate(&self, truncate: bool) -> anyhow::Result<(String, usize)> {
        // Argsort based on Prompt Priority
        let seperator = "\n";
-        let seperator_tokens = self.args.model.count_tokens(seperator);
+        let seperator_tokens = self.args.model.count_tokens(seperator)?;
        let mut sorted_indices = (0..self.templates.len()).collect::<Vec<_>>();
        sorted_indices.sort_by_key(|&i| Reverse(&self.templates[i].0));

@ -81,7 +75,7 @@ impl PromptChain {

        // If Truncate
        let mut tokens_outstanding = if truncate {
-            Some(self.args.model.capacity() - self.args.reserved_tokens)
+            Some(self.args.model.capacity()? - self.args.reserved_tokens)
        } else {
            None
        };
@ -111,7 +105,7 @@ impl PromptChain {
        }

        let full_prompt = prompts.join(seperator);
-        let total_token_count = self.args.model.count_tokens(&full_prompt);
+        let total_token_count = self.args.model.count_tokens(&full_prompt)?;
        anyhow::Ok((prompts.join(seperator), total_token_count))
    }
 }
@ -131,10 +125,10 @@ pub(crate) mod tests {
            ) -> anyhow::Result<(String, usize)> {
                let mut content = "This is a test prompt template".to_string();

-                let mut token_count = args.model.count_tokens(&content);
+                let mut token_count = args.model.count_tokens(&content)?;
                if let Some(max_token_length) = max_token_length {
                    if token_count > max_token_length {
-                        content = args.model.truncate(&content, max_token_length);
+                        content = args.model.truncate(&content, max_token_length)?;
                        token_count = max_token_length;
                    }
                }
@ -152,10 +146,10 @@ pub(crate) mod tests {
            ) -> anyhow::Result<(String, usize)> {
                let mut content = "This is a low priority test prompt template".to_string();

-                let mut token_count = args.model.count_tokens(&content);
+                let mut token_count = args.model.count_tokens(&content)?;
                if let Some(max_token_length) = max_token_length {
                    if token_count > max_token_length {
-                        content = args.model.truncate(&content, max_token_length);
+                        content = args.model.truncate(&content, max_token_length)?;
                        token_count = max_token_length;
                    }
                }
@ -169,26 +163,22 @@ pub(crate) mod tests {
            capacity: usize,
        }

-        impl DummyLanguageModel {
-            fn set_capacity(&mut self, capacity: usize) {
-                self.capacity = capacity
-            }
-        }
-
        impl LanguageModel for DummyLanguageModel {
            fn name(&self) -> String {
                "dummy".to_string()
            }
-            fn count_tokens(&self, content: &str) -> usize {
-                content.chars().collect::<Vec<char>>().len()
+            fn count_tokens(&self, content: &str) -> anyhow::Result<usize> {
+                anyhow::Ok(content.chars().collect::<Vec<char>>().len())
            }
-            fn truncate(&self, content: &str, length: usize) -> String {
-                content.chars().collect::<Vec<char>>()[..length]
-                    .into_iter()
-                    .collect::<String>()
+            fn truncate(&self, content: &str, length: usize) -> anyhow::Result<String> {
+                anyhow::Ok(
+                    content.chars().collect::<Vec<char>>()[..length]
+                        .into_iter()
+                        .collect::<String>(),
+                )
            }
-            fn capacity(&self) -> usize {
-                self.capacity
+            fn capacity(&self) -> anyhow::Result<usize> {
+                anyhow::Ok(self.capacity)
            }
        }

@ -215,7 +205,7 @@ pub(crate) mod tests {
                .to_string()
        );

-        assert_eq!(model.count_tokens(&prompt), token_count);
+        assert_eq!(model.count_tokens(&prompt).unwrap(), token_count);

        // Testing with Truncation Off
        // Should ignore capacity and return all prompts
@ -242,7 +232,7 @@ pub(crate) mod tests {
                .to_string()
        );

-        assert_eq!(model.count_tokens(&prompt), token_count);
+        assert_eq!(model.count_tokens(&prompt).unwrap(), token_count);

        // Testing with Truncation Off
        // Should ignore capacity and return all prompts
--- a/crates/ai/src/templates/preamble.rs
+++ b/crates/ai/src/templates/preamble.rs
@ -4,31 +4,49 @@ use std::fmt::Write;
 struct EngineerPreamble {}

 impl PromptTemplate for EngineerPreamble {
-    fn generate(&self, args: &PromptArguments, max_token_length: Option<usize>) -> String {
-        let mut prompt = String::new();
+    fn generate(
+        &self,
+        args: &PromptArguments,
+        max_token_length: Option<usize>,
+    ) -> anyhow::Result<(String, usize)> {
+        let mut prompts = Vec::new();

        match args.get_file_type() {
            PromptFileType::Code => {
-                writeln!(
-                    prompt,
+                prompts.push(format!(
                    "You are an expert {} engineer.",
                    args.language_name.clone().unwrap_or("".to_string())
-                )
-                .unwrap();
+                ));
            }
            PromptFileType::Text => {
-                writeln!(prompt, "You are an expert engineer.").unwrap();
+                prompts.push("You are an expert engineer.".to_string());
            }
        }

        if let Some(project_name) = args.project_name.clone() {
-            writeln!(
-                prompt,
+            prompts.push(format!(
                "You are currently working inside the '{project_name}' in Zed the code editor."
-            )
-            .unwrap();
+            ));
        }

-        prompt
+        if let Some(mut remaining_tokens) = max_token_length {
+            let mut prompt = String::new();
+            let mut total_count = 0;
+            for prompt_piece in prompts {
+                let prompt_token_count =
+                    args.model.count_tokens(&prompt_piece)? + args.model.count_tokens("\n")?;
+                if remaining_tokens > prompt_token_count {
+                    writeln!(prompt, "{prompt_piece}").unwrap();
+                    remaining_tokens -= prompt_token_count;
+                    total_count += prompt_token_count;
+                }
+            }
+
+            anyhow::Ok((prompt, total_count))
+        } else {
+            let prompt = prompts.join("\n");
+            let token_count = args.model.count_tokens(&prompt)?;
+            anyhow::Ok((prompt, token_count))
+        }
    }
 }