From 7d4d6c871ba88eafc8a084539a4619c8ba686872 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 30 Aug 2023 17:42:16 -0400 Subject: [PATCH] fix bug for truncation ensuring no invalid inputs are sent to openai --- crates/semantic_index/src/embedding.rs | 10 ++++------ crates/semantic_index/src/embedding_queue.rs | 8 +++++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/semantic_index/src/embedding.rs b/crates/semantic_index/src/embedding.rs index 7db22c3716..60e13a9e01 100644 --- a/crates/semantic_index/src/embedding.rs +++ b/crates/semantic_index/src/embedding.rs @@ -78,15 +78,13 @@ impl EmbeddingProvider for DummyEmbeddings { let token_count = tokens.len(); let output = if token_count > OPENAI_INPUT_LIMIT { tokens.truncate(OPENAI_INPUT_LIMIT); - OPENAI_BPE_TOKENIZER - .decode(tokens) - .ok() - .unwrap_or_else(|| span.to_string()) + let new_input = OPENAI_BPE_TOKENIZER.decode(tokens.clone()); + new_input.ok().unwrap_or_else(|| span.to_string()) } else { span.to_string() }; - (output, token_count) + (output, tokens.len()) } } @@ -120,7 +118,7 @@ impl OpenAIEmbeddings { #[async_trait] impl EmbeddingProvider for OpenAIEmbeddings { fn max_tokens_per_batch(&self) -> usize { - OPENAI_INPUT_LIMIT + 50000 } fn truncate(&self, span: &str) -> (String, usize) { diff --git a/crates/semantic_index/src/embedding_queue.rs b/crates/semantic_index/src/embedding_queue.rs index 2b48b7a7d6..c3a5de1373 100644 --- a/crates/semantic_index/src/embedding_queue.rs +++ b/crates/semantic_index/src/embedding_queue.rs @@ -105,9 +105,11 @@ impl EmbeddingQueue { for fragment in &batch { let file = fragment.file.lock(); spans.extend( - file.documents[fragment.document_range.clone()] - .iter() - .map(|d| d.content.clone()), + { + file.documents[fragment.document_range.clone()] + .iter() + .map(|d| d.content.clone()) + } ); }