Fix truncation bug, ensuring no invalid inputs are sent to OpenAI

This commit is contained in:
KCaverly 2023-08-30 17:42:16 -04:00
parent 5abad58b0d
commit 7d4d6c871b
2 changed files with 9 additions and 9 deletions

View File

@@ -78,15 +78,13 @@ impl EmbeddingProvider for DummyEmbeddings {
let token_count = tokens.len();
let output = if token_count > OPENAI_INPUT_LIMIT {
tokens.truncate(OPENAI_INPUT_LIMIT);
OPENAI_BPE_TOKENIZER
.decode(tokens)
.ok()
.unwrap_or_else(|| span.to_string())
let new_input = OPENAI_BPE_TOKENIZER.decode(tokens.clone());
new_input.ok().unwrap_or_else(|| span.to_string())
} else {
span.to_string()
};
(output, token_count)
(output, tokens.len())
}
}
@@ -120,7 +118,7 @@ impl OpenAIEmbeddings {
#[async_trait]
impl EmbeddingProvider for OpenAIEmbeddings {
fn max_tokens_per_batch(&self) -> usize {
OPENAI_INPUT_LIMIT
50000
}
fn truncate(&self, span: &str) -> (String, usize) {

View File

@@ -105,9 +105,11 @@ impl EmbeddingQueue {
for fragment in &batch {
let file = fragment.file.lock();
spans.extend(
file.documents[fragment.document_range.clone()]
.iter()
.map(|d| d.content.clone()),
{
file.documents[fragment.document_range.clone()]
.iter()
.map(|d| d.content.clone())
}
);
}