fix: added chunk_size in tika processor (#3466)

This commit is contained in:
AmineDiro 2024-11-09 15:34:12 +01:00 committed by GitHub
parent 190d971bd7
commit 063bbd323d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,3 +1,4 @@
+import tiktoken
 import logging
 import os
 from typing import AsyncIterable
@@ -39,6 +40,7 @@ class TikaProcessor(ProcessorBase):
 self.max_retries = max_retries
 self._client = httpx.AsyncClient(timeout=timeout)
+self.enc = tiktoken.get_encoding("cl100k_base")
 self.splitter_config = splitter_config
 if splitter:
@@ -73,5 +75,7 @@ class TikaProcessor(ProcessorBase):
 txt = await self._send_parse_tika(f)
 document = Document(page_content=txt)
 docs = self.text_splitter.split_documents([document])
+for doc in docs:
+    doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
 return docs