mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-14 17:03:29 +03:00
fix: added chunk_size in tika processor (#3466)
This commit is contained in:
parent
190d971bd7
commit
063bbd323d
@@ -1,3 +1,4 @@
|
|||||||
|
import tiktoken
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import AsyncIterable
|
from typing import AsyncIterable
|
||||||
@@ -39,6 +40,7 @@ class TikaProcessor(ProcessorBase):
|
|||||||
self.max_retries = max_retries
|
self.max_retries = max_retries
|
||||||
self._client = httpx.AsyncClient(timeout=timeout)
|
self._client = httpx.AsyncClient(timeout=timeout)
|
||||||
|
|
||||||
|
self.enc = tiktoken.get_encoding("cl100k_base")
|
||||||
self.splitter_config = splitter_config
|
self.splitter_config = splitter_config
|
||||||
|
|
||||||
if splitter:
|
if splitter:
|
||||||
@@ -73,5 +75,7 @@ class TikaProcessor(ProcessorBase):
|
|||||||
txt = await self._send_parse_tika(f)
|
txt = await self._send_parse_tika(f)
|
||||||
document = Document(page_content=txt)
|
document = Document(page_content=txt)
|
||||||
docs = self.text_splitter.split_documents([document])
|
docs = self.text_splitter.split_documents([document])
|
||||||
|
for doc in docs:
|
||||||
|
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
||||||
|
|
||||||
return docs
|
return docs
|
||||||
|
Loading…
Reference in New Issue
Block a user