mirror of
https://github.com/StanGirard/quivr.git
synced 2025-01-03 08:45:26 +03:00
fix: megaparse sdk with nats (#3496)
* Adapt deps
* Change megaparse processor inner file processing
This commit is contained in:
parent
a4e42b08a0
commit
e68b4f4569
@ -23,7 +23,7 @@ dependencies = [
|
||||
"faiss-cpu>=1.8.0.post1",
|
||||
"rapidfuzz>=3.10.1",
|
||||
"markupsafe>=2.1.5",
|
||||
"megaparse[all]== 0.0.43",
|
||||
"megaparse-sdk==0.1.7"
|
||||
]
|
||||
readme = "README.md"
|
||||
requires-python = ">= 3.11"
|
||||
|
@ -3,8 +3,8 @@ import logging
|
||||
import tiktoken
|
||||
from langchain_core.documents import Document
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
|
||||
from megaparse.core.megaparse import MegaParse
|
||||
from megaparse.core.parser.unstructured_parser import UnstructuredParser
|
||||
from megaparse_sdk.client import MegaParseNATSClient
|
||||
from megaparse_sdk.config import ClientNATSConfig
|
||||
|
||||
from quivr_core.config import MegaparseConfig
|
||||
from quivr_core.files.file import QuivrFile
|
||||
@ -75,9 +75,9 @@ class MegaparseProcessor(ProcessorBase):
|
||||
|
||||
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
|
||||
logger.info(f"Uploading file {file.path} to MegaParse")
|
||||
parser = UnstructuredParser(**self.megaparse_config.model_dump())
|
||||
megaparse = MegaParse(parser)
|
||||
response = await megaparse.aload(file.path)
|
||||
async with MegaParseNATSClient(ClientNATSConfig()) as client:
|
||||
response = await client.parse_file(file=file.path)
|
||||
|
||||
logger.info(f"File : {response}")
|
||||
document = Document(
|
||||
page_content=response,
|
||||
@ -87,28 +87,3 @@ class MegaparseProcessor(ProcessorBase):
|
||||
for doc in docs:
|
||||
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
||||
return docs
|
||||
|
||||
# async def process_file_inner(self, file: QuivrFile) -> list[Document]:
|
||||
# api_key = str(os.getenv("MEGAPARSE_API_KEY"))
|
||||
# megaparse = MegaParseSDK(api_key)
|
||||
# logger.info(f"Uploading file {file.path} to MegaParse")
|
||||
# data = {
|
||||
# "method": self.megaparse_config.method,
|
||||
# "strategy": self.megaparse_config.strategy,
|
||||
# "check_table": self.megaparse_config.check_table,
|
||||
# "parsing_instruction": self.megaparse_config.parsing_instruction,
|
||||
# "model_name": self.megaparse_config.model_name,
|
||||
# }
|
||||
# response = await megaparse.file.upload(
|
||||
# file_path=str(file.path),
|
||||
# **data,
|
||||
# )
|
||||
# document = Document(
|
||||
# page_content=response["result"],
|
||||
# )
|
||||
# if len(response) > self.splitter_config.chunk_size:
|
||||
# docs = self.text_splitter.split_documents([document])
|
||||
# for doc in docs:
|
||||
# doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
||||
# return docs
|
||||
# return [document]
|
||||
|
@ -11,7 +11,7 @@ from rich.prompt import Prompt
|
||||
if __name__ == "__main__":
|
||||
brain = Brain.from_files(
|
||||
name="test_brain",
|
||||
file_paths=["./tests/processor/docx/demo.docx"],
|
||||
file_paths=["./tests/processor/pdf/sample.pdf"],
|
||||
llm=LLMEndpoint(
|
||||
llm_config=LLMEndpointConfig(model="gpt-4o"),
|
||||
llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),
|
||||
|
Loading…
Reference in New Issue
Block a user