From e68b4f45698898f6b514d4779c8e5fd7332f2e67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chlo=C3=A9=20Daems?= <73901882+chloedia@users.noreply.github.com> Date: Mon, 25 Nov 2024 15:29:38 +0100 Subject: [PATCH] fix: megaparse sdk with nats (#3496) * Adapt deps * Change megaparse processor inner file processing --- core/pyproject.toml | 2 +- .../implementations/megaparse_processor.py | 35 +++---------------- examples/simple_question_megaparse.py | 2 +- 3 files changed, 7 insertions(+), 32 deletions(-) diff --git a/core/pyproject.toml b/core/pyproject.toml index d57730ec2..115db9ddd 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "faiss-cpu>=1.8.0.post1", "rapidfuzz>=3.10.1", "markupsafe>=2.1.5", - "megaparse[all]== 0.0.43", + "megaparse-sdk==0.1.7" ] readme = "README.md" requires-python = ">= 3.11" diff --git a/core/quivr_core/processor/implementations/megaparse_processor.py b/core/quivr_core/processor/implementations/megaparse_processor.py index ee5dc53b8..2c46cec10 100644 --- a/core/quivr_core/processor/implementations/megaparse_processor.py +++ b/core/quivr_core/processor/implementations/megaparse_processor.py @@ -3,8 +3,8 @@ import logging import tiktoken from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter -from megaparse.core.megaparse import MegaParse -from megaparse.core.parser.unstructured_parser import UnstructuredParser +from megaparse_sdk.client import MegaParseNATSClient +from megaparse_sdk.config import ClientNATSConfig from quivr_core.config import MegaparseConfig from quivr_core.files.file import QuivrFile @@ -75,9 +75,9 @@ class MegaparseProcessor(ProcessorBase): async def process_file_inner(self, file: QuivrFile) -> list[Document]: logger.info(f"Uploading file {file.path} to MegaParse") - parser = UnstructuredParser(**self.megaparse_config.model_dump()) - megaparse = MegaParse(parser) - response = await megaparse.aload(file.path) + async with MegaParseNATSClient(ClientNATSConfig()) as client: + response = await client.parse_file(file=file.path) + logger.info(f"File : {response}") document = Document( page_content=response, @@ -87,28 +87,3 @@ class MegaparseProcessor(ProcessorBase): for doc in docs: doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} return docs - - # async def process_file_inner(self, file: QuivrFile) -> list[Document]: - # api_key = str(os.getenv("MEGAPARSE_API_KEY")) - # megaparse = MegaParseSDK(api_key) - # logger.info(f"Uploading file {file.path} to MegaParse") - # data = { - # "method": self.megaparse_config.method, - # "strategy": self.megaparse_config.strategy, - # "check_table": self.megaparse_config.check_table, - # "parsing_instruction": self.megaparse_config.parsing_instruction, - # "model_name": self.megaparse_config.model_name, - # } - # response = await megaparse.file.upload( - # file_path=str(file.path), - # **data, - # ) - # document = Document( - # page_content=response["result"], - # ) - # if len(response) > self.splitter_config.chunk_size: - # docs = self.text_splitter.split_documents([document]) - # for doc in docs: - # doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} - # return docs - # return [document] diff --git a/examples/simple_question_megaparse.py b/examples/simple_question_megaparse.py index 0d3c229e9..a46267f92 100644 --- a/examples/simple_question_megaparse.py +++ b/examples/simple_question_megaparse.py @@ -11,7 +11,7 @@ from rich.prompt import Prompt if __name__ == "__main__": brain = Brain.from_files( name="test_brain", - file_paths=["./tests/processor/docx/demo.docx"], + file_paths=["./tests/processor/pdf/sample.pdf"], llm=LLMEndpoint( llm_config=LLMEndpointConfig(model="gpt-4o"), llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),