fix: megaparse sdk with nats (#3496)

* Adapt deps: replace the `megaparse[all]` 0.0.43 pin with `megaparse-sdk==0.1.7`
* Change `MegaparseProcessor.process_file_inner` to parse files through `MegaParseNATSClient` instead of `MegaParse` with `UnstructuredParser`
This commit is contained in:
Chloé Daems 2024-11-25 15:29:38 +01:00 committed by GitHub
parent a4e42b08a0
commit e68b4f4569
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 7 additions and 32 deletions

View File

@ -23,7 +23,7 @@ dependencies = [
"faiss-cpu>=1.8.0.post1", "faiss-cpu>=1.8.0.post1",
"rapidfuzz>=3.10.1", "rapidfuzz>=3.10.1",
"markupsafe>=2.1.5", "markupsafe>=2.1.5",
"megaparse[all]== 0.0.43", "megaparse-sdk==0.1.7"
] ]
readme = "README.md" readme = "README.md"
requires-python = ">= 3.11" requires-python = ">= 3.11"

View File

@ -3,8 +3,8 @@ import logging
import tiktoken import tiktoken
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from megaparse.core.megaparse import MegaParse from megaparse_sdk.client import MegaParseNATSClient
from megaparse.core.parser.unstructured_parser import UnstructuredParser from megaparse_sdk.config import ClientNATSConfig
from quivr_core.config import MegaparseConfig from quivr_core.config import MegaparseConfig
from quivr_core.files.file import QuivrFile from quivr_core.files.file import QuivrFile
@ -75,9 +75,9 @@ class MegaparseProcessor(ProcessorBase):
async def process_file_inner(self, file: QuivrFile) -> list[Document]: async def process_file_inner(self, file: QuivrFile) -> list[Document]:
logger.info(f"Uploading file {file.path} to MegaParse") logger.info(f"Uploading file {file.path} to MegaParse")
parser = UnstructuredParser(**self.megaparse_config.model_dump()) async with MegaParseNATSClient(ClientNATSConfig()) as client:
megaparse = MegaParse(parser) response = await client.parse_file(file=file.path)
response = await megaparse.aload(file.path)
logger.info(f"File : {response}") logger.info(f"File : {response}")
document = Document( document = Document(
page_content=response, page_content=response,
@ -87,28 +87,3 @@ class MegaparseProcessor(ProcessorBase):
for doc in docs: for doc in docs:
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
return docs return docs
# async def process_file_inner(self, file: QuivrFile) -> list[Document]:
# api_key = str(os.getenv("MEGAPARSE_API_KEY"))
# megaparse = MegaParseSDK(api_key)
# logger.info(f"Uploading file {file.path} to MegaParse")
# data = {
# "method": self.megaparse_config.method,
# "strategy": self.megaparse_config.strategy,
# "check_table": self.megaparse_config.check_table,
# "parsing_instruction": self.megaparse_config.parsing_instruction,
# "model_name": self.megaparse_config.model_name,
# }
# response = await megaparse.file.upload(
# file_path=str(file.path),
# **data,
# )
# document = Document(
# page_content=response["result"],
# )
# if len(response) > self.splitter_config.chunk_size:
# docs = self.text_splitter.split_documents([document])
# for doc in docs:
# doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
# return docs
# return [document]

View File

@ -11,7 +11,7 @@ from rich.prompt import Prompt
if __name__ == "__main__": if __name__ == "__main__":
brain = Brain.from_files( brain = Brain.from_files(
name="test_brain", name="test_brain",
file_paths=["./tests/processor/docx/demo.docx"], file_paths=["./tests/processor/pdf/sample.pdf"],
llm=LLMEndpoint( llm=LLMEndpoint(
llm_config=LLMEndpointConfig(model="gpt-4o"), llm_config=LLMEndpointConfig(model="gpt-4o"),
llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))), llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),