mirror of
https://github.com/StanGirard/quivr.git
synced 2025-01-05 10:08:28 +03:00
fix: megaparse sdk with nats (#3496)
* Adapt deps
* Change megaparse processor inner file processing
This commit is contained in:
parent
a4e42b08a0
commit
e68b4f4569
@ -23,7 +23,7 @@ dependencies = [
|
|||||||
"faiss-cpu>=1.8.0.post1",
|
"faiss-cpu>=1.8.0.post1",
|
||||||
"rapidfuzz>=3.10.1",
|
"rapidfuzz>=3.10.1",
|
||||||
"markupsafe>=2.1.5",
|
"markupsafe>=2.1.5",
|
||||||
"megaparse[all]== 0.0.43",
|
"megaparse-sdk==0.1.7"
|
||||||
]
|
]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">= 3.11"
|
requires-python = ">= 3.11"
|
||||||
|
@ -3,8 +3,8 @@ import logging
|
|||||||
import tiktoken
|
import tiktoken
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
|
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
|
||||||
from megaparse.core.megaparse import MegaParse
|
from megaparse_sdk.client import MegaParseNATSClient
|
||||||
from megaparse.core.parser.unstructured_parser import UnstructuredParser
|
from megaparse_sdk.config import ClientNATSConfig
|
||||||
|
|
||||||
from quivr_core.config import MegaparseConfig
|
from quivr_core.config import MegaparseConfig
|
||||||
from quivr_core.files.file import QuivrFile
|
from quivr_core.files.file import QuivrFile
|
||||||
@ -75,9 +75,9 @@ class MegaparseProcessor(ProcessorBase):
|
|||||||
|
|
||||||
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
|
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
|
||||||
logger.info(f"Uploading file {file.path} to MegaParse")
|
logger.info(f"Uploading file {file.path} to MegaParse")
|
||||||
parser = UnstructuredParser(**self.megaparse_config.model_dump())
|
async with MegaParseNATSClient(ClientNATSConfig()) as client:
|
||||||
megaparse = MegaParse(parser)
|
response = await client.parse_file(file=file.path)
|
||||||
response = await megaparse.aload(file.path)
|
|
||||||
logger.info(f"File : {response}")
|
logger.info(f"File : {response}")
|
||||||
document = Document(
|
document = Document(
|
||||||
page_content=response,
|
page_content=response,
|
||||||
@ -87,28 +87,3 @@ class MegaparseProcessor(ProcessorBase):
|
|||||||
for doc in docs:
|
for doc in docs:
|
||||||
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
# async def process_file_inner(self, file: QuivrFile) -> list[Document]:
|
|
||||||
# api_key = str(os.getenv("MEGAPARSE_API_KEY"))
|
|
||||||
# megaparse = MegaParseSDK(api_key)
|
|
||||||
# logger.info(f"Uploading file {file.path} to MegaParse")
|
|
||||||
# data = {
|
|
||||||
# "method": self.megaparse_config.method,
|
|
||||||
# "strategy": self.megaparse_config.strategy,
|
|
||||||
# "check_table": self.megaparse_config.check_table,
|
|
||||||
# "parsing_instruction": self.megaparse_config.parsing_instruction,
|
|
||||||
# "model_name": self.megaparse_config.model_name,
|
|
||||||
# }
|
|
||||||
# response = await megaparse.file.upload(
|
|
||||||
# file_path=str(file.path),
|
|
||||||
# **data,
|
|
||||||
# )
|
|
||||||
# document = Document(
|
|
||||||
# page_content=response["result"],
|
|
||||||
# )
|
|
||||||
# if len(response) > self.splitter_config.chunk_size:
|
|
||||||
# docs = self.text_splitter.split_documents([document])
|
|
||||||
# for doc in docs:
|
|
||||||
# doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
|
||||||
# return docs
|
|
||||||
# return [document]
|
|
||||||
|
@ -11,7 +11,7 @@ from rich.prompt import Prompt
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
brain = Brain.from_files(
|
brain = Brain.from_files(
|
||||||
name="test_brain",
|
name="test_brain",
|
||||||
file_paths=["./tests/processor/docx/demo.docx"],
|
file_paths=["./tests/processor/pdf/sample.pdf"],
|
||||||
llm=LLMEndpoint(
|
llm=LLMEndpoint(
|
||||||
llm_config=LLMEndpointConfig(model="gpt-4o"),
|
llm_config=LLMEndpointConfig(model="gpt-4o"),
|
||||||
llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),
|
llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),
|
||||||
|
Loading…
Reference in New Issue
Block a user