mirror of
https://github.com/StanGirard/quivr.git
synced 2024-11-22 11:33:57 +03:00
Fix/add_megaparse_lib_ (#3476)
This commit is contained in:
parent
da97b2cf14
commit
175a1cd2dd
@ -22,8 +22,8 @@ dependencies = [
|
||||
"transformers[sentencepiece]>=4.44.2",
|
||||
"faiss-cpu>=1.8.0.post1",
|
||||
"rapidfuzz>=3.10.1",
|
||||
"megaparse-sdk>=0.1.2",
|
||||
"markupsafe>=2.1.5",
|
||||
"megaparse[all]== 0.0.43",
|
||||
]
|
||||
readme = "README.md"
|
||||
requires-python = ">= 3.11"
|
||||
|
@ -1,10 +1,10 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
import tiktoken
|
||||
from langchain_core.documents import Document
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
|
||||
from megaparse_sdk import MegaParseSDK
|
||||
from megaparse.core.megaparse import MegaParse
|
||||
from megaparse.core.parser.unstructured_parser import UnstructuredParser
|
||||
|
||||
from quivr_core.config import MegaparseConfig
|
||||
from quivr_core.files.file import QuivrFile
|
||||
@ -74,26 +74,41 @@ class MegaparseProcessor(ProcessorBase):
|
||||
}
|
||||
|
||||
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
|
||||
api_key = str(os.getenv("MEGAPARSE_API_KEY"))
|
||||
megaparse = MegaParseSDK(api_key)
|
||||
logger.info(f"Uploading file {file.path} to MegaParse")
|
||||
data = {
|
||||
"method": self.megaparse_config.method,
|
||||
"strategy": self.megaparse_config.strategy,
|
||||
"check_table": self.megaparse_config.check_table,
|
||||
"parsing_instruction": self.megaparse_config.parsing_instruction,
|
||||
"model_name": self.megaparse_config.model_name,
|
||||
}
|
||||
response = await megaparse.file.upload(
|
||||
file_path=str(file.path),
|
||||
**data,
|
||||
)
|
||||
parser = UnstructuredParser(**self.megaparse_config.model_dump())
|
||||
megaparse = MegaParse(parser)
|
||||
response = await megaparse.aload(file.path)
|
||||
logger.info(f"File : {response}")
|
||||
document = Document(
|
||||
page_content=response["result"],
|
||||
page_content=response,
|
||||
)
|
||||
if len(response) > self.splitter_config.chunk_size:
|
||||
docs = self.text_splitter.split_documents([document])
|
||||
for doc in docs:
|
||||
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
||||
return docs
|
||||
return [document]
|
||||
|
||||
docs = self.text_splitter.split_documents([document])
|
||||
for doc in docs:
|
||||
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
||||
return docs
|
||||
|
||||
# async def process_file_inner(self, file: QuivrFile) -> list[Document]:
|
||||
# api_key = str(os.getenv("MEGAPARSE_API_KEY"))
|
||||
# megaparse = MegaParseSDK(api_key)
|
||||
# logger.info(f"Uploading file {file.path} to MegaParse")
|
||||
# data = {
|
||||
# "method": self.megaparse_config.method,
|
||||
# "strategy": self.megaparse_config.strategy,
|
||||
# "check_table": self.megaparse_config.check_table,
|
||||
# "parsing_instruction": self.megaparse_config.parsing_instruction,
|
||||
# "model_name": self.megaparse_config.model_name,
|
||||
# }
|
||||
# response = await megaparse.file.upload(
|
||||
# file_path=str(file.path),
|
||||
# **data,
|
||||
# )
|
||||
# document = Document(
|
||||
# page_content=response["result"],
|
||||
# )
|
||||
# if len(response) > self.splitter_config.chunk_size:
|
||||
# docs = self.text_splitter.split_documents([document])
|
||||
# for doc in docs:
|
||||
# doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
|
||||
# return docs
|
||||
# return [document]
|
||||
|
Loading…
Reference in New Issue
Block a user