mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-15 01:21:48 +03:00
feat(llamaparse): Add Llama Parse integration for complex document parsing (#2517)
This pull request adds Llama Parse integration for complex document parsing in Quivr. Llama Parse is a tool from Llama Index that allows you to read complex documents in Quivr. It provides an API key that needs to be added to the `.env` file as `LLAMA_CLOUD_API_KEY`. Once configured, you can use the Llama Parse tool to read `pdf`, `docx`, and `doc` files in Quivr.
This commit is contained in:
parent
dedb78c84d
commit
8b0c55de5c
@ -1,13 +1,21 @@
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import nest_asyncio
|
||||
import tiktoken
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from llama_parse import LlamaParse
|
||||
from logger import get_logger
|
||||
from models import File
|
||||
from modules.brain.service.brain_vector_service import BrainVectorService
|
||||
from modules.upload.service.upload_file import DocumentSerializable
|
||||
from packages.embeddings.vectors import Neurons
|
||||
|
||||
nest_asyncio.apply()
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -22,7 +30,35 @@ async def process_file(
|
||||
dateshort = time.strftime("%Y%m%d")
|
||||
neurons = Neurons()
|
||||
|
||||
file.compute_documents(loader_class)
|
||||
if os.getenv("LLAMA_CLOUD_API_KEY"):
|
||||
doc = file.file
|
||||
document_ext = os.path.splitext(doc.filename)[1]
|
||||
if document_ext in [".pdf", ".docx", ".doc"]:
|
||||
document_tmp = tempfile.NamedTemporaryFile(
|
||||
suffix=document_ext, delete=False
|
||||
)
|
||||
# Seek to the beginning of the file
|
||||
doc.file.seek(0)
|
||||
document_tmp.write(doc.file.read())
|
||||
|
||||
parser = LlamaParse(
|
||||
result_type="mardown", # "markdown" and "text" are available
|
||||
parsing_instruction="Extract all the information as possible in a way a human can understand by being as verbose as possible.",
|
||||
)
|
||||
|
||||
document_llama_parsed = parser.load_data(document_tmp.name)
|
||||
document_tmp.close()
|
||||
document_to_langchain = document_llama_parsed[0].to_langchain_format()
|
||||
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
||||
chunk_size=file.chunk_size, chunk_overlap=file.chunk_overlap
|
||||
)
|
||||
document_to_langchain = Document(
|
||||
page_content=document_to_langchain.page_content
|
||||
)
|
||||
file.documents = text_splitter.split_documents([document_to_langchain])
|
||||
else:
|
||||
|
||||
file.compute_documents(loader_class)
|
||||
|
||||
metadata = {
|
||||
"file_sha1": file.file_sha1,
|
||||
@ -40,8 +76,10 @@ async def process_file(
|
||||
enc = tiktoken.get_encoding("cl100k_base")
|
||||
|
||||
if file.documents is not None:
|
||||
logger.info("Coming here?")
|
||||
for doc in file.documents: # pyright: ignore reportPrivateUsage=none
|
||||
new_metadata = metadata.copy()
|
||||
logger.info(f"Processing document {doc}")
|
||||
# Add filename at beginning of page content
|
||||
doc.page_content = f"Filename: {new_metadata['original_file_name']} Content: {doc.page_content}"
|
||||
|
||||
|
28
docs/configuring/llamaparse.mdx
Normal file
28
docs/configuring/llamaparse.mdx
Normal file
@ -0,0 +1,28 @@
|
||||
---
|
||||
title: Llama Parse
|
||||
description: Use Llama Parse to read complex document in Quivr
|
||||
---
|
||||
|
||||
|
||||
# Llama Parse
|
||||
|
||||
Llama Parse is a tool from Llama Index that allows you to read complex documents in Quivr.
|
||||
|
||||
Link to Llama Parse: [https://cloud.llamaindex.ai/parse](https://cloud.llamaindex.ai/parse)
|
||||
|
||||
## How to use Llama Parse
|
||||
|
||||
1. Go to [https://cloud.llamaindex.ai/parse](https://cloud.llamaindex.ai/parse)
|
||||
|
||||
2. Create an API key by clicking on the `Create API Key` button.
|
||||
|
||||
3. Add `LLAMA_CLOUD_API_KEY` to your `.env` file.
|
||||
|
||||
```bash
|
||||
# Llama Parse Configuration
|
||||
LLAMA_CLOUD_API_KEY=your_llama_cloud_api_key
|
||||
```
|
||||
|
||||
4. Use the Llama Parse tool to read complex documents in Quivr - Currently only supports `pdf`, `docx`, and `doc` files.
|
||||
|
||||
|
@ -66,6 +66,7 @@
|
||||
"configuring/increase-user-usage",
|
||||
"configuring/resend-emails",
|
||||
"configuring/reranking",
|
||||
"configuring/llamaparse",
|
||||
"configuring/environment-variables",
|
||||
"configuring/profiler",
|
||||
"configuring/telemetry"
|
||||
|
Loading…
Reference in New Issue
Block a user