feat(llamaparse): Add Llama Parse integration for complex document parsing (#2517)

This pull request adds a Llama Parse integration for parsing complex documents in Quivr. Llama Parse is a tool from Llama Index for reading complex documents. It requires an API key, which must be added to the `.env` file as `LLAMA_CLOUD_API_KEY`. Once the key is configured, Quivr uses Llama Parse to read `pdf`, `docx`, and `doc` files.
2024-12-15 01:21:48 +03:00 · 2024-04-29 18:31:21 +02:00 · 2024-04-29 18:31:21 +02:00 · 8b0c55de5c
commit 8b0c55de5c
parent dedb78c84d
3 changed files with 68 additions and 1 deletions
--- a/backend/packages/files/parsers/common.py
+++ b/backend/packages/files/parsers/common.py
@ -1,13 +1,21 @@
+import os
 import re
+import tempfile
 import time

+import nest_asyncio
 import tiktoken
+from langchain.schema import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from llama_parse import LlamaParse
 from logger import get_logger
 from models import File
 from modules.brain.service.brain_vector_service import BrainVectorService
 from modules.upload.service.upload_file import DocumentSerializable
 from packages.embeddings.vectors import Neurons

+nest_asyncio.apply()
+
 logger = get_logger(__name__)


@ -22,7 +30,35 @@ async def process_file(
    dateshort = time.strftime("%Y%m%d")
    neurons = Neurons()

-    file.compute_documents(loader_class)
+    if os.getenv("LLAMA_CLOUD_API_KEY"):
+        doc = file.file
+        document_ext = os.path.splitext(doc.filename)[1]
+        if document_ext in [".pdf", ".docx", ".doc"]:
+            document_tmp = tempfile.NamedTemporaryFile(
+                suffix=document_ext, delete=False
+            )
+            # Seek to the beginning of the file
+            doc.file.seek(0)
+            document_tmp.write(doc.file.read())
+
+            parser = LlamaParse(
+                result_type="mardown",  # "markdown" and "text" are available
+                parsing_instruction="Extract all the information as possible in a way a human can understand by being as verbose as possible.",
+            )
+
+            document_llama_parsed = parser.load_data(document_tmp.name)
+            document_tmp.close()
+            document_to_langchain = document_llama_parsed[0].to_langchain_format()
+            text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                chunk_size=file.chunk_size, chunk_overlap=file.chunk_overlap
+            )
+            document_to_langchain = Document(
+                page_content=document_to_langchain.page_content
+            )
+            file.documents = text_splitter.split_documents([document_to_langchain])
+    else:
+
+        file.compute_documents(loader_class)

    metadata = {
        "file_sha1": file.file_sha1,
@ -40,8 +76,10 @@ async def process_file(
    enc = tiktoken.get_encoding("cl100k_base")

    if file.documents is not None:
+        logger.info("Coming here?")
        for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
            new_metadata = metadata.copy()
+            logger.info(f"Processing document {doc}")
            # Add filename at beginning of page content
            doc.page_content = f"Filename: {new_metadata['original_file_name']} Content: {doc.page_content}"

--- a/docs/configuring/llamaparse.mdx
+++ b/docs/configuring/llamaparse.mdx
@ -0,0 +1,28 @@
+---
+title: Llama Parse
+description: Use Llama Parse to read complex documents in Quivr
+---
+
+
+# Llama Parse
+
+Llama Parse is a tool from Llama Index that allows you to read complex documents in Quivr.
+
+Link to Llama Parse: [https://cloud.llamaindex.ai/parse](https://cloud.llamaindex.ai/parse)
+
+## How to use Llama Parse
+
+1. Go to [https://cloud.llamaindex.ai/parse](https://cloud.llamaindex.ai/parse)
+
+2. Create an API key by clicking on the `Create API Key` button.
+
+3. Add `LLAMA_CLOUD_API_KEY` to your `.env` file.
+
+```bash
+# Llama Parse Configuration
+LLAMA_CLOUD_API_KEY=your_llama_cloud_api_key
+```
+
+4. Use the Llama Parse tool to read complex documents in Quivr. Currently, only `pdf`, `docx`, and `doc` files are supported.
+
+
--- a/docs/mint.json
+++ b/docs/mint.json
@ -66,6 +66,7 @@
            "configuring/increase-user-usage",
            "configuring/resend-emails",
            "configuring/reranking",
+            "configuring/llamaparse",
            "configuring/environment-variables",
            "configuring/profiler",
            "configuring/telemetry"