feat(ingestion): Add ingestion module and routes (#2393)

This pull request adds the ingestion module and its API routes. It introduces an ingestions table, an ITO base class for ingestion tasks, three concrete ingestion types (summary, audio transcript, and crawler), and registers the new ingestion_router in the FastAPI app.
Stan Girard 2024-04-01 18:40:56 -07:00 committed by GitHub
parent 65c0ed505e
commit a95e311712
25 changed files with 600 additions and 79 deletions

@ -189,6 +189,8 @@ def process_integration_brain_sync():
    time = datetime.now(timezone.utc)  # Make `time` timezone-aware
    # last_synced is a string that represents a timestamptz in the database
    # only call process_integration_brain_sync_user_brain if more than 1 day has passed since the last sync
    if not integrations:
        return
    for integration in integrations:
        print(f"last_synced: {integration.last_synced}")
        last_synced = datetime.strptime(

@ -15,6 +15,7 @@ from modules.api_key.controller import api_key_router
from modules.brain.controller import brain_router
from modules.chat.controller import chat_router
from modules.contact_support.controller import contact_router
from modules.ingestion.controller import ingestion_router
from modules.knowledge.controller import knowledge_router
from modules.misc.controller import misc_router
from modules.notification.controller import notification_router
@ -70,6 +71,7 @@ add_cors_middleware(app)
app.include_router(brain_router)
app.include_router(chat_router)
app.include_router(crawl_router)
app.include_router(ingestion_router)
app.include_router(onboarding_router)
app.include_router(misc_router)

@ -0,0 +1 @@
from .ingestion_routes import ingestion_router

@ -0,0 +1,82 @@
from typing import List
from uuid import UUID

from fastapi import APIRouter, Depends, File, Query, UploadFile
from logger import get_logger
from middlewares.auth import AuthBearer, get_current_user
from modules.ingestion.entity.ingestion import IngestionEntity
from modules.ingestion.ito.audio_transcript import AudioTranscriptIngestion
from modules.ingestion.ito.crawler import CrawlerIngestion
from modules.ingestion.ito.summary import SummaryIngestion
from modules.ingestion.service.ingestion import Ingestion
from modules.user.entity.user_identity import UserIdentity

ingestion_router = APIRouter()
logger = get_logger(__name__)

ingestion_service = Ingestion()


@ingestion_router.get(
    "/ingestion", dependencies=[Depends(AuthBearer())], tags=["Ingestion"]
)
async def list_ingestion(
    current_user: UserIdentity = Depends(get_current_user),
) -> List[IngestionEntity]:
    """
    Retrieve and list all the available ingestion types.
    """
    ingestions = ingestion_service.get_all_ingestions()
    return ingestions


@ingestion_router.post(
    "/ingestion/{ingestion_id}/process",
    dependencies=[Depends(AuthBearer())],
    tags=["Ingestion"],
)
async def process_ingestion(
    ingestion_id: UUID,
    file_1: UploadFile = File(None),
    current_user: UserIdentity = Depends(get_current_user),
    brain_id: UUID = Query(None, description="The ID of the brain"),
    send_file_email: bool = Query(False, description="Send the file by email"),
    url: str = Query(None, description="The URL to process"),
):
    if ingestion_id is None:
        raise ValueError("Ingestion ID is required")

    ingestion = ingestion_service.get_ingestion_by_id(ingestion_id)

    if ingestion.name == "summary":
        summary = SummaryIngestion(
            uploadFile=file_1,
            current_user=current_user,
            brain_id=brain_id,
            send_file_email=send_file_email,
            url=url,
        )
        return await summary.process_ingestion()

    if ingestion.name == "audio_transcript":
        audio_summary = AudioTranscriptIngestion(
            uploadFile=file_1,
            current_user=current_user,
            brain_id=brain_id,
            send_file_email=send_file_email,
            url=url,
        )
        return await audio_summary.process_ingestion()

    if ingestion.name == "crawler":
        crawler = CrawlerIngestion(
            uploadFile=file_1,
            current_user=current_user,
            brain_id=brain_id,
            send_file_email=send_file_email,
            url=url,
        )
        return await crawler.process_ingestion()

    return {"message": "Not found"}

@ -0,0 +1,2 @@
from .inputs import CreateKnowledgeProperties
from .outputs import DeleteKnowledgeResponse

@ -0,0 +1,18 @@
from typing import Optional
from uuid import UUID

from pydantic import BaseModel


class CreateKnowledgeProperties(BaseModel):
    brain_id: UUID
    file_name: Optional[str] = None
    url: Optional[str] = None
    extension: str = "txt"
    integration: Optional[str] = None
    integration_link: Optional[str] = None

    def dict(self, *args, **kwargs):
        knowledge_dict = super().dict(*args, **kwargs)
        knowledge_dict["brain_id"] = str(knowledge_dict.get("brain_id"))
        return knowledge_dict
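
For illustration, the overridden dict() serialises brain_id as a string rather than a UUID object; a minimal sketch of its behaviour (the UUID shown is arbitrary):

from uuid import uuid4

props = CreateKnowledgeProperties(brain_id=uuid4(), file_name="notes.txt")
print(props.dict())
# {'brain_id': '9b1c2a6e-...', 'file_name': 'notes.txt', 'url': None,
#  'extension': 'txt', 'integration': None, 'integration_link': None}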

@ -0,0 +1,8 @@
from uuid import UUID

from pydantic import BaseModel


class DeleteKnowledgeResponse(BaseModel):
    status: str = "delete"
    knowledge_id: UUID

@ -0,0 +1 @@
from .ingestion import IngestionEntity

@ -0,0 +1,10 @@
from uuid import UUID

from pydantic import BaseModel


class IngestionEntity(BaseModel):
    id: UUID
    name: str
    brain_id_required: bool
    file_1_required: bool

@ -0,0 +1,47 @@
import os
from tempfile import NamedTemporaryFile

from logger import get_logger
from modules.ingestion.ito.ito import ITO
from openai import OpenAI

logger = get_logger(__name__)


class AudioTranscriptIngestion(ITO):
    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(
            **kwargs,
        )

    async def process_ingestion(self):
        client = OpenAI()

        logger.info(f"Processing audio file {self.uploadFile.filename}")

        # Extract the original filename and create a temporary file with the same name
        filename = os.path.basename(self.uploadFile.filename)
        temp_file = NamedTemporaryFile(delete=False, suffix=filename)

        # Write the uploaded file's data to the temporary file
        data = await self.uploadFile.read()
        temp_file.write(data)
        temp_file.close()

        # Open the temporary file and pass it to the OpenAI API
        with open(temp_file.name, "rb") as file:
            transcription = client.audio.transcriptions.create(
                model="whisper-1", file=file, response_format="text"
            )
            logger.info(f"Transcription: {transcription}")

        # Delete the temporary file
        os.remove(temp_file.name)

        return await self.create_and_upload_processed_file(
            transcription, self.uploadFile.filename, "Audio Transcript"
        )

@ -0,0 +1,33 @@
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from logger import get_logger
from modules.ingestion.ito.ito import ITO

logger = get_logger(__name__)


class CrawlerIngestion(ITO):
    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(
            **kwargs,
        )

    async def process_ingestion(self):
        url = self.url
        loader = RecursiveUrlLoader(
            url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
        )
        docs = loader.load()

        nice_url = url.split("://")[1].replace("/", "_").replace(".", "_")
        nice_url += ".txt"

        for doc in docs:
            await self.create_and_upload_processed_file(
                doc.page_content, nice_url, "Crawler"
            )

@ -0,0 +1,114 @@
import random
from abc import abstractmethod
from io import BytesIO
from tempfile import NamedTemporaryFile
from uuid import UUID

from fastapi import UploadFile
from logger import get_logger
from modules.contact_support.controller.settings import ContactsSettings
from modules.upload.controller.upload_routes import upload_file
from modules.user.entity.user_identity import UserIdentity
from packages.emails.send_email import send_email
from pydantic import BaseModel

logger = get_logger(__name__)


class ITO(BaseModel):
    uploadFile: UploadFile | None = None
    current_user: UserIdentity = None
    brain_id: UUID | None = None
    send_file_email: bool = False
    url: str | None = None

    def __init__(
        self,
        uploadFile: UploadFile,
        current_user: UserIdentity,
        brain_id: UUID,
        send_file_email: bool = False,
        url: str = None,
    ):
        super().__init__(
            uploadFile=uploadFile,
            current_user=current_user,
            brain_id=brain_id,
            send_file_email=send_file_email,
            url=url,
        )

    @abstractmethod
    async def process_ingestion(self):
        pass

    async def send_output_by_email(
        self, file: UploadFile, name: str, custom_message: str = None
    ):
        settings = ContactsSettings()
        file = await self.uploadfile_to_file(file)

        with open(file.name, "rb") as f:
            mail_from = settings.resend_contact_sales_from
            mail_to = self.current_user.email
            body = f"""
            <p>{custom_message}</p>
            """
            params = {
                "from": mail_from,
                "to": mail_to,
                "subject": "Quivr Ingestion Processed",
                "reply_to": "no-reply@quivr.app",
                "html": body,
                "attachments": [{"filename": name, "content": list(f.read())}],
            }
            logger.info(f"Sending email to {mail_to} with file {name}")
            send_email(params)

    async def uploadfile_to_file(self, uploadFile: UploadFile):
        # Transform the UploadFile object to a file object with same name and content
        tmp_file = NamedTemporaryFile(delete=False)
        tmp_file.write(uploadFile.file.read())
        tmp_file.flush()  # Make sure all data is written to disk
        return tmp_file

    async def create_and_upload_processed_file(
        self, processed_content: str, original_filename: str, file_description: str
    ) -> dict:
        """Handles creation and uploading of the processed file."""
        content_io = BytesIO(processed_content.encode("utf-8"))
        content_io.seek(0)

        new_filename = (
            original_filename.split(".")[0]
            + "_"
            + file_description.lower().replace(" ", "_")
            + "_"
            + str(random.randint(1000, 9999))
            + ".txt"
        )

        file_to_upload = UploadFile(
            filename=new_filename,
            file=content_io,
            headers={"content-type": "text/plain"},
        )

        if self.send_file_email:
            await self.send_output_by_email(
                file_to_upload,
                new_filename,
                f"{file_description} of {original_filename}",
            )

        # Reset to start of file before upload
        file_to_upload.file.seek(0)

        await upload_file(
            uploadFile=file_to_upload,
            brain_id=self.brain_id,
            current_user=self.current_user,
            chat_id=None,
        )

        return {"message": f"{file_description} generated successfully"}

@ -0,0 +1,100 @@
import tempfile

from langchain.chains import (
    MapReduceDocumentsChain,
    ReduceDocumentsChain,
    StuffDocumentsChain,
)
from langchain.chains.llm import LLMChain
from langchain_community.chat_models import ChatLiteLLM
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import CharacterTextSplitter
from logger import get_logger
from modules.ingestion.ito.ito import ITO

logger = get_logger(__name__)


class SummaryIngestion(ITO):
    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(
            **kwargs,
        )

    async def process_ingestion(self):
        # Write the uploaded file to a temporary file, then pass its path to the loader
        tmp_file = tempfile.NamedTemporaryFile(delete=False)
        tmp_file.write(self.uploadFile.file.read())

        # Now pass the path of the temporary file to the loader
        loader = UnstructuredPDFLoader(tmp_file.name)
        tmp_file.close()

        data = loader.load()

        llm = ChatLiteLLM(model="gpt-3.5-turbo")

        # Map
        map_template = """The following is a set of documents
        {docs}
        Based on this list of docs, please identify the main themes
        Helpful Answer:"""
        map_prompt = PromptTemplate.from_template(map_template)
        map_chain = LLMChain(llm=llm, prompt=map_prompt)

        # Reduce
        reduce_template = """The following is a set of summaries:
        {docs}
        Take these and distill it into a final, consolidated summary of the main themes.
        Helpful Answer:"""
        reduce_prompt = PromptTemplate.from_template(reduce_template)

        # Run chain
        reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

        # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
        combine_documents_chain = StuffDocumentsChain(
            llm_chain=reduce_chain, document_variable_name="docs"
        )

        # Combines and iteratively reduces the mapped documents
        reduce_documents_chain = ReduceDocumentsChain(
            # This is the final chain that is called.
            combine_documents_chain=combine_documents_chain,
            # If documents exceed context for `StuffDocumentsChain`
            collapse_documents_chain=combine_documents_chain,
            # The maximum number of tokens to group documents into.
            token_max=4000,
        )

        # Combining documents by mapping a chain over them, then combining results
        map_reduce_chain = MapReduceDocumentsChain(
            # Map chain
            llm_chain=map_chain,
            # Reduce chain
            reduce_documents_chain=reduce_documents_chain,
            # The variable name in the llm_chain to put the documents in
            document_variable_name="docs",
            # Return the results of the map steps in the output
            return_intermediate_steps=False,
        )

        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=1000, chunk_overlap=0
        )
        split_docs = text_splitter.split_documents(data)

        content = map_reduce_chain.run(split_docs)

        return await self.create_and_upload_processed_file(
            content, self.uploadFile.filename, "Summary"
        )

@ -0,0 +1 @@
from .ingestion_interface import IngestionInterface

@ -0,0 +1,16 @@
from abc import ABC, abstractmethod
from typing import List

from modules.ingestion.entity.ingestion import IngestionEntity


class IngestionInterface(ABC):
    @abstractmethod
    def get_all_ingestions(self) -> List[IngestionEntity]:
        """
        Get all the available ingestion types.
        """
        pass

@ -0,0 +1,30 @@
from models.settings import get_supabase_client
from modules.ingestion.entity.ingestion import IngestionEntity
from modules.ingestion.repository.ingestion_interface import IngestionInterface


class Ingestion(IngestionInterface):
    def __init__(self):
        supabase_client = get_supabase_client()
        self.db = supabase_client

    def get_all_ingestions(self):
        response = self.db.from_("ingestions").select("*").execute()
        if response.data:
            return response.data
        return []

    def get_ingestion_by_id(self, ingestion_id) -> IngestionEntity:
        response = (
            self.db.from_("ingestions")
            .select("*")
            .filter("id", "eq", ingestion_id)
            .execute()
        )
        if response.data:
            return IngestionEntity(**response.data[0])
        return None
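
Used directly, the service behaves roughly as follows (the UUID is a placeholder):

ingestion_service = Ingestion()

# All rows from the "ingestions" table, returned as plain dicts
all_ingestions = ingestion_service.get_all_ingestions()

# A single row validated into an IngestionEntity, or None if nothing matches
entity = ingestion_service.get_ingestion_by_id("00000000-0000-0000-0000-000000000000")
if entity is not None:
    print(entity.name, entity.brain_id_required, entity.file_1_required)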

@ -3,7 +3,7 @@ from typing import Optional
from uuid import UUID
from celery_worker import process_file_and_notify
from fastapi import APIRouter, Depends, HTTPException, Query, Request, UploadFile
from fastapi import APIRouter, Depends, HTTPException, Query, UploadFile
from logger import get_logger
from middlewares.auth import AuthBearer, get_current_user
from models import UserUsage
@ -38,7 +38,6 @@ async def healthz():
@upload_router.post("/upload", dependencies=[Depends(AuthBearer())], tags=["Upload"])
async def upload_file(
    request: Request,
    uploadFile: UploadFile,
    brain_id: UUID = Query(..., description="The ID of the brain"),
    chat_id: Optional[UUID] = Query(None, description="The ID of the chat"),
@ -47,7 +46,8 @@ async def upload_file(
    validate_brain_authorization(
        brain_id, current_user.id, [RoleEnum.Editor, RoleEnum.Owner]
    )
    uploadFile.file.seek(0)
    logger.info(f"Uploading file {uploadFile.filename} to brain {brain_id}")
    user_daily_usage = UserUsage(
        id=current_user.id,
        email=current_user.email,
@ -72,6 +72,8 @@ async def upload_file(
    )
    file_content = await uploadFile.read()
    logger.info(f"File {uploadFile.filename} read successfully")
    logger.info(f"Content length: {len(file_content)}")
    filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
    try:

@ -1,4 +1,5 @@
import hashlib
from io import BytesIO
from fastapi import UploadFile
@ -17,6 +18,10 @@ def convert_bytes(bytes, precision=2):
def get_file_size(file: UploadFile):
    if isinstance(file.file, BytesIO):
        # If the file object is a BytesIO object, get the size of the bytes data
        file_size = len(file.file.getvalue())
        return file_size
    # move the cursor to the end of the file
    file.file._file.seek(0, 2)  # pyright: ignore reportPrivateUsage=none
    file_size = (

@ -0,0 +1,69 @@
create table "public"."ingestions" (
    "name" text,
    "id" uuid not null default gen_random_uuid()
);
alter table "public"."ingestions" enable row level security;
CREATE UNIQUE INDEX ingestions_pkey ON public.ingestions USING btree (id);
alter table "public"."ingestions" add constraint "ingestions_pkey" PRIMARY KEY using index "ingestions_pkey";
grant delete on table "public"."ingestions" to "anon";
grant insert on table "public"."ingestions" to "anon";
grant references on table "public"."ingestions" to "anon";
grant select on table "public"."ingestions" to "anon";
grant trigger on table "public"."ingestions" to "anon";
grant truncate on table "public"."ingestions" to "anon";
grant update on table "public"."ingestions" to "anon";
grant delete on table "public"."ingestions" to "authenticated";
grant insert on table "public"."ingestions" to "authenticated";
grant references on table "public"."ingestions" to "authenticated";
grant select on table "public"."ingestions" to "authenticated";
grant trigger on table "public"."ingestions" to "authenticated";
grant truncate on table "public"."ingestions" to "authenticated";
grant update on table "public"."ingestions" to "authenticated";
grant delete on table "public"."ingestions" to "service_role";
grant insert on table "public"."ingestions" to "service_role";
grant references on table "public"."ingestions" to "service_role";
grant select on table "public"."ingestions" to "service_role";
grant trigger on table "public"."ingestions" to "service_role";
grant truncate on table "public"."ingestions" to "service_role";
grant update on table "public"."ingestions" to "service_role";
create policy "INGESTION"
on "public"."ingestions"
as permissive
for all
to service_role;
create policy "INTEGRATIONS"
on "public"."integrations"
as permissive
for all
to service_role;

@ -0,0 +1,5 @@
alter table "public"."ingestions" add column "brain_id_required" boolean not null default true;
alter table "public"."ingestions" add column "file_1_required" boolean not null default false;

@ -0,0 +1,3 @@
alter table "public"."ingestions" add column "url_required" boolean default false;

File diff suppressed because one or more lines are too long