feat(ingestion): Add ingestion module and routes (#2393)

This pull request adds the ingestion module and its API routes. It introduces an ingestions table, an ITO base class for ingestion tasks, three concrete ingestion types (summary, audio transcript, and crawler), and registers the new ingestion_router in the FastAPI app.
Stan Girard 2024-04-01 18:40:56 -07:00 committed by GitHub
parent 65c0ed505e
commit a95e311712
25 changed files with 600 additions and 79 deletions

@ -189,6 +189,8 @@ def process_integration_brain_sync():
    time = datetime.now(timezone.utc)  # Make `time` timezone-aware
    # last_synced is a string that represents a timestamptz in the database
    # only call process_integration_brain_sync_user_brain if more than 1 day has passed since the last sync
    if not integrations:
        return
    for integration in integrations:
        print(f"last_synced: {integration.last_synced}")
        last_synced = datetime.strptime(

@ -15,6 +15,7 @@ from modules.api_key.controller import api_key_router
from modules.brain.controller import brain_router
from modules.chat.controller import chat_router
from modules.contact_support.controller import contact_router
from modules.ingestion.controller import ingestion_router
from modules.knowledge.controller import knowledge_router
from modules.misc.controller import misc_router
from modules.notification.controller import notification_router
@ -70,6 +71,7 @@ add_cors_middleware(app)
app.include_router(brain_router)
app.include_router(chat_router)
app.include_router(crawl_router)
app.include_router(ingestion_router)
app.include_router(onboarding_router)
app.include_router(misc_router)

@ -0,0 +1 @@
from .ingestion_routes import ingestion_router

@ -0,0 +1,82 @@
from typing import List
from uuid import UUID

from fastapi import APIRouter, Depends, File, Query, UploadFile
from logger import get_logger
from middlewares.auth import AuthBearer, get_current_user
from modules.ingestion.entity.ingestion import IngestionEntity
from modules.ingestion.ito.audio_transcript import AudioTranscriptIngestion
from modules.ingestion.ito.crawler import CrawlerIngestion
from modules.ingestion.ito.summary import SummaryIngestion
from modules.ingestion.service.ingestion import Ingestion
from modules.user.entity.user_identity import UserIdentity

ingestion_router = APIRouter()
logger = get_logger(__name__)

ingestion_service = Ingestion()


@ingestion_router.get(
    "/ingestion", dependencies=[Depends(AuthBearer())], tags=["Ingestion"]
)
async def list_ingestion(
    current_user: UserIdentity = Depends(get_current_user),
) -> List[IngestionEntity]:
    """
    Retrieve and list all the available ingestion types.
    """
    ingestions = ingestion_service.get_all_ingestions()
    return ingestions


@ingestion_router.post(
    "/ingestion/{ingestion_id}/process",
    dependencies=[Depends(AuthBearer())],
    tags=["Ingestion"],
)
async def process_ingestion(
    ingestion_id: UUID,
    file_1: UploadFile = File(None),
    current_user: UserIdentity = Depends(get_current_user),
    brain_id: UUID = Query(None, description="The ID of the brain"),
    send_file_email: bool = Query(False, description="Send the file by email"),
    url: str = Query(None, description="The URL to process"),
):
    if ingestion_id is None:
        raise ValueError("Ingestion ID is required")

    ingestion = ingestion_service.get_ingestion_by_id(ingestion_id)

    if ingestion.name == "summary":
        summary = SummaryIngestion(
            uploadFile=file_1,
            current_user=current_user,
            brain_id=brain_id,
            send_file_email=send_file_email,
            url=url,
        )
        return await summary.process_ingestion()

    if ingestion.name == "audio_transcript":
        audio_summary = AudioTranscriptIngestion(
            uploadFile=file_1,
            current_user=current_user,
            brain_id=brain_id,
            send_file_email=send_file_email,
            url=url,
        )
        return await audio_summary.process_ingestion()

    if ingestion.name == "crawler":
        crawler = CrawlerIngestion(
            uploadFile=file_1,
            current_user=current_user,
            brain_id=brain_id,
            send_file_email=send_file_email,
            url=url,
        )
        return await crawler.process_ingestion()

    return {"message": "Not found"}

@ -0,0 +1,2 @@
from .inputs import CreateKnowledgeProperties
from .outputs import DeleteKnowledgeResponse

@ -0,0 +1,18 @@
from typing import Optional
from uuid import UUID

from pydantic import BaseModel


class CreateKnowledgeProperties(BaseModel):
    brain_id: UUID
    file_name: Optional[str] = None
    url: Optional[str] = None
    extension: str = "txt"
    integration: Optional[str] = None
    integration_link: Optional[str] = None

    def dict(self, *args, **kwargs):
        knowledge_dict = super().dict(*args, **kwargs)
        knowledge_dict["brain_id"] = str(knowledge_dict.get("brain_id"))
        return knowledge_dict
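
For illustration, the overridden dict() serialises brain_id as a string rather than a UUID object; a minimal sketch of its behaviour (the UUID shown is arbitrary):

from uuid import uuid4

props = CreateKnowledgeProperties(brain_id=uuid4(), file_name="notes.txt")
print(props.dict())
# {'brain_id': '9b1c2a6e-...', 'file_name': 'notes.txt', 'url': None,
#  'extension': 'txt', 'integration': None, 'integration_link': None}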

@ -0,0 +1,8 @@
from uuid import UUID

from pydantic import BaseModel


class DeleteKnowledgeResponse(BaseModel):
    status: str = "delete"
    knowledge_id: UUID

@ -0,0 +1 @@
from .ingestion import IngestionEntity

@ -0,0 +1,10 @@
from uuid import UUID

from pydantic import BaseModel


class IngestionEntity(BaseModel):
    id: UUID
    name: str
    brain_id_required: bool
    file_1_required: bool

@ -0,0 +1,47 @@
import os
from tempfile import NamedTemporaryFile

from logger import get_logger
from modules.ingestion.ito.ito import ITO
from openai import OpenAI

logger = get_logger(__name__)


class AudioTranscriptIngestion(ITO):
    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(
            **kwargs,
        )

    async def process_ingestion(self):
        client = OpenAI()

        logger.info(f"Processing audio file {self.uploadFile.filename}")

        # Extract the original filename and create a temporary file with the same name
        filename = os.path.basename(self.uploadFile.filename)
        temp_file = NamedTemporaryFile(delete=False, suffix=filename)

        # Write the uploaded file's data to the temporary file
        data = await self.uploadFile.read()
        temp_file.write(data)
        temp_file.close()

        # Open the temporary file and pass it to the OpenAI API
        with open(temp_file.name, "rb") as file:
            transcription = client.audio.transcriptions.create(
                model="whisper-1", file=file, response_format="text"
            )
            logger.info(f"Transcription: {transcription}")

        # Delete the temporary file
        os.remove(temp_file.name)

        return await self.create_and_upload_processed_file(
            transcription, self.uploadFile.filename, "Audio Transcript"
        )

@ -0,0 +1,33 @@
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from logger import get_logger
from modules.ingestion.ito.ito import ITO

logger = get_logger(__name__)


class CrawlerIngestion(ITO):
    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(
            **kwargs,
        )

    async def process_ingestion(self):
        url = self.url
        loader = RecursiveUrlLoader(
            url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
        )
        docs = loader.load()

        nice_url = url.split("://")[1].replace("/", "_").replace(".", "_")
        nice_url += ".txt"

        for doc in docs:
            await self.create_and_upload_processed_file(
                doc.page_content, nice_url, "Crawler"
            )

@ -0,0 +1,114 @@
import random
from abc import abstractmethod
from io import BytesIO
from tempfile import NamedTemporaryFile
from uuid import UUID

from fastapi import UploadFile
from logger import get_logger
from modules.contact_support.controller.settings import ContactsSettings
from modules.upload.controller.upload_routes import upload_file
from modules.user.entity.user_identity import UserIdentity
from packages.emails.send_email import send_email
from pydantic import BaseModel

logger = get_logger(__name__)


class ITO(BaseModel):
    uploadFile: UploadFile | None = None
    current_user: UserIdentity = None
    brain_id: UUID | None = None
    send_file_email: bool = False
    url: str | None = None

    def __init__(
        self,
        uploadFile: UploadFile,
        current_user: UserIdentity,
        brain_id: UUID,
        send_file_email: bool = False,
        url: str = None,
    ):
        super().__init__(
            uploadFile=uploadFile,
            current_user=current_user,
            brain_id=brain_id,
            send_file_email=send_file_email,
            url=url,
        )

    @abstractmethod
    async def process_ingestion(self):
        pass

    async def send_output_by_email(
        self, file: UploadFile, name: str, custom_message: str = None
    ):
        settings = ContactsSettings()
        file = await self.uploadfile_to_file(file)

        with open(file.name, "rb") as f:
            mail_from = settings.resend_contact_sales_from
            mail_to = self.current_user.email
            body = f"""
            <p>{custom_message}</p>
            """
            params = {
                "from": mail_from,
                "to": mail_to,
                "subject": "Quivr Ingestion Processed",
                "reply_to": "no-reply@quivr.app",
                "html": body,
                "attachments": [{"filename": name, "content": list(f.read())}],
            }
            logger.info(f"Sending email to {mail_to} with file {name}")
            send_email(params)

    async def uploadfile_to_file(self, uploadFile: UploadFile):
        # Transform the UploadFile object to a file object with same name and content
        tmp_file = NamedTemporaryFile(delete=False)
        tmp_file.write(uploadFile.file.read())
        tmp_file.flush()  # Make sure all data is written to disk
        return tmp_file

    async def create_and_upload_processed_file(
        self, processed_content: str, original_filename: str, file_description: str
    ) -> dict:
        """Handles creation and uploading of the processed file."""
        content_io = BytesIO(processed_content.encode("utf-8"))
        content_io.seek(0)

        new_filename = (
            original_filename.split(".")[0]
            + "_"
            + file_description.lower().replace(" ", "_")
            + "_"
            + str(random.randint(1000, 9999))
            + ".txt"
        )

        file_to_upload = UploadFile(
            filename=new_filename,
            file=content_io,
            headers={"content-type": "text/plain"},
        )

        if self.send_file_email:
            await self.send_output_by_email(
                file_to_upload,
                new_filename,
                f"{file_description} of {original_filename}",
            )

        # Reset to start of file before upload
        file_to_upload.file.seek(0)

        await upload_file(
            uploadFile=file_to_upload,
            brain_id=self.brain_id,
            current_user=self.current_user,
            chat_id=None,
        )

        return {"message": f"{file_description} generated successfully"}

@ -0,0 +1,100 @@
import tempfile

from langchain.chains import (
    MapReduceDocumentsChain,
    ReduceDocumentsChain,
    StuffDocumentsChain,
)
from langchain.chains.llm import LLMChain
from langchain_community.chat_models import ChatLiteLLM
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import CharacterTextSplitter
from logger import get_logger
from modules.ingestion.ito.ito import ITO

logger = get_logger(__name__)


class SummaryIngestion(ITO):
    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(
            **kwargs,
        )

    async def process_ingestion(self):
        # Write the uploaded file to a temporary file, then pass its path to the loader
        tmp_file = tempfile.NamedTemporaryFile(delete=False)
        tmp_file.write(self.uploadFile.file.read())

        # Now pass the path of the temporary file to the loader
        loader = UnstructuredPDFLoader(tmp_file.name)
        tmp_file.close()

        data = loader.load()

        llm = ChatLiteLLM(model="gpt-3.5-turbo")

        # Map
        map_template = """The following is a set of documents
        {docs}
        Based on this list of docs, please identify the main themes
        Helpful Answer:"""
        map_prompt = PromptTemplate.from_template(map_template)
        map_chain = LLMChain(llm=llm, prompt=map_prompt)

        # Reduce
        reduce_template = """The following is a set of summaries:
        {docs}
        Take these and distill it into a final, consolidated summary of the main themes.
        Helpful Answer:"""
        reduce_prompt = PromptTemplate.from_template(reduce_template)

        # Run chain
        reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

        # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
        combine_documents_chain = StuffDocumentsChain(
            llm_chain=reduce_chain, document_variable_name="docs"
        )

        # Combines and iteratively reduces the mapped documents
        reduce_documents_chain = ReduceDocumentsChain(
            # This is the final chain that is called.
            combine_documents_chain=combine_documents_chain,
            # If documents exceed context for `StuffDocumentsChain`
            collapse_documents_chain=combine_documents_chain,
            # The maximum number of tokens to group documents into.
            token_max=4000,
        )

        # Combining documents by mapping a chain over them, then combining results
        map_reduce_chain = MapReduceDocumentsChain(
            # Map chain
            llm_chain=map_chain,
            # Reduce chain
            reduce_documents_chain=reduce_documents_chain,
            # The variable name in the llm_chain to put the documents in
            document_variable_name="docs",
            # Return the results of the map steps in the output
            return_intermediate_steps=False,
        )

        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=1000, chunk_overlap=0
        )
        split_docs = text_splitter.split_documents(data)

        content = map_reduce_chain.run(split_docs)

        return await self.create_and_upload_processed_file(
            content, self.uploadFile.filename, "Summary"
        )

@ -0,0 +1 @@
from .ingestion_interface import IngestionInterface

@ -0,0 +1,16 @@
from abc import ABC, abstractmethod
from typing import List

from modules.ingestion.entity.ingestion import IngestionEntity


class IngestionInterface(ABC):
    @abstractmethod
    def get_all_ingestions(self) -> List[IngestionEntity]:
        """
        Get all the available ingestion types.
        """
        pass

@ -0,0 +1,30 @@
from models.settings import get_supabase_client
from modules.ingestion.entity.ingestion import IngestionEntity
from modules.ingestion.repository.ingestion_interface import IngestionInterface


class Ingestion(IngestionInterface):
    def __init__(self):
        supabase_client = get_supabase_client()
        self.db = supabase_client

    def get_all_ingestions(self):
        response = self.db.from_("ingestions").select("*").execute()
        if response.data:
            return response.data
        return []

    def get_ingestion_by_id(self, ingestion_id) -> IngestionEntity:
        response = (
            self.db.from_("ingestions")
            .select("*")
            .filter("id", "eq", ingestion_id)
            .execute()
        )
        if response.data:
            return IngestionEntity(**response.data[0])
        return None
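
Used directly, the service behaves roughly as follows (the UUID is a placeholder):

ingestion_service = Ingestion()

# All rows from the "ingestions" table, returned as plain dicts
all_ingestions = ingestion_service.get_all_ingestions()

# A single row validated into an IngestionEntity, or None if nothing matches
entity = ingestion_service.get_ingestion_by_id("00000000-0000-0000-0000-000000000000")
if entity is not None:
    print(entity.name, entity.brain_id_required, entity.file_1_required)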

@ -3,7 +3,7 @@ from typing import Optional
from uuid import UUID
from celery_worker import process_file_and_notify
from fastapi import APIRouter, Depends, HTTPException, Query, Request, UploadFile
from fastapi import APIRouter, Depends, HTTPException, Query, UploadFile
from logger import get_logger
from middlewares.auth import AuthBearer, get_current_user
from models import UserUsage
@ -38,7 +38,6 @@ async def healthz():
@upload_router.post("/upload", dependencies=[Depends(AuthBearer())], tags=["Upload"])
async def upload_file(
    request: Request,
    uploadFile: UploadFile,
    brain_id: UUID = Query(..., description="The ID of the brain"),
    chat_id: Optional[UUID] = Query(None, description="The ID of the chat"),
@ -47,7 +46,8 @@ async def upload_file(
    validate_brain_authorization(
        brain_id, current_user.id, [RoleEnum.Editor, RoleEnum.Owner]
    )
    uploadFile.file.seek(0)
    logger.info(f"Uploading file {uploadFile.filename} to brain {brain_id}")
    user_daily_usage = UserUsage(
        id=current_user.id,
        email=current_user.email,
@ -72,6 +72,8 @@ async def upload_file(
    )
    file_content = await uploadFile.read()
    logger.info(f"File {uploadFile.filename} read successfully")
    logger.info(f"Content length: {len(file_content)}")
    filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
    try:

@ -1,4 +1,5 @@
import hashlib
from io import BytesIO
from fastapi import UploadFile
@ -17,6 +18,10 @@ def convert_bytes(bytes, precision=2):
def get_file_size(file: UploadFile):
    if isinstance(file.file, BytesIO):
        # If the file object is a BytesIO object, get the size of the bytes data
        file_size = len(file.file.getvalue())
        return file_size
    # move the cursor to the end of the file
    file.file._file.seek(0, 2)  # pyright: ignore reportPrivateUsage=none
    file_size = (

@ -0,0 +1,69 @@
create table "public"."ingestions" (
    "name" text,
    "id" uuid not null default gen_random_uuid()
);
alter table "public"."ingestions" enable row level security;
CREATE UNIQUE INDEX ingestions_pkey ON public.ingestions USING btree (id);
alter table "public"."ingestions" add constraint "ingestions_pkey" PRIMARY KEY using index "ingestions_pkey";
grant delete on table "public"."ingestions" to "anon";
grant insert on table "public"."ingestions" to "anon";
grant references on table "public"."ingestions" to "anon";
grant select on table "public"."ingestions" to "anon";
grant trigger on table "public"."ingestions" to "anon";
grant truncate on table "public"."ingestions" to "anon";
grant update on table "public"."ingestions" to "anon";
grant delete on table "public"."ingestions" to "authenticated";
grant insert on table "public"."ingestions" to "authenticated";
grant references on table "public"."ingestions" to "authenticated";
grant select on table "public"."ingestions" to "authenticated";
grant trigger on table "public"."ingestions" to "authenticated";
grant truncate on table "public"."ingestions" to "authenticated";
grant update on table "public"."ingestions" to "authenticated";
grant delete on table "public"."ingestions" to "service_role";
grant insert on table "public"."ingestions" to "service_role";
grant references on table "public"."ingestions" to "service_role";
grant select on table "public"."ingestions" to "service_role";
grant trigger on table "public"."ingestions" to "service_role";
grant truncate on table "public"."ingestions" to "service_role";
grant update on table "public"."ingestions" to "service_role";
create policy "INGESTION"
on "public"."ingestions"
as permissive
for all
to service_role;
create policy "INTEGRATIONS"
on "public"."integrations"
as permissive
for all
to service_role;

@ -0,0 +1,5 @@
alter table "public"."ingestions" add column "brain_id_required" boolean not null default true;
alter table "public"."ingestions" add column "file_1_required" boolean not null default false;

@ -0,0 +1,3 @@
alter table "public"."ingestions" add column "url_required" boolean default false;

File diff suppressed because one or more lines are too long