feat: get files from storage (#1205)

* 🌱 list files in storage & generate signed URL

*  add knowledge router

* 🗃️ add knowledge tables

*  add knowledge during upload

* 🚧 add knowledge and brain_knowledge models and repo

* 🔥 remove brain_knowledge

*  add upload to knowledge table

*  add crawl to knowledge table

* ✏️ fixes
This commit is contained in:
Zineb El Bachiri 2023-09-20 09:35:37 +02:00 committed by GitHub
parent 37935c59ca
commit be7acf052b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 393 additions and 17 deletions

View File

@ -18,6 +18,7 @@ from routes.brain_routes import brain_router
from routes.chat_routes import chat_router
from routes.crawl_routes import crawl_router
from routes.explore_routes import explore_router
from routes.knowledge_routes import knowledge_router
from routes.misc_routes import misc_router
from routes.notification_routes import notification_router
from routes.prompt_routes import prompt_router
@ -56,6 +57,7 @@ app.include_router(api_key_router)
app.include_router(subscription_router)
app.include_router(prompt_router)
app.include_router(notification_router)
app.include_router(knowledge_router)
@app.exception_handler(HTTPException)

View File

@ -16,7 +16,7 @@ class Brain(BaseModel):
name: Optional[str] = "Default brain"
description: Optional[str] = "This is a description"
status: Optional[str] = "private"
model: Optional[str]
model: Optional[str] = None
temperature: Optional[float] = 0.0
max_tokens: Optional[int] = 256
openai_api_key: Optional[str] = None

View File

@ -239,3 +239,15 @@ class Repository(ABC):
@abstractmethod
def get_notifications_by_chat_id(self, chat_id: UUID):
pass
@abstractmethod
def insert_knowledge(self, brain_id: UUID):
    """Insert a row into the knowledge table.

    NOTE(review): the Supabase implementation takes a
    CreateKnowledgeProperties payload, not a bare brain_id — the abstract
    signature disagrees with the concrete one; confirm intended contract.
    """
    pass
@abstractmethod
def remove_knowledge_by_id(self, knowledge_id: UUID):
    """Delete the knowledge row with the given id."""
    pass
@abstractmethod
def get_knowledge_by_id(self, knowledge_id: UUID):
    """Fetch the knowledge row with the given id."""
    pass

View File

@ -1,9 +1,11 @@
from models.databases.supabase.api_key_handler import ApiKeyHandler
from models.databases.supabase.brains import Brain
from models.databases.supabase.brains_subscription_invitations import BrainSubscription
from models.databases.supabase.brains_subscription_invitations import \
BrainSubscription
from models.databases.supabase.chats import Chats
from models.databases.supabase.files import File
from models.databases.supabase.knowledge import Knowledges
from models.databases.supabase.notifications import Notifications
from models.databases.supabase.prompts import Prompts
from models.databases.supabase.user_usage import UserUsage
from models.databases.supabase.vectors import Vector
from models.databases.supabase.notifications import Notifications

View File

@ -0,0 +1,80 @@
from typing import Optional
from uuid import UUID
from fastapi import HTTPException
from models.databases.repository import Repository
from models.knowledge import Knowledge
from pydantic import BaseModel
class CreateKnowledgeProperties(BaseModel):
    """Payload for inserting a row into the `knowledge` table."""

    brain_id: UUID
    file_name: Optional[str] = None
    url: Optional[str] = None
    extension: str = "txt"

    def dict(self, *args, **kwargs):
        """Serialize the model, coercing brain_id to a plain string.

        The UUID is not JSON-serializable, so it must be stringified before
        being handed to the Supabase client.
        """
        serialized = super().dict(*args, **kwargs)
        serialized["brain_id"] = str(serialized.get("brain_id"))
        return serialized
class DeleteKnowledgeResponse(BaseModel):
    """Response payload returned after deleting a knowledge row."""

    # NOTE(review): default is "delete", but the repository always sets
    # "deleted" explicitly — confirm which value clients expect.
    status: str = "delete"
    # Id of the knowledge row that was removed.
    knowledge_id: UUID
class Knowledges(Repository):
    """Supabase-backed repository for the `knowledge` table."""

    def __init__(self, supabase_client):
        self.db = supabase_client

    def insert_knowledge(self, knowledge: CreateKnowledgeProperties) -> Knowledge:
        """
        Add a knowledge

        Args:
            knowledge (CreateKnowledgeProperties): row to insert

        Returns:
            Knowledge: the created row, including its generated id
        """
        response = (self.db.from_("knowledge").insert(knowledge.dict()).execute()).data
        return Knowledge(**response[0])

    def remove_knowledge_by_id(
        self,
        knowledge_id: UUID,
    ) -> DeleteKnowledgeResponse:
        """
        Args:
            knowledge_id (UUID): The id of the knowledge

        Returns:
            DeleteKnowledgeResponse: Status message

        Raises:
            HTTPException: 404 when no row matches knowledge_id
        """
        response = (
            self.db.from_("knowledge")
            .delete()
            # Primary-key column is `id` (see the knowledge table DDL);
            # stringify the UUID for consistency with the other queries.
            .filter("id", "eq", str(knowledge_id))
            .execute()
            .data
        )

        if response == []:
            raise HTTPException(404, "Knowledge not found")

        return DeleteKnowledgeResponse(
            status="deleted",
            knowledge_id=knowledge_id,
        )

    def get_knowledge_by_id(self, knowledge_id: UUID) -> Knowledge:
        """
        Get a knowledge by its id

        Args:
            knowledge_id (UUID): The id of the knowledge

        Raises:
            HTTPException: 404 when no row matches knowledge_id
        """
        knowledge = (
            self.db.from_("knowledge")
            .select("*")
            # BUG FIX: the table has no `knowledge_id` column — its primary
            # key is `id` (the delete path above already filters on `id`).
            .filter("id", "eq", str(knowledge_id))
            .execute()
        ).data

        # Robustness: surface a 404 instead of an IndexError on a miss.
        if knowledge == []:
            raise HTTPException(404, "Knowledge not found")

        return Knowledge(**knowledge[0])

View File

@ -5,6 +5,7 @@ from models.databases.supabase import (
BrainSubscription,
Chats,
File,
Knowledges,
Notifications,
Prompts,
UserUsage,
@ -24,6 +25,7 @@ class SupabaseDB(
Vector,
Prompts,
Notifications,
Knowledges,
):
def __init__(self, supabase_client):
self.db = supabase_client
@ -36,3 +38,4 @@ class SupabaseDB(
Vector.__init__(self, supabase_client)
Prompts.__init__(self, supabase_client)
Notifications.__init__(self, supabase_client)
Knowledges.__init__(self, supabase_client)

View File

@ -0,0 +1,16 @@
from uuid import UUID
from pydantic import BaseModel
class FileInStorage(BaseModel):
    """A file entry as returned by the storage backend.

    The backend payload uses PascalCase keys (`Id`, `Key`); the snake_case
    properties below expose them under conventional Python names.
    """

    # Storage object id (PascalCase to match the raw payload).
    Id: UUID
    # Storage object key/path (PascalCase to match the raw payload).
    Key: str

    @property
    def id(self) -> UUID:
        """Snake-case accessor for `Id`."""
        return self.Id

    @property
    def key(self) -> str:
        """Snake-case accessor for `Key`."""
        return self.Key

View File

@ -0,0 +1,12 @@
from typing import Optional
from uuid import UUID
from pydantic import BaseModel
class Knowledge(BaseModel):
    """A row of the `knowledge` table: a file or URL attached to a brain."""

    id: UUID
    brain_id: UUID
    # Exactly one of file_name / url is set (enforced by a DB CHECK constraint).
    file_name: Optional[str] = None
    url: Optional[str] = None
    # File extension (or "html" for crawled URLs); defaults to plain text.
    extension: str = "txt"

View File

@ -0,0 +1,21 @@
from multiprocessing import get_logger
from models import get_supabase_client
from supabase.client import Client
logger = get_logger()
SIGNED_URL_EXPIRATION_PERIOD_IN_SECONDS = 600
def generate_file_signed_url(path):
    """Create a time-limited signed download URL for *path* in the `quivr` bucket.

    Args:
        path: object path inside the bucket, e.g. "<brain_id>/<file_name>".

    Returns:
        The Supabase response containing the signed URL, or None when the
        storage call fails (best-effort: errors are logged, not raised).
    """
    supabase_client: Client = get_supabase_client()

    try:
        response = supabase_client.storage.from_("quivr").create_signed_url(
            path, SIGNED_URL_EXPIRATION_PERIOD_IN_SECONDS
        )
        # BUG FIX: the original passed `response` as a bare extra argument
        # with no %-placeholder, which the logging module cannot format.
        logger.info("Signed URL response: %s", response)
        return response
    except Exception as e:
        logger.error(e)

View File

@ -0,0 +1,17 @@
from multiprocessing import get_logger
from models import get_supabase_client
from supabase.client import Client
logger = get_logger()
def list_files_from_storage(path):
    """List the objects under *path* in the `quivr` storage bucket.

    Args:
        path: folder prefix inside the bucket (e.g. a brain id).

    Returns:
        The Supabase list response, or None when the storage call fails
        (best-effort: errors are logged, not raised).
    """
    supabase_client: Client = get_supabase_client()

    try:
        response = supabase_client.storage.from_("quivr").list(path)
        # BUG FIX: the original passed `response` as a bare extra argument
        # with no %-placeholder, which the logging module cannot format.
        logger.info("Storage list response: %s", response)
        return response
    except Exception as e:
        logger.error(e)

View File

@ -1,7 +1,6 @@
import json
from multiprocessing import get_logger
from httpx import Response
from langchain.pydantic_v1 import Field
from langchain.schema import Document
from models import get_supabase_client
@ -10,7 +9,7 @@ from supabase.client import Client
logger = get_logger()
def upload_file_storage(file, file_identifier: str) -> Response:
def upload_file_storage(file, file_identifier: str):
supabase_client: Client = get_supabase_client()
# res = supabase_client.storage.create_bucket("quivr")
response = None
@ -20,8 +19,7 @@ def upload_file_storage(file, file_identifier: str) -> Response:
return response
except Exception as e:
logger.error(e)
print(e)
return response
raise e
class DocumentSerializable(Document):

View File

View File

@ -0,0 +1,14 @@
from logger import get_logger
from models.databases.supabase.knowledge import CreateKnowledgeProperties
from models.settings import get_supabase_db
logger = get_logger(__name__)
def add_knowledge(knowledge_to_add: CreateKnowledgeProperties):
    """Insert a knowledge row and return the created record.

    Args:
        knowledge_to_add: properties of the knowledge to persist.

    Returns:
        The created Knowledge (with its generated id).
    """
    supabase_db = get_supabase_db()

    knowledge = supabase_db.insert_knowledge(knowledge_to_add)

    # Lazy %-args (and no stray space, unlike the original f-string).
    logger.info("Knowledge %s added successfully", knowledge.id)
    return knowledge

View File

@ -5,12 +5,16 @@ from auth import AuthBearer, get_current_user
from celery_worker import process_crawl_and_notify
from crawl.crawler import CrawlWebsite
from fastapi import APIRouter, Depends, Query, Request
from logger import get_logger
from models import Brain, UserIdentity, UserUsage
from models.databases.supabase.knowledge import CreateKnowledgeProperties
from models.databases.supabase.notifications import CreateNotificationProperties
from models.notifications import NotificationsStatusEnum
from repository.knowledge.add_knowledge import add_knowledge
from repository.notification.add_notification import add_notification
from utils.file import convert_bytes
logger = get_logger(__name__)
crawl_router = APIRouter()
@ -64,6 +68,16 @@ async def crawl_endpoint(
status=NotificationsStatusEnum.Pending,
)
)
knowledge_to_add = CreateKnowledgeProperties(
brain_id=brain_id,
url=crawl_website.url,
extension="html",
)
added_knowledge = add_knowledge(knowledge_to_add)
logger.info(f"Knowledge {added_knowledge} added successfully")
process_crawl_and_notify.delay(
crawl_website_url=crawl_website.url,
enable_summarization=enable_summarization,

View File

@ -0,0 +1,110 @@
from uuid import UUID
from auth import AuthBearer, get_current_user
from fastapi import APIRouter, Depends, Query
from logger import get_logger
from models import Brain, UserIdentity, get_supabase_db
from repository.files.generate_file_signed_url import generate_file_signed_url
from repository.files.list_files import list_files_from_storage
from routes.authorizations.brain_authorization import (
RoleEnum,
has_brain_authorization,
validate_brain_authorization,
)
knowledge_router = APIRouter()
logger = get_logger(__name__)
@knowledge_router.get(
    "/knowledge/", dependencies=[Depends(AuthBearer())], tags=["Knowledge"]
)
async def list_knowledge_in_brain_endpoint(
    brain_id: UUID = Query(..., description="The ID of the brain"),
    current_user: UserIdentity = Depends(get_current_user),
):
    """
    Retrieve and list all the knowledge in a brain.

    Returns a dict with the brain's unique files, sorted by size descending.
    """
    validate_brain_authorization(brain_id=brain_id, user_id=current_user.id)

    brain = Brain(id=brain_id)
    files = list_files_from_storage(str(brain_id))
    # BUG FIX: `files` was passed as a bare extra argument with no
    # %-placeholder, which the logging module cannot format.
    logger.info("List of files from storage: %s", files)

    # TO DO: Retrieve from Knowledge table instead of storage or vectors
    unique_data = brain.get_unique_brain_files()
    logger.debug("Unique brain files: %s", unique_data)  # was a debug print()

    unique_data.sort(key=lambda x: int(x["size"]), reverse=True)

    return {"documents": unique_data}
@knowledge_router.delete(
    "/knowledge/{file_name}/",
    dependencies=[
        Depends(AuthBearer()),
        Depends(has_brain_authorization(RoleEnum.Owner)),
    ],
    tags=["Knowledge"],
)
async def delete_endpoint(
    file_name: str,
    current_user: UserIdentity = Depends(get_current_user),
    brain_id: UUID = Query(..., description="The ID of the brain"),
):
    """
    Delete a specific user file by file name.
    """
    validate_brain_authorization(brain_id=brain_id, user_id=current_user.id)

    # Remove the file from the brain, then report who deleted what.
    target_brain = Brain(id=brain_id)
    target_brain.delete_file_from_brain(file_name)

    message = (
        f"{file_name} of brain {brain_id} has been deleted by user {current_user.email}."
    )
    return {"message": message}
@knowledge_router.get(
    "/explore/{file_name}/signed_download_url",
    dependencies=[Depends(AuthBearer())],
    tags=["Knowledge"],
)
async def generate_signed_url_endpoint(
    file_name: str, current_user: UserIdentity = Depends(get_current_user)
):
    """
    Generate a signed url to download the file from storage.

    Looks up the brain owning the file via its vectors, checks the caller's
    authorization on that brain, then returns the signed URL payload.
    """
    # check if user has the right to get the file: add brain_id to the query
    supabase_db = get_supabase_db()
    response = supabase_db.get_vectors_by_file_name(file_name)
    documents = response.data

    if len(documents) == 0:
        return {"documents": []}

    related_brain_id = (
        documents[0]["brains_vectors"][0]["brain_id"]
        if len(documents[0]["brains_vectors"]) != 0
        else None
    )
    if related_brain_id is None:
        raise Exception(f"File {file_name} has no brain_id associated with it")

    # SECURITY FIX: authorize the caller *before* minting a signed URL —
    # the original validated only after the URL had been generated.
    validate_brain_authorization(brain_id=related_brain_id, user_id=current_user.id)

    file_path_in_storage = f"{related_brain_id}/{file_name}"
    logger.info("File path in storage: %s", file_path_in_storage)  # was print()

    file_signed_url = generate_file_signed_url(file_path_in_storage)
    logger.info("File signed URL: %s", file_signed_url)  # was print()

    return file_signed_url

View File

@ -1,25 +1,27 @@
import os
from typing import Optional
from uuid import UUID
from auth import AuthBearer, get_current_user
from celery_worker import process_file_and_notify
from fastapi import APIRouter, Depends, Query, Request, UploadFile
from fastapi import APIRouter, Depends, HTTPException, Query, Request, UploadFile
from logger import get_logger
from models import Brain, UserIdentity, UserUsage
from models.databases.supabase.notifications import (
CreateNotificationProperties,
)
from models.databases.supabase.knowledge import CreateKnowledgeProperties
from models.databases.supabase.notifications import CreateNotificationProperties
from models.notifications import NotificationsStatusEnum
from repository.brain import get_brain_details
from repository.files.upload_file import upload_file_storage
from repository.knowledge.add_knowledge import add_knowledge
from repository.notification.add_notification import add_notification
from repository.user_identity import get_user_identity
from utils.file import convert_bytes, get_file_size
from routes.authorizations.brain_authorization import (
RoleEnum,
validate_brain_authorization,
)
from utils.file import convert_bytes, get_file_size
logger = get_logger(__name__)
upload_router = APIRouter()
@ -78,10 +80,34 @@ async def upload_file(
openai_api_key = get_user_identity(current_user.id).openai_api_key
file_content = await uploadFile.read()
# filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
upload_file_storage(file_content, filename_with_brain_id)
try:
fileInStorage = upload_file_storage(file_content, filename_with_brain_id)
logger.info(f"File {fileInStorage} uploaded successfully")
except Exception as e:
if "The resource already exists" in str(e):
raise HTTPException(
status_code=403,
detail=f"File {uploadFile.filename} already exists in storage.",
)
else:
raise HTTPException(
status_code=500, detail="Failed to upload file to storage."
)
knowledge_to_add = CreateKnowledgeProperties(
brain_id=brain_id,
file_name=uploadFile.filename,
extension=os.path.splitext(
uploadFile.filename # pyright: ignore reportPrivateUsage=none
)[-1].lower(),
)
added_knowledge = add_knowledge(knowledge_to_add)
logger.info(f"Knowledge {added_knowledge} added successfully")
process_file_and_notify.delay(
file_name=filename_with_brain_id,
file_original_name=uploadFile.filename,

View File

@ -0,0 +1,29 @@
BEGIN;

-- knowledge table
-- One row per piece of knowledge attached to a brain: either an uploaded
-- file (file_name) or a crawled page (url) — exactly one of the two.
CREATE TABLE IF NOT EXISTS knowledge (
  id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
  file_name TEXT,
  url TEXT,
  brain_id UUID NOT NULL REFERENCES brains(brain_id),
  extension TEXT NOT NULL,
  -- XOR constraint: a knowledge entry is a file or a URL, never both / neither.
  CHECK ((file_name IS NOT NULL AND url IS NULL) OR (file_name IS NULL AND url IS NOT NULL))
);

-- knowledge_vectors table
-- Join table linking knowledge entries to their embedding vectors;
-- embedding_model is part of the key so vectors produced by different
-- models can coexist for the same knowledge entry.
CREATE TABLE IF NOT EXISTS knowledge_vectors (
  knowledge_id UUID NOT NULL REFERENCES knowledge(id),
  vector_id UUID NOT NULL REFERENCES vectors(id),
  embedding_model TEXT NOT NULL,
  PRIMARY KEY (knowledge_id, vector_id, embedding_model)
);

-- Update migrations table
-- Idempotent: record this migration only if it has not been applied yet.
INSERT INTO migrations (name)
SELECT '202309151054032_add_knowledge_tables'
WHERE NOT EXISTS (
  SELECT 1 FROM migrations WHERE name = '202309151054032_add_knowledge_tables'
);

COMMIT;

View File

@ -235,6 +235,26 @@ CREATE TABLE IF NOT EXISTS user_settings (
max_brain_size INT DEFAULT 1000000
);
-- knowledge table
-- One row per piece of knowledge attached to a brain: either an uploaded
-- file (file_name) or a crawled page (url) — exactly one of the two.
CREATE TABLE IF NOT EXISTS knowledge (
  id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
  file_name TEXT,
  url TEXT,
  brain_id UUID NOT NULL REFERENCES brains(brain_id),
  extension TEXT NOT NULL,
  -- XOR constraint: a knowledge entry is a file or a URL, never both / neither.
  CHECK ((file_name IS NOT NULL AND url IS NULL) OR (file_name IS NULL AND url IS NOT NULL))
);

-- knowledge_vectors table
-- Join table linking knowledge entries to their embedding vectors;
-- embedding_model is part of the key so vectors produced by different
-- models can coexist for the same knowledge entry.
CREATE TABLE IF NOT EXISTS knowledge_vectors (
  knowledge_id UUID NOT NULL REFERENCES knowledge(id),
  vector_id UUID NOT NULL REFERENCES vectors(id),
  embedding_model TEXT NOT NULL,
  PRIMARY KEY (knowledge_id, vector_id, embedding_model)
);
insert into
storage.buckets (id, name)
values
@ -249,9 +269,9 @@ CREATE POLICY "Access Quivr Storage 1jccrwz_2" ON storage.objects FOR UPDATE TO
CREATE POLICY "Access Quivr Storage 1jccrwz_3" ON storage.objects FOR DELETE TO anon USING (bucket_id = 'quivr');
INSERT INTO migrations (name)
SELECT '202309157004032_add_sha1_column'
SELECT '202309151054032_add_knowledge_tables'
WHERE NOT EXISTS (
SELECT 1 FROM migrations WHERE name = '202309157004032_add_sha1_column'
SELECT 1 FROM migrations WHERE name = '202309151054032_add_knowledge_tables'
);