feat: get files from storage (#1205)

* 🌱 list files in storage & generate signed URL

* add knowledge router

* 🗃️ add knowledge tables

* add knowledge during upload

* 🚧 add knowledge and brain_knowledge models and repo

* 🔥 remove brain_knowledge

* add upload to knowledge table

* add crawl to knowledge table

* ✏️ fixes
Zineb El Bachiri 2023-09-20 09:35:37 +02:00 committed by GitHub
parent 37935c59ca
commit be7acf052b
18 changed files with 393 additions and 17 deletions

View File

@@ -18,6 +18,7 @@ from routes.brain_routes import brain_router
 from routes.chat_routes import chat_router
 from routes.crawl_routes import crawl_router
 from routes.explore_routes import explore_router
+from routes.knowledge_routes import knowledge_router
 from routes.misc_routes import misc_router
 from routes.notification_routes import notification_router
 from routes.prompt_routes import prompt_router
@@ -56,6 +57,7 @@ app.include_router(api_key_router)
 app.include_router(subscription_router)
 app.include_router(prompt_router)
 app.include_router(notification_router)
+app.include_router(knowledge_router)

 @app.exception_handler(HTTPException)

View File

@@ -16,7 +16,7 @@ class Brain(BaseModel):
     name: Optional[str] = "Default brain"
     description: Optional[str] = "This is a description"
     status: Optional[str] = "private"
-    model: Optional[str]
+    model: Optional[str] = None
     temperature: Optional[float] = 0.0
     max_tokens: Optional[int] = 256
     openai_api_key: Optional[str] = None

View File

@@ -239,3 +239,15 @@ class Repository(ABC):
     @abstractmethod
     def get_notifications_by_chat_id(self, chat_id: UUID):
         pass
+
+    @abstractmethod
+    def insert_knowledge(self, brain_id: UUID):
+        pass
+
+    @abstractmethod
+    def remove_knowledge_by_id(self, knowledge_id: UUID):
+        pass
+
+    @abstractmethod
+    def get_knowledge_by_id(self, knowledge_id: UUID):
+        pass

View File

@@ -1,9 +1,11 @@
 from models.databases.supabase.api_key_handler import ApiKeyHandler
 from models.databases.supabase.brains import Brain
-from models.databases.supabase.brains_subscription_invitations import BrainSubscription
+from models.databases.supabase.brains_subscription_invitations import \
+    BrainSubscription
 from models.databases.supabase.chats import Chats
 from models.databases.supabase.files import File
+from models.databases.supabase.knowledge import Knowledges
+from models.databases.supabase.notifications import Notifications
 from models.databases.supabase.prompts import Prompts
 from models.databases.supabase.user_usage import UserUsage
 from models.databases.supabase.vectors import Vector
-from models.databases.supabase.notifications import Notifications

View File

@@ -0,0 +1,80 @@
+from typing import Optional
+from uuid import UUID
+
+from fastapi import HTTPException
+from models.databases.repository import Repository
+from models.knowledge import Knowledge
+from pydantic import BaseModel
+
+
+class CreateKnowledgeProperties(BaseModel):
+    brain_id: UUID
+    file_name: Optional[str] = None
+    url: Optional[str] = None
+    extension: str = "txt"
+
+    def dict(self, *args, **kwargs):
+        knowledge_dict = super().dict(*args, **kwargs)
+        knowledge_dict["brain_id"] = str(knowledge_dict.get("brain_id"))
+        return knowledge_dict
+
+
+class DeleteKnowledgeResponse(BaseModel):
+    status: str = "delete"
+    knowledge_id: UUID
+
+
+class Knowledges(Repository):
+    def __init__(self, supabase_client):
+        self.db = supabase_client
+
+    def insert_knowledge(self, knowledge: CreateKnowledgeProperties) -> Knowledge:
+        """
+        Add a knowledge
+        """
+        response = (self.db.from_("knowledge").insert(knowledge.dict()).execute()).data
+        return Knowledge(**response[0])
+
+    def remove_knowledge_by_id(
+        # todo: remove brain
+        self,
+        knowledge_id: UUID,
+    ) -> DeleteKnowledgeResponse:
+        """
+        Args:
+            knowledge_id (UUID): The id of the knowledge
+
+        Returns:
+            str: Status message
+        """
+        response = (
+            self.db.from_("knowledge")
+            .delete()
+            .filter("id", "eq", knowledge_id)
+            .execute()
+            .data
+        )
+
+        if response == []:
+            raise HTTPException(404, "Knowledge not found")
+
+        return DeleteKnowledgeResponse(
+            # change to response[0].brain_id and knowledge_id[0].brain_id
+            status="deleted",
+            knowledge_id=knowledge_id,
+        )
+
+    def get_knowledge_by_id(self, knowledge_id: UUID) -> Knowledge:
+        """
+        Get a knowledge by its id
+
+        Args:
+            knowledge_id (UUID): The id of the knowledge
+        """
+        knowledge = (
+            self.db.from_("knowledge")
+            .select("*")
+            .filter("knowledge_id", "eq", str(knowledge_id))
+            .execute()
+        ).data
+
+        return Knowledge(**knowledge[0])
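
For illustration only, a minimal sketch of how the Knowledges repository above might be exercised through the shared SupabaseDB instance (assuming a configured Supabase backend; the brain UUID and file name are placeholders):

from uuid import UUID

from models.databases.supabase.knowledge import CreateKnowledgeProperties
from models.settings import get_supabase_db

supabase_db = get_supabase_db()

# Insert a file-backed knowledge entry for an existing brain (placeholder UUID).
knowledge = supabase_db.insert_knowledge(
    CreateKnowledgeProperties(
        brain_id=UUID("00000000-0000-0000-0000-000000000000"),  # placeholder
        file_name="report.pdf",
        extension=".pdf",
    )
)

# Remove it again by id.
supabase_db.remove_knowledge_by_id(knowledge.id)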

View File

@@ -5,6 +5,7 @@ from models.databases.supabase import (
     BrainSubscription,
     Chats,
     File,
+    Knowledges,
     Notifications,
     Prompts,
     UserUsage,
@@ -24,6 +25,7 @@ class SupabaseDB(
     Vector,
     Prompts,
     Notifications,
+    Knowledges,
 ):
     def __init__(self, supabase_client):
         self.db = supabase_client
@@ -36,3 +38,4 @@ class SupabaseDB(
         Vector.__init__(self, supabase_client)
         Prompts.__init__(self, supabase_client)
         Notifications.__init__(self, supabase_client)
+        Knowledges.__init__(self, supabase_client)

View File

@@ -0,0 +1,16 @@
+from uuid import UUID
+
+from pydantic import BaseModel
+
+
+class FileInStorage(BaseModel):
+    Id: UUID
+    Key: str
+
+    @property
+    def id(self) -> UUID:
+        return self.Id
+
+    @property
+    def key(self) -> str:
+        return self.Key

View File

@@ -0,0 +1,12 @@
+from typing import Optional
+from uuid import UUID
+
+from pydantic import BaseModel
+
+
+class Knowledge(BaseModel):
+    id: UUID
+    brain_id: UUID
+    file_name: Optional[str] = None
+    url: Optional[str] = None
+    extension: str = "txt"

View File

@@ -0,0 +1,21 @@
+from multiprocessing import get_logger
+
+from models import get_supabase_client
+from supabase.client import Client
+
+logger = get_logger()
+
+SIGNED_URL_EXPIRATION_PERIOD_IN_SECONDS = 600
+
+
+def generate_file_signed_url(path):
+    supabase_client: Client = get_supabase_client()
+
+    try:
+        response = supabase_client.storage.from_("quivr").create_signed_url(
+            path, SIGNED_URL_EXPIRATION_PERIOD_IN_SECONDS
+        )
+        logger.info("RESPONSE SIGNED URL", response)
+        return response
+    except Exception as e:
+        logger.error(e)
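
For illustration only, a short sketch of calling the helper above for a file stored under a brain's folder; the "<brain_id>/<file_name>" layout follows the upload route later in this change, and the id and file name are placeholders:

from repository.files.generate_file_signed_url import generate_file_signed_url

# Objects are stored under "<brain_id>/<file_name>" in the "quivr" bucket.
signed = generate_file_signed_url(
    "11111111-1111-1111-1111-111111111111/report.pdf"  # placeholder path
)
# The Supabase client's signed-URL payload; the link expires after 600 seconds here.
print(signed)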

View File

@@ -0,0 +1,17 @@
+from multiprocessing import get_logger
+
+from models import get_supabase_client
+from supabase.client import Client
+
+logger = get_logger()
+
+
+def list_files_from_storage(path):
+    supabase_client: Client = get_supabase_client()
+
+    try:
+        response = supabase_client.storage.from_("quivr").list(path)
+        logger.info("RESPONSE", response)
+        return response
+    except Exception as e:
+        logger.error(e)
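
A matching sketch for listing a brain's folder in storage (again a placeholder brain id; each entry is whatever object metadata the Supabase storage client returns):

from repository.files.list_files import list_files_from_storage

files = list_files_from_storage("11111111-1111-1111-1111-111111111111")  # placeholder
for f in files:
    print(f)  # object metadata as returned by the storage client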

View File

@@ -1,7 +1,6 @@
 import json
 from multiprocessing import get_logger

-from httpx import Response
 from langchain.pydantic_v1 import Field
 from langchain.schema import Document
 from models import get_supabase_client
@@ -10,7 +9,7 @@ from supabase.client import Client
 logger = get_logger()


-def upload_file_storage(file, file_identifier: str) -> Response:
+def upload_file_storage(file, file_identifier: str):
     supabase_client: Client = get_supabase_client()
     # res = supabase_client.storage.create_bucket("quivr")
     response = None
@@ -20,8 +19,7 @@ def upload_file_storage(file, file_identifier: str):
         return response
     except Exception as e:
         logger.error(e)
-        print(e)
-    return response
+        raise e


 class DocumentSerializable(Document):

View File

View File

@@ -0,0 +1,14 @@
+from logger import get_logger
+from models.databases.supabase.knowledge import CreateKnowledgeProperties
+from models.settings import get_supabase_db
+
+logger = get_logger(__name__)
+
+
+def add_knowledge(knowledge_to_add: CreateKnowledgeProperties):
+    supabase_db = get_supabase_db()
+
+    knowledge = supabase_db.insert_knowledge(knowledge_to_add)
+
+    logger.info(f"Knowledge {knowledge.id} added successfully")
+    return knowledge
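
For reference, a hedged sketch of the two ways this helper is invoked later in this change (a crawled URL versus an uploaded file); the brain id and names are placeholders, and the knowledge table's CHECK constraint further down expects exactly one of file_name or url to be set:

from uuid import UUID

from models.databases.supabase.knowledge import CreateKnowledgeProperties
from repository.knowledge.add_knowledge import add_knowledge

brain_id = UUID("00000000-0000-0000-0000-000000000000")  # placeholder

# Crawled page: url set, file_name left empty (mirrors the crawl route below).
add_knowledge(
    CreateKnowledgeProperties(brain_id=brain_id, url="https://example.com", extension="html")
)

# Uploaded file: file_name set, url left empty (mirrors the upload route below).
add_knowledge(
    CreateKnowledgeProperties(brain_id=brain_id, file_name="notes.txt", extension=".txt")
)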

View File

@@ -5,12 +5,16 @@ from auth import AuthBearer, get_current_user
 from celery_worker import process_crawl_and_notify
 from crawl.crawler import CrawlWebsite
 from fastapi import APIRouter, Depends, Query, Request
+from logger import get_logger
 from models import Brain, UserIdentity, UserUsage
+from models.databases.supabase.knowledge import CreateKnowledgeProperties
 from models.databases.supabase.notifications import CreateNotificationProperties
 from models.notifications import NotificationsStatusEnum
+from repository.knowledge.add_knowledge import add_knowledge
 from repository.notification.add_notification import add_notification
 from utils.file import convert_bytes

+logger = get_logger(__name__)
+
 crawl_router = APIRouter()
@@ -64,6 +68,16 @@ async def crawl_endpoint(
             status=NotificationsStatusEnum.Pending,
         )
     )
+
+    knowledge_to_add = CreateKnowledgeProperties(
+        brain_id=brain_id,
+        url=crawl_website.url,
+        extension="html",
+    )
+
+    added_knowledge = add_knowledge(knowledge_to_add)
+    logger.info(f"Knowledge {added_knowledge} added successfully")
+
     process_crawl_and_notify.delay(
         crawl_website_url=crawl_website.url,
         enable_summarization=enable_summarization,

View File

@@ -0,0 +1,110 @@
+from uuid import UUID
+
+from auth import AuthBearer, get_current_user
+from fastapi import APIRouter, Depends, Query
+from logger import get_logger
+from models import Brain, UserIdentity, get_supabase_db
+from repository.files.generate_file_signed_url import generate_file_signed_url
+from repository.files.list_files import list_files_from_storage
+from routes.authorizations.brain_authorization import (
+    RoleEnum,
+    has_brain_authorization,
+    validate_brain_authorization,
+)
+
+knowledge_router = APIRouter()
+logger = get_logger(__name__)
+
+
+@knowledge_router.get(
+    "/knowledge/", dependencies=[Depends(AuthBearer())], tags=["Knowledge"]
+)
+async def list_knowledge_in_brain_endpoint(
+    brain_id: UUID = Query(..., description="The ID of the brain"),
+    current_user: UserIdentity = Depends(get_current_user),
+):
+    """
+    Retrieve and list all the knowledge in a brain.
+    """
+    validate_brain_authorization(brain_id=brain_id, user_id=current_user.id)
+
+    brain = Brain(id=brain_id)
+
+    files = list_files_from_storage(str(brain_id))
+    logger.info("List of files from storage", files)
+
+    # TO DO: Retrieve from Knowledge table instead of storage or vectors
+    unique_data = brain.get_unique_brain_files()
+    print("UNIQUE DATA", unique_data)
+
+    unique_data.sort(key=lambda x: int(x["size"]), reverse=True)
+
+    return {"documents": unique_data}
+
+
+@knowledge_router.delete(
+    "/knowledge/{file_name}/",
+    dependencies=[
+        Depends(AuthBearer()),
+        Depends(has_brain_authorization(RoleEnum.Owner)),
+    ],
+    tags=["Knowledge"],
+)
+async def delete_endpoint(
+    file_name: str,
+    current_user: UserIdentity = Depends(get_current_user),
+    brain_id: UUID = Query(..., description="The ID of the brain"),
+):
+    """
+    Delete a specific user file by file name.
+    """
+    validate_brain_authorization(brain_id=brain_id, user_id=current_user.id)
+
+    brain = Brain(id=brain_id)
+    brain.delete_file_from_brain(file_name)
+
+    return {
+        "message": f"{file_name} of brain {brain_id} has been deleted by user {current_user.email}."
+    }
+
+
+@knowledge_router.get(
+    "/explore/{file_name}/signed_download_url",
+    dependencies=[Depends(AuthBearer())],
+    tags=["Knowledge"],
+)
+async def generate_signed_url_endpoint(
+    file_name: str, current_user: UserIdentity = Depends(get_current_user)
+):
+    """
+    Generate a signed url to download the file from storage.
+    """
+    # check if user has the right to get the file: add brain_id to the query
+    supabase_db = get_supabase_db()
+    response = supabase_db.get_vectors_by_file_name(file_name)
+    documents = response.data
+
+    if len(documents) == 0:
+        return {"documents": []}
+
+    related_brain_id = (
+        documents[0]["brains_vectors"][0]["brain_id"]
+        if len(documents[0]["brains_vectors"]) != 0
+        else None
+    )
+    if related_brain_id is None:
+        raise Exception(f"File {file_name} has no brain_id associated with it")
+
+    file_path_in_storage = f"{related_brain_id}/{file_name}"
+    print("FILE PATH IN STORAGE", file_path_in_storage)
+
+    file_signed_url = generate_file_signed_url(file_path_in_storage)
+    print("FILE SIGNED URL", file_signed_url)
+
+    validate_brain_authorization(brain_id=related_brain_id, user_id=current_user.id)
+
+    return file_signed_url
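
For illustration only, a sketch of calling the new endpoints over HTTP; the base URL, bearer token, brain id, and file name are placeholders, and the paths follow the route decorators above:

import requests

BASE_URL = "http://localhost:5050"  # placeholder
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder
BRAIN_ID = "00000000-0000-0000-0000-000000000000"  # placeholder

# List the files attached to a brain.
r = requests.get(f"{BASE_URL}/knowledge/", params={"brain_id": BRAIN_ID}, headers=HEADERS)
print(r.json()["documents"])

# Get a signed download URL for one of the files.
r = requests.get(f"{BASE_URL}/explore/report.pdf/signed_download_url", headers=HEADERS)
print(r.json())

# Delete a file from the brain (requires the Owner role).
r = requests.delete(f"{BASE_URL}/knowledge/report.pdf/", params={"brain_id": BRAIN_ID}, headers=HEADERS)
print(r.json()["message"])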

View File

@@ -1,25 +1,27 @@
+import os
 from typing import Optional
 from uuid import UUID

 from auth import AuthBearer, get_current_user
 from celery_worker import process_file_and_notify
-from fastapi import APIRouter, Depends, Query, Request, UploadFile
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, UploadFile
+from logger import get_logger
 from models import Brain, UserIdentity, UserUsage
-from models.databases.supabase.notifications import (
-    CreateNotificationProperties,
-)
+from models.databases.supabase.knowledge import CreateKnowledgeProperties
+from models.databases.supabase.notifications import CreateNotificationProperties
 from models.notifications import NotificationsStatusEnum
 from repository.brain import get_brain_details
 from repository.files.upload_file import upload_file_storage
+from repository.knowledge.add_knowledge import add_knowledge
 from repository.notification.add_notification import add_notification
 from repository.user_identity import get_user_identity
-from utils.file import convert_bytes, get_file_size
 from routes.authorizations.brain_authorization import (
     RoleEnum,
     validate_brain_authorization,
 )
+from utils.file import convert_bytes, get_file_size
+
+logger = get_logger(__name__)

 upload_router = APIRouter()
@@ -78,10 +80,34 @@ async def upload_file(
         openai_api_key = get_user_identity(current_user.id).openai_api_key

     file_content = await uploadFile.read()
-    # filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
     filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
-    upload_file_storage(file_content, filename_with_brain_id)
+
+    try:
+        fileInStorage = upload_file_storage(file_content, filename_with_brain_id)
+        logger.info(f"File {fileInStorage} uploaded successfully")
+    except Exception as e:
+        if "The resource already exists" in str(e):
+            raise HTTPException(
+                status_code=403,
+                detail=f"File {uploadFile.filename} already exists in storage.",
+            )
+        else:
+            raise HTTPException(
+                status_code=500, detail="Failed to upload file to storage."
+            )
+
+    knowledge_to_add = CreateKnowledgeProperties(
+        brain_id=brain_id,
+        file_name=uploadFile.filename,
+        extension=os.path.splitext(
+            uploadFile.filename  # pyright: ignore reportPrivateUsage=none
+        )[-1].lower(),
+    )
+
+    added_knowledge = add_knowledge(knowledge_to_add)
+    logger.info(f"Knowledge {added_knowledge} added successfully")
+
     process_file_and_notify.delay(
         file_name=filename_with_brain_id,
         file_original_name=uploadFile.filename,

View File

@@ -0,0 +1,29 @@
+BEGIN;
+
+-- knowledge table
+CREATE TABLE IF NOT EXISTS knowledge (
+  id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
+  file_name TEXT,
+  url TEXT,
+  brain_id UUID NOT NULL REFERENCES brains(brain_id),
+  extension TEXT NOT NULL,
+  CHECK ((file_name IS NOT NULL AND url IS NULL) OR (file_name IS NULL AND url IS NOT NULL))
+);
+
+-- knowledge_vectors table
+CREATE TABLE IF NOT EXISTS knowledge_vectors (
+  knowledge_id UUID NOT NULL REFERENCES knowledge(id),
+  vector_id UUID NOT NULL REFERENCES vectors(id),
+  embedding_model TEXT NOT NULL,
+  PRIMARY KEY (knowledge_id, vector_id, embedding_model)
+);
+
+-- Update migrations table
+INSERT INTO migrations (name)
+SELECT '202309151054032_add_knowledge_tables'
+WHERE NOT EXISTS (
+  SELECT 1 FROM migrations WHERE name = '202309151054032_add_knowledge_tables'
+);
+
+COMMIT;
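
The knowledge_vectors join table is not populated by the code in this change; for illustration only, a sketch of how a knowledge row could be linked to an existing vector using the same Supabase client as above (ids and model name are placeholders):

from models import get_supabase_client

supabase_client = get_supabase_client()

supabase_client.from_("knowledge_vectors").insert(
    {
        "knowledge_id": "00000000-0000-0000-0000-000000000000",  # placeholder knowledge.id
        "vector_id": "11111111-1111-1111-1111-111111111111",  # placeholder vectors.id
        "embedding_model": "text-embedding-ada-002",  # placeholder model name
    }
).execute()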

View File

@@ -235,6 +235,26 @@ CREATE TABLE IF NOT EXISTS user_settings (
   max_brain_size INT DEFAULT 1000000
 );

+-- knowledge table
+CREATE TABLE IF NOT EXISTS knowledge (
+  id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
+  file_name TEXT,
+  url TEXT,
+  brain_id UUID NOT NULL REFERENCES brains(brain_id),
+  extension TEXT NOT NULL,
+  CHECK ((file_name IS NOT NULL AND url IS NULL) OR (file_name IS NULL AND url IS NOT NULL))
+);
+
+-- knowledge_vectors table
+CREATE TABLE IF NOT EXISTS knowledge_vectors (
+  knowledge_id UUID NOT NULL REFERENCES knowledge(id),
+  vector_id UUID NOT NULL REFERENCES vectors(id),
+  embedding_model TEXT NOT NULL,
+  PRIMARY KEY (knowledge_id, vector_id, embedding_model)
+);
+
 insert into
   storage.buckets (id, name)
 values
@@ -249,9 +269,9 @@ CREATE POLICY "Access Quivr Storage 1jccrwz_2" ON storage.objects FOR UPDATE TO
 CREATE POLICY "Access Quivr Storage 1jccrwz_3" ON storage.objects FOR DELETE TO anon USING (bucket_id = 'quivr');

 INSERT INTO migrations (name)
-SELECT '202309157004032_add_sha1_column'
+SELECT '202309151054032_add_knowledge_tables'
 WHERE NOT EXISTS (
-  SELECT 1 FROM migrations WHERE name = '202309157004032_add_sha1_column'
+  SELECT 1 FROM migrations WHERE name = '202309151054032_add_knowledge_tables'
 );