mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-14 17:03:29 +03:00
feat: get files from storage (#1205)
* 🌱 list files in storage & generate signed URL * ✨ add knowledge router * 🗃️ add knowledge tables * ✨ add knowledge during upload * 🚧 add knowledge a brain_knowledge models and repo * 🔥 remove brain_knowledge * ✨ add upload to knowledge table * ✨ add crawl to knowledge table * ✏️ fixes
This commit is contained in:
parent
37935c59ca
commit
be7acf052b
@ -18,6 +18,7 @@ from routes.brain_routes import brain_router
|
||||
from routes.chat_routes import chat_router
|
||||
from routes.crawl_routes import crawl_router
|
||||
from routes.explore_routes import explore_router
|
||||
from routes.knowledge_routes import knowledge_router
|
||||
from routes.misc_routes import misc_router
|
||||
from routes.notification_routes import notification_router
|
||||
from routes.prompt_routes import prompt_router
|
||||
@ -56,6 +57,7 @@ app.include_router(api_key_router)
|
||||
app.include_router(subscription_router)
|
||||
app.include_router(prompt_router)
|
||||
app.include_router(notification_router)
|
||||
app.include_router(knowledge_router)
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
|
@ -16,7 +16,7 @@ class Brain(BaseModel):
|
||||
name: Optional[str] = "Default brain"
|
||||
description: Optional[str] = "This is a description"
|
||||
status: Optional[str] = "private"
|
||||
model: Optional[str]
|
||||
model: Optional[str] = None
|
||||
temperature: Optional[float] = 0.0
|
||||
max_tokens: Optional[int] = 256
|
||||
openai_api_key: Optional[str] = None
|
||||
|
@ -239,3 +239,15 @@ class Repository(ABC):
|
||||
@abstractmethod
|
||||
def get_notifications_by_chat_id(self, chat_id: UUID):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def insert_knowledge(self, brain_id: UUID):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def remove_knowledge_by_id(self, knowledge_id: UUID):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_knowledge_by_id(self, knowledge_id: UUID):
|
||||
pass
|
||||
|
@ -1,9 +1,11 @@
|
||||
from models.databases.supabase.api_key_handler import ApiKeyHandler
|
||||
from models.databases.supabase.brains import Brain
|
||||
from models.databases.supabase.brains_subscription_invitations import BrainSubscription
|
||||
from models.databases.supabase.brains_subscription_invitations import \
|
||||
BrainSubscription
|
||||
from models.databases.supabase.chats import Chats
|
||||
from models.databases.supabase.files import File
|
||||
from models.databases.supabase.knowledge import Knowledges
|
||||
from models.databases.supabase.notifications import Notifications
|
||||
from models.databases.supabase.prompts import Prompts
|
||||
from models.databases.supabase.user_usage import UserUsage
|
||||
from models.databases.supabase.vectors import Vector
|
||||
from models.databases.supabase.notifications import Notifications
|
||||
|
80
backend/models/databases/supabase/knowledge.py
Normal file
80
backend/models/databases/supabase/knowledge.py
Normal file
@ -0,0 +1,80 @@
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import HTTPException
|
||||
from models.databases.repository import Repository
|
||||
from models.knowledge import Knowledge
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class CreateKnowledgeProperties(BaseModel):
|
||||
brain_id: UUID
|
||||
file_name: Optional[str] = None
|
||||
url: Optional[str] = None
|
||||
extension: str = "txt"
|
||||
|
||||
def dict(self, *args, **kwargs):
|
||||
knowledge_dict = super().dict(*args, **kwargs)
|
||||
knowledge_dict["brain_id"] = str(knowledge_dict.get("brain_id"))
|
||||
return knowledge_dict
|
||||
|
||||
|
||||
class DeleteKnowledgeResponse(BaseModel):
|
||||
status: str = "delete"
|
||||
knowledge_id: UUID
|
||||
|
||||
|
||||
class Knowledges(Repository):
|
||||
def __init__(self, supabase_client):
|
||||
self.db = supabase_client
|
||||
|
||||
def insert_knowledge(self, knowledge: CreateKnowledgeProperties) -> Knowledge:
|
||||
"""
|
||||
Add a knowledge
|
||||
"""
|
||||
response = (self.db.from_("knowledge").insert(knowledge.dict()).execute()).data
|
||||
return Knowledge(**response[0])
|
||||
|
||||
def remove_knowledge_by_id(
|
||||
# todo: remove brain
|
||||
self,
|
||||
knowledge_id: UUID,
|
||||
) -> DeleteKnowledgeResponse:
|
||||
"""
|
||||
Args:
|
||||
knowledge_id (UUID): The id of the knowledge
|
||||
|
||||
Returns:
|
||||
str: Status message
|
||||
"""
|
||||
response = (
|
||||
self.db.from_("knowledge")
|
||||
.delete()
|
||||
.filter("id", "eq", knowledge_id)
|
||||
.execute()
|
||||
.data
|
||||
)
|
||||
|
||||
if response == []:
|
||||
raise HTTPException(404, "Knowledge not found")
|
||||
|
||||
return DeleteKnowledgeResponse(
|
||||
# change to response[0].brain_id and knowledge_id[0].brain_id
|
||||
status="deleted",
|
||||
knowledge_id=knowledge_id,
|
||||
)
|
||||
|
||||
def get_knowledge_by_id(self, knowledge_id: UUID) -> Knowledge:
|
||||
"""
|
||||
Get a knowledge by its id
|
||||
Args:
|
||||
brain_id (UUID): The id of the brain
|
||||
"""
|
||||
knowledge = (
|
||||
self.db.from_("knowledge")
|
||||
.select("*")
|
||||
.filter("knowledge_id", "eq", str(knowledge_id))
|
||||
.execute()
|
||||
).data
|
||||
|
||||
return Knowledge(**knowledge[0])
|
@ -5,6 +5,7 @@ from models.databases.supabase import (
|
||||
BrainSubscription,
|
||||
Chats,
|
||||
File,
|
||||
Knowledges,
|
||||
Notifications,
|
||||
Prompts,
|
||||
UserUsage,
|
||||
@ -24,6 +25,7 @@ class SupabaseDB(
|
||||
Vector,
|
||||
Prompts,
|
||||
Notifications,
|
||||
Knowledges,
|
||||
):
|
||||
def __init__(self, supabase_client):
|
||||
self.db = supabase_client
|
||||
@ -36,3 +38,4 @@ class SupabaseDB(
|
||||
Vector.__init__(self, supabase_client)
|
||||
Prompts.__init__(self, supabase_client)
|
||||
Notifications.__init__(self, supabase_client)
|
||||
Knowledges.__init__(self, supabase_client)
|
||||
|
16
backend/models/files_in_storage.py
Normal file
16
backend/models/files_in_storage.py
Normal file
@ -0,0 +1,16 @@
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class FileInStorage(BaseModel):
|
||||
Id: UUID
|
||||
Key: str
|
||||
|
||||
@property
|
||||
def id(self) -> UUID:
|
||||
return self.Id
|
||||
|
||||
@property
|
||||
def key(self) -> str:
|
||||
return self.Key
|
12
backend/models/knowledge.py
Normal file
12
backend/models/knowledge.py
Normal file
@ -0,0 +1,12 @@
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class Knowledge(BaseModel):
|
||||
id: UUID
|
||||
brain_id: UUID
|
||||
file_name: Optional[str] = None
|
||||
url: Optional[str] = None
|
||||
extension: str = "txt"
|
21
backend/repository/files/generate_file_signed_url.py
Normal file
21
backend/repository/files/generate_file_signed_url.py
Normal file
@ -0,0 +1,21 @@
|
||||
from multiprocessing import get_logger
|
||||
|
||||
from models import get_supabase_client
|
||||
from supabase.client import Client
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
SIGNED_URL_EXPIRATION_PERIOD_IN_SECONDS = 600
|
||||
|
||||
|
||||
def generate_file_signed_url(path):
|
||||
supabase_client: Client = get_supabase_client()
|
||||
|
||||
try:
|
||||
response = supabase_client.storage.from_("quivr").create_signed_url(
|
||||
path, SIGNED_URL_EXPIRATION_PERIOD_IN_SECONDS
|
||||
)
|
||||
logger.info("RESPONSE SIGNED URL", response)
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(e)
|
17
backend/repository/files/list_files.py
Normal file
17
backend/repository/files/list_files.py
Normal file
@ -0,0 +1,17 @@
|
||||
from multiprocessing import get_logger
|
||||
|
||||
from models import get_supabase_client
|
||||
from supabase.client import Client
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
def list_files_from_storage(path):
|
||||
supabase_client: Client = get_supabase_client()
|
||||
|
||||
try:
|
||||
response = supabase_client.storage.from_("quivr").list(path)
|
||||
logger.info("RESPONSE", response)
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(e)
|
@ -1,7 +1,6 @@
|
||||
import json
|
||||
from multiprocessing import get_logger
|
||||
|
||||
from httpx import Response
|
||||
from langchain.pydantic_v1 import Field
|
||||
from langchain.schema import Document
|
||||
from models import get_supabase_client
|
||||
@ -10,7 +9,7 @@ from supabase.client import Client
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
def upload_file_storage(file, file_identifier: str) -> Response:
|
||||
def upload_file_storage(file, file_identifier: str):
|
||||
supabase_client: Client = get_supabase_client()
|
||||
# res = supabase_client.storage.create_bucket("quivr")
|
||||
response = None
|
||||
@ -20,8 +19,7 @@ def upload_file_storage(file, file_identifier: str) -> Response:
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
print(e)
|
||||
return response
|
||||
raise e
|
||||
|
||||
|
||||
class DocumentSerializable(Document):
|
||||
|
0
backend/repository/knowledge/__init__.py
Normal file
0
backend/repository/knowledge/__init__.py
Normal file
14
backend/repository/knowledge/add_knowledge.py
Normal file
14
backend/repository/knowledge/add_knowledge.py
Normal file
@ -0,0 +1,14 @@
|
||||
from logger import get_logger
|
||||
from models.databases.supabase.knowledge import CreateKnowledgeProperties
|
||||
from models.settings import get_supabase_db
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def add_knowledge(knowledge_to_add: CreateKnowledgeProperties):
|
||||
supabase_db = get_supabase_db()
|
||||
|
||||
knowledge = supabase_db.insert_knowledge(knowledge_to_add)
|
||||
|
||||
logger.info(f"Knowledge { knowledge.id} added successfully")
|
||||
return knowledge
|
@ -5,12 +5,16 @@ from auth import AuthBearer, get_current_user
|
||||
from celery_worker import process_crawl_and_notify
|
||||
from crawl.crawler import CrawlWebsite
|
||||
from fastapi import APIRouter, Depends, Query, Request
|
||||
from logger import get_logger
|
||||
from models import Brain, UserIdentity, UserUsage
|
||||
from models.databases.supabase.knowledge import CreateKnowledgeProperties
|
||||
from models.databases.supabase.notifications import CreateNotificationProperties
|
||||
from models.notifications import NotificationsStatusEnum
|
||||
from repository.knowledge.add_knowledge import add_knowledge
|
||||
from repository.notification.add_notification import add_notification
|
||||
from utils.file import convert_bytes
|
||||
|
||||
logger = get_logger(__name__)
|
||||
crawl_router = APIRouter()
|
||||
|
||||
|
||||
@ -64,6 +68,16 @@ async def crawl_endpoint(
|
||||
status=NotificationsStatusEnum.Pending,
|
||||
)
|
||||
)
|
||||
|
||||
knowledge_to_add = CreateKnowledgeProperties(
|
||||
brain_id=brain_id,
|
||||
url=crawl_website.url,
|
||||
extension="html",
|
||||
)
|
||||
|
||||
added_knowledge = add_knowledge(knowledge_to_add)
|
||||
logger.info(f"Knowledge {added_knowledge} added successfully")
|
||||
|
||||
process_crawl_and_notify.delay(
|
||||
crawl_website_url=crawl_website.url,
|
||||
enable_summarization=enable_summarization,
|
||||
|
110
backend/routes/knowledge_routes.py
Normal file
110
backend/routes/knowledge_routes.py
Normal file
@ -0,0 +1,110 @@
|
||||
from uuid import UUID
|
||||
|
||||
from auth import AuthBearer, get_current_user
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from logger import get_logger
|
||||
from models import Brain, UserIdentity, get_supabase_db
|
||||
from repository.files.generate_file_signed_url import generate_file_signed_url
|
||||
from repository.files.list_files import list_files_from_storage
|
||||
from routes.authorizations.brain_authorization import (
|
||||
RoleEnum,
|
||||
has_brain_authorization,
|
||||
validate_brain_authorization,
|
||||
)
|
||||
|
||||
knowledge_router = APIRouter()
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@knowledge_router.get(
|
||||
"/knowledge/", dependencies=[Depends(AuthBearer())], tags=["Knowledge"]
|
||||
)
|
||||
async def list_knowledge_in_brain_endpoint(
|
||||
brain_id: UUID = Query(..., description="The ID of the brain"),
|
||||
current_user: UserIdentity = Depends(get_current_user),
|
||||
):
|
||||
"""
|
||||
Retrieve and list all the knowledge in a brain.
|
||||
"""
|
||||
|
||||
validate_brain_authorization(brain_id=brain_id, user_id=current_user.id)
|
||||
|
||||
brain = Brain(id=brain_id)
|
||||
|
||||
files = list_files_from_storage(str(brain_id))
|
||||
logger.info("List of files from storage", files)
|
||||
|
||||
# TO DO: Retrieve from Knowledge table instead of storage or vectors
|
||||
unique_data = brain.get_unique_brain_files()
|
||||
|
||||
print("UNIQUE DATA", unique_data)
|
||||
unique_data.sort(key=lambda x: int(x["size"]), reverse=True)
|
||||
|
||||
return {"documents": unique_data}
|
||||
|
||||
|
||||
@knowledge_router.delete(
|
||||
"/knowledge/{file_name}/",
|
||||
dependencies=[
|
||||
Depends(AuthBearer()),
|
||||
Depends(has_brain_authorization(RoleEnum.Owner)),
|
||||
],
|
||||
tags=["Knowledge"],
|
||||
)
|
||||
async def delete_endpoint(
|
||||
file_name: str,
|
||||
current_user: UserIdentity = Depends(get_current_user),
|
||||
brain_id: UUID = Query(..., description="The ID of the brain"),
|
||||
):
|
||||
"""
|
||||
Delete a specific user file by file name.
|
||||
"""
|
||||
|
||||
validate_brain_authorization(brain_id=brain_id, user_id=current_user.id)
|
||||
|
||||
brain = Brain(id=brain_id)
|
||||
brain.delete_file_from_brain(file_name)
|
||||
|
||||
return {
|
||||
"message": f"{file_name} of brain {brain_id} has been deleted by user {current_user.email}."
|
||||
}
|
||||
|
||||
|
||||
@knowledge_router.get(
|
||||
"/explore/{file_name}/signed_download_url",
|
||||
dependencies=[Depends(AuthBearer())],
|
||||
tags=["Knowledge"],
|
||||
)
|
||||
async def generate_signed_url_endpoint(
|
||||
file_name: str, current_user: UserIdentity = Depends(get_current_user)
|
||||
):
|
||||
"""
|
||||
Generate a signed url to download the file from storage.
|
||||
"""
|
||||
# check if user has the right to get the file: add brain_id to the query
|
||||
|
||||
supabase_db = get_supabase_db()
|
||||
response = supabase_db.get_vectors_by_file_name(file_name)
|
||||
documents = response.data
|
||||
|
||||
if len(documents) == 0:
|
||||
return {"documents": []}
|
||||
|
||||
related_brain_id = (
|
||||
documents[0]["brains_vectors"][0]["brain_id"]
|
||||
if len(documents[0]["brains_vectors"]) != 0
|
||||
else None
|
||||
)
|
||||
if related_brain_id is None:
|
||||
raise Exception(f"File {file_name} has no brain_id associated with it")
|
||||
|
||||
file_path_in_storage = f"{related_brain_id}/{file_name}"
|
||||
|
||||
print("FILE PATH IN STORAGE", file_path_in_storage)
|
||||
file_signed_url = generate_file_signed_url(file_path_in_storage)
|
||||
|
||||
print("FILE SIGNED URL", file_signed_url)
|
||||
|
||||
validate_brain_authorization(brain_id=related_brain_id, user_id=current_user.id)
|
||||
|
||||
return file_signed_url
|
@ -1,25 +1,27 @@
|
||||
import os
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
from auth import AuthBearer, get_current_user
|
||||
from celery_worker import process_file_and_notify
|
||||
from fastapi import APIRouter, Depends, Query, Request, UploadFile
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, Request, UploadFile
|
||||
from logger import get_logger
|
||||
from models import Brain, UserIdentity, UserUsage
|
||||
from models.databases.supabase.notifications import (
|
||||
CreateNotificationProperties,
|
||||
)
|
||||
from models.databases.supabase.knowledge import CreateKnowledgeProperties
|
||||
from models.databases.supabase.notifications import CreateNotificationProperties
|
||||
from models.notifications import NotificationsStatusEnum
|
||||
from repository.brain import get_brain_details
|
||||
from repository.files.upload_file import upload_file_storage
|
||||
from repository.knowledge.add_knowledge import add_knowledge
|
||||
from repository.notification.add_notification import add_notification
|
||||
from repository.user_identity import get_user_identity
|
||||
from utils.file import convert_bytes, get_file_size
|
||||
|
||||
from routes.authorizations.brain_authorization import (
|
||||
RoleEnum,
|
||||
validate_brain_authorization,
|
||||
)
|
||||
from utils.file import convert_bytes, get_file_size
|
||||
|
||||
logger = get_logger(__name__)
|
||||
upload_router = APIRouter()
|
||||
|
||||
|
||||
@ -78,10 +80,34 @@ async def upload_file(
|
||||
openai_api_key = get_user_identity(current_user.id).openai_api_key
|
||||
|
||||
file_content = await uploadFile.read()
|
||||
# filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
|
||||
filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
|
||||
|
||||
upload_file_storage(file_content, filename_with_brain_id)
|
||||
try:
|
||||
fileInStorage = upload_file_storage(file_content, filename_with_brain_id)
|
||||
logger.info(f"File {fileInStorage} uploaded successfully")
|
||||
|
||||
except Exception as e:
|
||||
if "The resource already exists" in str(e):
|
||||
raise HTTPException(
|
||||
status_code=403,
|
||||
detail=f"File {uploadFile.filename} already exists in storage.",
|
||||
)
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to upload file to storage."
|
||||
)
|
||||
|
||||
knowledge_to_add = CreateKnowledgeProperties(
|
||||
brain_id=brain_id,
|
||||
file_name=uploadFile.filename,
|
||||
extension=os.path.splitext(
|
||||
uploadFile.filename # pyright: ignore reportPrivateUsage=none
|
||||
)[-1].lower(),
|
||||
)
|
||||
|
||||
added_knowledge = add_knowledge(knowledge_to_add)
|
||||
logger.info(f"Knowledge {added_knowledge} added successfully")
|
||||
|
||||
process_file_and_notify.delay(
|
||||
file_name=filename_with_brain_id,
|
||||
file_original_name=uploadFile.filename,
|
||||
|
29
scripts/202309151054032_add_knowledge_tables.sql
Normal file
29
scripts/202309151054032_add_knowledge_tables.sql
Normal file
@ -0,0 +1,29 @@
|
||||
BEGIN;
|
||||
|
||||
-- knowledge table
|
||||
CREATE TABLE IF NOT EXISTS knowledge (
|
||||
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
|
||||
file_name TEXT,
|
||||
url TEXT,
|
||||
brain_id UUID NOT NULL REFERENCES brains(brain_id),
|
||||
extension TEXT NOT NULL,
|
||||
CHECK ((file_name IS NOT NULL AND url IS NULL) OR (file_name IS NULL AND url IS NOT NULL))
|
||||
);
|
||||
|
||||
|
||||
-- knowledge_vectors table
|
||||
CREATE TABLE IF NOT EXISTS knowledge_vectors (
|
||||
knowledge_id UUID NOT NULL REFERENCES knowledge(id),
|
||||
vector_id UUID NOT NULL REFERENCES vectors(id),
|
||||
embedding_model TEXT NOT NULL,
|
||||
PRIMARY KEY (knowledge_id, vector_id, embedding_model)
|
||||
);
|
||||
|
||||
-- Update migrations table
|
||||
INSERT INTO migrations (name)
|
||||
SELECT '202309151054032_add_knowledge_tables'
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM migrations WHERE name = '202309151054032_add_knowledge_tables'
|
||||
);
|
||||
|
||||
COMMIT;
|
@ -235,6 +235,26 @@ CREATE TABLE IF NOT EXISTS user_settings (
|
||||
max_brain_size INT DEFAULT 1000000
|
||||
);
|
||||
|
||||
-- knowledge table
|
||||
CREATE TABLE IF NOT EXISTS knowledge (
|
||||
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
|
||||
file_name TEXT,
|
||||
url TEXT,
|
||||
brain_id UUID NOT NULL REFERENCES brains(brain_id),
|
||||
extension TEXT NOT NULL,
|
||||
CHECK ((file_name IS NOT NULL AND url IS NULL) OR (file_name IS NULL AND url IS NOT NULL))
|
||||
);
|
||||
|
||||
|
||||
-- knowledge_vectors table
|
||||
CREATE TABLE IF NOT EXISTS knowledge_vectors (
|
||||
knowledge_id UUID NOT NULL REFERENCES knowledge(id),
|
||||
vector_id UUID NOT NULL REFERENCES vectors(id),
|
||||
embedding_model TEXT NOT NULL,
|
||||
PRIMARY KEY (knowledge_id, vector_id, embedding_model)
|
||||
);
|
||||
|
||||
|
||||
insert into
|
||||
storage.buckets (id, name)
|
||||
values
|
||||
@ -249,9 +269,9 @@ CREATE POLICY "Access Quivr Storage 1jccrwz_2" ON storage.objects FOR UPDATE TO
|
||||
CREATE POLICY "Access Quivr Storage 1jccrwz_3" ON storage.objects FOR DELETE TO anon USING (bucket_id = 'quivr');
|
||||
|
||||
INSERT INTO migrations (name)
|
||||
SELECT '202309157004032_add_sha1_column'
|
||||
SELECT '202309151054032_add_knowledge_tables'
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM migrations WHERE name = '202309157004032_add_sha1_column'
|
||||
SELECT 1 FROM migrations WHERE name = '202309151054032_add_knowledge_tables'
|
||||
);
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user