From 10e94e5a913b76ba446859efd68f482f3fa70c8e Mon Sep 17 00:00:00 2001 From: Mamadou DICKO <63923024+mamadoudicko@users.noreply.github.com> Date: Mon, 27 Nov 2023 16:47:13 +0100 Subject: [PATCH] feat: improve delete knowledge performance (#1733) Issue: https://github.com/StanGirard/quivr/issues/1724 --- backend/models/databases/supabase/brains.py | 47 +++++++++++---------- backend/routes/knowledge_routes.py | 10 +++-- 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/backend/models/databases/supabase/brains.py b/backend/models/databases/supabase/brains.py index 959aafbaa..8422e3294 100644 --- a/backend/models/databases/supabase/brains.py +++ b/backend/models/databases/supabase/brains.py @@ -278,36 +278,39 @@ class Brain(Repository): def delete_file_from_brain(self, brain_id, file_name: str): # First, get the vector_ids associated with the file_name - vector_response = ( + file_vectors = ( self.db.table("vectors") .select("id") .filter("metadata->>file_name", "eq", file_name) .execute() ) - vector_ids = [item["id"] for item in vector_response.data] - # For each vector_id, delete the corresponding entry from the 'brains_vectors' table - for vector_id in vector_ids: - self.db.table("brains_vectors").delete().filter( - "vector_id", "eq", vector_id - ).filter("brain_id", "eq", brain_id).execute() + file_vectors_ids = [item["id"] for item in file_vectors.data] - # Check if the vector is still associated with any other brains - associated_brains_response = ( - self.db.table("brains_vectors") - .select("brain_id") - .filter("vector_id", "eq", vector_id) - .execute() - ) - associated_brains = [ - item["brain_id"] for item in associated_brains_response.data - ] + # remove current file vectors from brain vectors + self.db.table("brains_vectors").delete().filter( + "vector_id", "in", file_vectors_ids + ).filter("brain_id", "eq", brain_id).execute() - # If the vector is not associated with any other brains, delete it from 'vectors' table - if not associated_brains: - self.db.table("vectors").delete().filter( - "id", "eq", vector_id - ).execute() + vectors_used_by_another_brain = ( + self.db.table("brains_vectors") + .select("vector_id") + .filter("vector_id", "in", file_vectors_ids) + .filter("brain_id", "neq", brain_id) + .execute() + ) + + vectors_used_by_another_brain_ids = [ + item["vector_id"] for item in vectors_used_by_another_brain.data + ] + + vectors_no_longer_used_ids = [ + id for id in file_vectors_ids if id not in vectors_used_by_another_brain_ids + ] + + self.db.table("vectors").delete().filter( + "id", "in", vectors_no_longer_used_ids + ).execute() return {"message": f"File {file_name} in brain {brain_id} has been deleted."} diff --git a/backend/routes/knowledge_routes.py b/backend/routes/knowledge_routes.py index 0cf82525e..fa4ca43fe 100644 --- a/backend/routes/knowledge_routes.py +++ b/backend/routes/knowledge_routes.py @@ -1,6 +1,6 @@ from uuid import UUID -from fastapi import APIRouter, Depends, Query +from fastapi import APIRouter, Depends, HTTPException, Query from logger import get_logger from middlewares.auth import AuthBearer, get_current_user from models import Brain @@ -10,6 +10,7 @@ from repository.files.generate_file_signed_url import generate_file_signed_url from repository.knowledge.get_all_knowledge import get_all_knowledge from repository.knowledge.get_knowledge import get_knowledge from repository.knowledge.remove_knowledge import remove_knowledge + from routes.authorizations.brain_authorization import ( RoleEnum, has_brain_authorization, @@ -56,8 +57,6 @@ async def delete_endpoint( Delete a specific knowledge from a brain. """ - validate_brain_authorization(brain_id=brain_id, user_id=current_user.id) - brain = Brain(id=brain_id) knowledge = get_knowledge(knowledge_id) @@ -93,7 +92,10 @@ async def generate_signed_url_endpoint( validate_brain_authorization(brain_id=knowledge.brain_id, user_id=current_user.id) if knowledge.file_name == None: - raise Exception(f"Knowledge {knowledge_id} has no file_name associated with it") + raise HTTPException( + status_code=404, + detail=f"Knowledge with id {knowledge_id} is not a file.", + ) file_path_in_storage = f"{knowledge.brain_id}/{knowledge.file_name}"