refactor: knowledge module (#1743)

# Description

- Refactor knowledge to a module

- This PR breaks the GitHub processor function: the `Brain` and `File`
imports had to be commented out because they create a circular dependency.
This should be fixed and the workaround reverted in the next PR.
This commit is contained in:
Zineb El Bachiri 2023-11-29 09:04:03 +01:00 committed by GitHub
parent f9f05d9d71
commit 9766befb53
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 186 additions and 162 deletions

View File

@ -7,6 +7,7 @@
},
"python.linting.enabled": true,
"python.linting.flake8Enabled": true,
"python.analysis.extraPaths": ["./backend"],
"editor.formatOnSave": true,
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",

View File

@ -13,6 +13,7 @@ from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from logger import get_logger
from middlewares.cors import add_cors_middleware
from modules.knowledge.controller.knowledge_routes import knowledge_router
from modules.notification.controller.notification_routes import notification_router
from modules.onboarding.controller.onboarding_routes import onboarding_router
from modules.prompt.controller.prompt_routes import prompt_router
@ -22,7 +23,6 @@ from routes.brain_routes import brain_router
from routes.chat_routes import chat_router
from routes.contact_routes import router as contact_router
from routes.crawl_routes import crawl_router
from routes.knowledge_routes import knowledge_router
from routes.misc_routes import misc_router
from routes.subscription_routes import subscription_router
from routes.upload_routes import upload_router

View File

@ -204,26 +204,6 @@ class Repository(ABC):
def get_vectors_by_file_sha1(self, file_sha1):
pass
@abstractmethod
def insert_knowledge(self, brain_id: UUID):
pass
@abstractmethod
def remove_knowledge_by_id(self, knowledge_id: UUID):
pass
@abstractmethod
def remove_brain_all_knowledge(self, brain_id: UUID):
pass
@abstractmethod
def get_knowledge_by_id(self, knowledge_id: UUID):
pass
@abstractmethod
def get_all_knowledge_in_brain(self, brain_id: UUID):
pass
@abstractmethod
def get_api_brain_definition(self, brain_id: UUID):
pass

View File

@ -5,6 +5,5 @@ from models.databases.supabase.brains_subscription_invitations import \
BrainSubscription
from models.databases.supabase.chats import Chats
from models.databases.supabase.files import File
from models.databases.supabase.knowledge import Knowledges
from models.databases.supabase.user_usage import UserUsage
from models.databases.supabase.vectors import Vector

View File

@ -6,7 +6,6 @@ from models.databases.supabase import (
BrainSubscription,
Chats,
File,
Knowledges,
UserUsage,
Vector,
)
@ -22,7 +21,6 @@ class SupabaseDB(
ApiKeyHandler,
Chats,
Vector,
Knowledges,
ApiBrainDefinitions,
):
def __init__(self, supabase_client):
@ -34,5 +32,4 @@ class SupabaseDB(
ApiKeyHandler.__init__(self, supabase_client)
Chats.__init__(self, supabase_client)
Vector.__init__(self, supabase_client)
Knowledges.__init__(self, supabase_client)
ApiBrainDefinitions.__init__(self, supabase_client)

View File

@ -0,0 +1 @@
from .knowledge_routes import knowledge_router

View File

@ -4,13 +4,10 @@ from fastapi import APIRouter, Depends, HTTPException, Query
from logger import get_logger
from middlewares.auth import AuthBearer, get_current_user
from models import Brain
from modules.knowledge.service.knowledge_service import KnowledgeService
from modules.user.entity.user_identity import UserIdentity
from repository.files.delete_file import delete_file_from_storage
from repository.files.generate_file_signed_url import generate_file_signed_url
from repository.knowledge.get_all_knowledge import get_all_knowledge
from repository.knowledge.get_knowledge import get_knowledge
from repository.knowledge.remove_knowledge import remove_knowledge
from routes.authorizations.brain_authorization import (
RoleEnum,
has_brain_authorization,
@ -20,6 +17,8 @@ from routes.authorizations.brain_authorization import (
knowledge_router = APIRouter()
logger = get_logger(__name__)
knowledge_service = KnowledgeService()
@knowledge_router.get(
"/knowledge", dependencies=[Depends(AuthBearer())], tags=["Knowledge"]
@ -34,7 +33,7 @@ async def list_knowledge_in_brain_endpoint(
validate_brain_authorization(brain_id=brain_id, user_id=current_user.id)
knowledges = get_all_knowledge(brain_id)
knowledges = knowledge_service.get_all_knowledge(brain_id)
logger.info(f"List of knowledge from knowledge table: {knowledges}")
return {"knowledges": knowledges}
@ -59,9 +58,9 @@ async def delete_endpoint(
brain = Brain(id=brain_id)
knowledge = get_knowledge(knowledge_id)
knowledge = knowledge_service.get_knowledge(knowledge_id)
file_name = knowledge.file_name if knowledge.file_name else knowledge.url
remove_knowledge(knowledge_id)
knowledge_service.remove_knowledge(knowledge_id)
if knowledge.file_name:
delete_file_from_storage(f"{brain_id}/{knowledge.file_name}")
@ -87,7 +86,7 @@ async def generate_signed_url_endpoint(
Generate a signed url to download the file from storage.
"""
knowledge = get_knowledge(knowledge_id)
knowledge = knowledge_service.get_knowledge(knowledge_id)
validate_brain_authorization(brain_id=knowledge.brain_id, user_id=current_user.id)

View File

@ -0,0 +1,2 @@
from .inputs import CreateKnowledgeProperties
from .outputs import DeleteKnowledgeResponse

View File

@ -0,0 +1,16 @@
from typing import Optional
from uuid import UUID
from pydantic import BaseModel
class CreateKnowledgeProperties(BaseModel):
    """Input payload describing a knowledge entry to create."""

    # Identifier of the brain the knowledge is attached to.
    brain_id: UUID
    # Either a file name or a URL may be supplied; both default to None.
    file_name: Optional[str] = None
    url: Optional[str] = None
    extension: str = "txt"

    def dict(self, *args, **kwargs):
        """Serialize the model, rendering ``brain_id`` as a string.

        Positional and keyword arguments are forwarded unchanged to
        ``BaseModel.dict``.

        Returns:
            dict: The serialized fields. When ``brain_id`` is part of the
            output it is converted with ``str``.
        """
        knowledge_dict = super().dict(*args, **kwargs)
        # Only stringify the id when it is actually in the output; otherwise
        # an excluded field would come back as the literal string "None".
        if "brain_id" in knowledge_dict:
            knowledge_dict["brain_id"] = str(knowledge_dict["brain_id"])
        return knowledge_dict

View File

@ -0,0 +1,8 @@
from uuid import UUID
from pydantic import BaseModel
class DeleteKnowledgeResponse(BaseModel):
    """Response payload returned after a knowledge entry is removed."""

    # Status marker; defaults to "delete".
    status: str = "delete"
    # Identifier of the knowledge the response refers to.
    knowledge_id: UUID

View File

@ -0,0 +1 @@
from .knowledge import Knowledge

View File

@ -0,0 +1 @@
from .knowledges import Knowledges

View File

@ -0,0 +1,58 @@
from abc import ABC, abstractmethod
from typing import List
from uuid import UUID
from modules.knowledge.dto.inputs import CreateKnowledgeProperties
from modules.knowledge.dto.outputs import DeleteKnowledgeResponse
from modules.knowledge.entity.knowledge import Knowledge
class KnowledgeInterface(ABC):
    """Abstract contract that knowledge repositories must implement."""

    @abstractmethod
    def insert_knowledge(self, knowledge: CreateKnowledgeProperties) -> Knowledge:
        """
        Add a knowledge

        Args:
            knowledge (CreateKnowledgeProperties): The knowledge to add

        Returns:
            Knowledge: The inserted knowledge
        """
        pass

    # todo: update remove brain endpoints to first delete the knowledge
    @abstractmethod
    def remove_knowledge_by_id(
        self,
        knowledge_id: UUID,
    ) -> DeleteKnowledgeResponse:
        """
        Remove a knowledge by its id

        Args:
            knowledge_id (UUID): The id of the knowledge

        Returns:
            DeleteKnowledgeResponse: The deletion response
        """
        pass

    @abstractmethod
    def get_knowledge_by_id(self, knowledge_id: UUID) -> Knowledge:
        """
        Get a knowledge by its id

        Args:
            knowledge_id (UUID): The id of the knowledge

        Returns:
            Knowledge: The matching knowledge
        """
        pass

    @abstractmethod
    def get_all_knowledge_in_brain(self, brain_id: UUID) -> List[Knowledge]:
        """
        Get all the knowledge in a brain

        Args:
            brain_id (UUID): The id of the brain

        Returns:
            List[Knowledge]: The knowledge entries of the brain
        """
        pass

    @abstractmethod
    def remove_brain_all_knowledge(self, brain_id: UUID) -> None:
        """
        Remove all knowledge in a brain

        Args:
            brain_id (UUID): The id of the brain
        """
        pass

View File

@ -1,34 +1,16 @@
from typing import List, Optional
from uuid import UUID
from fastapi import HTTPException
from models.databases.repository import Repository
from models.knowledge import Knowledge
from pydantic import BaseModel
from models.settings import get_supabase_client
from modules.knowledge.dto.outputs import DeleteKnowledgeResponse
from modules.knowledge.entity.knowledge import Knowledge
from modules.knowledge.repository.knowledge_interface import KnowledgeInterface
class CreateKnowledgeProperties(BaseModel):
brain_id: UUID
file_name: Optional[str] = None
url: Optional[str] = None
extension: str = "txt"
def dict(self, *args, **kwargs):
knowledge_dict = super().dict(*args, **kwargs)
knowledge_dict["brain_id"] = str(knowledge_dict.get("brain_id"))
return knowledge_dict
class DeleteKnowledgeResponse(BaseModel):
status: str = "delete"
knowledge_id: UUID
class Knowledges(Repository):
def __init__(self, supabase_client):
class Knowledges(KnowledgeInterface):
def __init__(self):
supabase_client = get_supabase_client()
self.db = supabase_client
def insert_knowledge(self, knowledge: CreateKnowledgeProperties) -> Knowledge:
def insert_knowledge(self, knowledge):
"""
Add a knowledge
"""
@ -38,8 +20,8 @@ class Knowledges(Repository):
def remove_knowledge_by_id(
# todo: update remove brain endpoints to first delete the knowledge
self,
knowledge_id: UUID,
) -> DeleteKnowledgeResponse:
knowledge_id,
):
"""
Args:
knowledge_id (UUID): The id of the knowledge
@ -64,7 +46,7 @@ class Knowledges(Repository):
knowledge_id=knowledge_id,
)
def get_knowledge_by_id(self, knowledge_id: UUID) -> Knowledge:
def get_knowledge_by_id(self, knowledge_id):
"""
Get a knowledge by its id
Args:
@ -79,7 +61,7 @@ class Knowledges(Repository):
return Knowledge(**knowledge[0])
def get_all_knowledge_in_brain(self, brain_id: UUID) -> List[Knowledge]:
def get_all_knowledge_in_brain(self, brain_id):
"""
Get all the knowledge in a brain
Args:
@ -94,7 +76,7 @@ class Knowledges(Repository):
return [Knowledge(**knowledge) for knowledge in all_knowledge]
def remove_brain_all_knowledge(self, brain_id: UUID) -> None:
def remove_brain_all_knowledge(self, brain_id):
"""
Remove all knowledge in a brain
Args:

View File

@ -0,0 +1,46 @@
from uuid import UUID
from logger import get_logger
from modules.knowledge.dto.inputs import CreateKnowledgeProperties
from modules.knowledge.entity.knowledge import Knowledge
from modules.knowledge.repository.knowledge_interface import KnowledgeInterface
from modules.knowledge.repository.knowledges import Knowledges
logger = get_logger(__name__)
class KnowledgeService:
    """Service layer that delegates knowledge operations to a repository."""

    repository: KnowledgeInterface

    def __init__(self):
        # The service is backed by the Knowledges repository implementation.
        self.repository = Knowledges()

    def add_knowledge(self, knowledge_to_add: CreateKnowledgeProperties):
        """Insert a knowledge through the repository, log its id, return it."""
        created = self.repository.insert_knowledge(knowledge_to_add)
        logger.info(f"Knowledge { created.id} added successfully")
        return created

    def get_all_knowledge(self, brain_id: UUID):
        """Return what the repository lists for the given brain."""
        return self.repository.get_all_knowledge_in_brain(brain_id)

    def get_knowledge(self, knowledge_id: UUID) -> Knowledge:
        """Return the knowledge the repository finds for ``knowledge_id``."""
        return self.repository.get_knowledge_by_id(knowledge_id)

    def remove_brain_all_knowledge(self, brain_id: UUID) -> None:
        """Ask the repository to remove a brain's knowledge, then log it."""
        self.repository.remove_brain_all_knowledge(brain_id)
        done_message = (
            f"All knowledge in brain {brain_id} removed successfully from table"
        )
        logger.info(done_message)

    def remove_knowledge(self, knowledge_id: UUID):
        """Remove one knowledge via the repository and return its response."""
        response = self.repository.remove_knowledge_by_id(knowledge_id)
        logger.info(f"Knowledge { knowledge_id} removed successfully from table")
        return response

View File

@ -4,8 +4,9 @@ import time
from langchain.document_loaders import GitLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from models import Brain, File
from packages.embeddings.vectors import Neurons
# from models import Brain, File
# from packages.embeddings.vectors import Neurons
from packages.files.file import compute_sha1_from_content
@ -55,22 +56,26 @@ async def process_github(
}
doc_with_metadata = Document(page_content=doc.page_content, metadata=metadata)
file = File(
file_sha1=compute_sha1_from_content(doc.page_content.encode("utf-8"))
)
print(doc_with_metadata.metadata["file_name"])
file_exists = file.file_already_exists()
# TO FIX: Import of file and brain creates a circular dependency
# file = File(
# file_sha1=compute_sha1_from_content(doc.page_content.encode("utf-8"))
# )
if not file_exists:
neurons = Neurons()
created_vector = neurons.create_vector(doc_with_metadata)
# file_exists = file.file_already_exists()
file_exists_in_brain = file.file_already_exists_in_brain(brain_id)
# if not file_exists:
# neurons = Neurons()
# created_vector = neurons.create_vector(doc_with_metadata)
if not file_exists_in_brain:
brain = Brain(id=brain_id)
file.link_file_to_brain(brain)
# file_exists_in_brain = file.file_already_exists_in_brain(brain_id)
# if not file_exists_in_brain:
# brain = Brain(id=brain_id)
# file.link_file_to_brain(brain)
return {
"message": f"✅ Github with {len(documents)} files has been uploaded.",
# "message": f"✅ Github with {len(documents)} files has been uploaded.",
"message": "Github processor is currently unavailable.",
"type": "success",
}

View File

@ -3,15 +3,14 @@ from uuid import UUID
from fastapi import HTTPException
from models.brain_entity import BrainType
from models.settings import get_supabase_db
from modules.knowledge.service.knowledge_service import KnowledgeService
from repository.api_brain_definition.delete_api_brain_definition import (
delete_api_brain_definition,
)
from repository.brain import get_brain_by_id
from repository.brain.delete_brain_secrets import delete_brain_secrets_values
from repository.knowledge.remove_brain_all_knowledge import (
remove_brain_all_knowledge,
)
knowledge_service = KnowledgeService()
def delete_brain(brain_id: UUID) -> dict[str, str]:
@ -27,7 +26,7 @@ def delete_brain(brain_id: UUID) -> dict[str, str]:
)
delete_api_brain_definition(brain_id=brain_id)
else:
remove_brain_all_knowledge(brain_id)
knowledge_service.remove_brain_all_knowledge(brain_id)
supabase_db.delete_brain_vector(str(brain_id))
supabase_db.delete_brain_users(str(brain_id))

View File

@ -1,14 +0,0 @@
from logger import get_logger
from models.databases.supabase.knowledge import CreateKnowledgeProperties
from models.settings import get_supabase_db
logger = get_logger(__name__)
def add_knowledge(knowledge_to_add: CreateKnowledgeProperties):
    """Insert a knowledge via the Supabase DB object, log its id, return it."""
    created = get_supabase_db().insert_knowledge(knowledge_to_add)
    logger.info(f"Knowledge { created.id} added successfully")
    return created

View File

@ -1,14 +0,0 @@
from uuid import UUID
from logger import get_logger
from models.settings import get_supabase_db
logger = get_logger(__name__)
def get_all_knowledge(brain_id: UUID):
    """Return what the Supabase DB object lists as knowledge for the brain."""
    return get_supabase_db().get_all_knowledge_in_brain(brain_id)

View File

@ -1,15 +0,0 @@
from uuid import UUID
from logger import get_logger
from models.knowledge import Knowledge
from models.settings import get_supabase_db
logger = get_logger(__name__)
def get_knowledge(knowledge_id: UUID) -> Knowledge:
    """Look up a single knowledge by id through the Supabase DB object."""
    return get_supabase_db().get_knowledge_by_id(knowledge_id)

View File

@ -1,14 +0,0 @@
from uuid import UUID
from logger import get_logger
from models.settings import get_supabase_db
logger = get_logger(__name__)
def remove_brain_all_knowledge(brain_id: UUID) -> None:
    """Remove a brain's knowledge via the Supabase DB object, then log it."""
    get_supabase_db().remove_brain_all_knowledge(brain_id)
    done_message = f"All knowledge in brain {brain_id} removed successfully from table"
    logger.info(done_message)

View File

@ -1,16 +0,0 @@
from uuid import UUID
from logger import get_logger
from models.settings import get_supabase_db
logger = get_logger(__name__)
def remove_knowledge(knowledge_id: UUID):
    """Remove one knowledge via the Supabase DB object and return its result."""
    response = get_supabase_db().remove_knowledge_by_id(knowledge_id)
    logger.info(f"Knowledge { knowledge_id} removed successfully from table")
    return response

View File

@ -6,19 +6,20 @@ from fastapi import APIRouter, Depends, Query, Request
from logger import get_logger
from middlewares.auth import AuthBearer, get_current_user
from models import Brain, UserUsage
from models.databases.supabase.knowledge import CreateKnowledgeProperties
from modules.knowledge.dto.inputs import CreateKnowledgeProperties
from modules.knowledge.service.knowledge_service import KnowledgeService
from modules.notification.dto.inputs import CreateNotificationProperties
from modules.notification.entity.notification import NotificationsStatusEnum
from modules.notification.service.notification_service import NotificationService
from modules.user.entity.user_identity import UserIdentity
from packages.files.crawl.crawler import CrawlWebsite
from packages.files.file import convert_bytes
from repository.knowledge.add_knowledge import add_knowledge
logger = get_logger(__name__)
crawl_router = APIRouter()
notification_service = NotificationService()
knowledge_service = KnowledgeService()
@crawl_router.get("/crawl/healthz", tags=["Health"])
@ -72,7 +73,7 @@ async def crawl_endpoint(
extension="html",
)
added_knowledge = add_knowledge(knowledge_to_add)
added_knowledge = knowledge_service.add_knowledge(knowledge_to_add)
logger.info(f"Knowledge {added_knowledge} added successfully")
process_crawl_and_notify.delay(

View File

@ -7,7 +7,8 @@ from fastapi import APIRouter, Depends, HTTPException, Query, Request, UploadFil
from logger import get_logger
from middlewares.auth import AuthBearer, get_current_user
from models import UserUsage
from models.databases.supabase.knowledge import CreateKnowledgeProperties
from modules.knowledge.dto.inputs import CreateKnowledgeProperties
from modules.knowledge.service.knowledge_service import KnowledgeService
from modules.notification.dto.inputs import (
CreateNotificationProperties,
NotificationUpdatableProperties,
@ -17,7 +18,6 @@ from modules.notification.service.notification_service import NotificationServic
from modules.user.entity.user_identity import UserIdentity
from packages.files.file import convert_bytes, get_file_size
from repository.files.upload_file import upload_file_storage
from repository.knowledge.add_knowledge import add_knowledge
from routes.authorizations.brain_authorization import (
RoleEnum,
validate_brain_authorization,
@ -27,6 +27,7 @@ logger = get_logger(__name__)
upload_router = APIRouter()
notification_service = NotificationService()
knowledge_service = KnowledgeService()
@upload_router.get("/upload/healthz", tags=["Health"])
@ -108,7 +109,7 @@ async def upload_file(
)[-1].lower(),
)
added_knowledge = add_knowledge(knowledge_to_add)
added_knowledge = knowledge_service.add_knowledge(knowledge_to_add)
logger.info(f"Knowledge {added_knowledge} added successfully")
process_file_and_notify.delay(