diff --git a/backend/celery_task.py b/backend/celery_task.py
deleted file mode 100644
index f18f232c8..000000000
--- a/backend/celery_task.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from celery import shared_task
-from models.settings import get_supabase_db
-from modules.brain.service.brain_vector_service import BrainVectorService
-from packages.embeddings.vectors import Neurons
-from repository.files.upload_file import DocumentSerializable
-
-
-@shared_task
-def create_embedding_for_document(brain_id, doc_with_metadata, file_sha1):
-    neurons = Neurons()
-    doc = DocumentSerializable.from_json(doc_with_metadata)
-    created_vector = neurons.create_vector(doc)
-    database = get_supabase_db()
-    database.set_file_sha_from_metadata(file_sha1)
-
-    created_vector_id = created_vector[0]  # pyright: ignore reportPrivateUsage=none
-
-    brain_vector_service = BrainVectorService(brain_id)
-    brain_vector_service.create_brain_vector(created_vector_id, file_sha1)
diff --git a/backend/packages/embeddings/vectors.py b/backend/packages/embeddings/vectors.py
index 34cbc707c..6ffad44e8 100644
--- a/backend/packages/embeddings/vectors.py
+++ b/backend/packages/embeddings/vectors.py
@@ -11,13 +11,13 @@ logger = get_logger(__name__)
 
 
 # TODO: Create interface for embeddings and implement it for Supabase and OpenAI (current Quivr)
 class Neurons(BaseModel):
-    def create_vector(self, doc):
+    def create_vector(self, docs):
         documents_vector_store = get_documents_vector_store()
 
         logger.info("Creating vector for document")
-        logger.info(f"Document: {doc}")
+        logger.info(f"Document: {docs}")
         try:
-            sids = documents_vector_store.add_documents([doc])
+            sids = documents_vector_store.add_documents(docs)
             if sids and len(sids) > 0:
                 return sids
diff --git a/backend/packages/files/parsers/common.py b/backend/packages/files/parsers/common.py
index cfd7d1b68..5a3e88c4d 100644
--- a/backend/packages/files/parsers/common.py
+++ b/backend/packages/files/parsers/common.py
@@ -1,7 +1,9 @@
 import time
 
-from celery_task import create_embedding_for_document
 from models import File
+from models.settings import get_supabase_db
+from modules.brain.service.brain_vector_service import BrainVectorService
+from packages.embeddings.vectors import Neurons
 from repository.files.upload_file import DocumentSerializable
 
 
@@ -10,25 +12,40 @@ async def process_file(
     loader_class,
     brain_id,
 ):
+    database = get_supabase_db()
     dateshort = time.strftime("%Y%m%d")
+    neurons = Neurons()
 
    file.compute_documents(loader_class)
 
-    for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
-        metadata = {
-            "file_sha1": file.file_sha1,
-            "file_size": file.file_size,
-            "file_name": file.file_name,
-            "chunk_size": file.chunk_size,
-            "chunk_overlap": file.chunk_overlap,
-            "date": dateshort,
-        }
-        doc_with_metadata = DocumentSerializable(
-            page_content=doc.page_content, metadata=metadata
+    metadata = {
+        "file_sha1": file.file_sha1,
+        "file_size": file.file_size,
+        "file_name": file.file_name,
+        "chunk_size": file.chunk_size,
+        "chunk_overlap": file.chunk_overlap,
+        "date": dateshort,
+    }
+    docs = []
+
+    if file.documents is not None:
+        for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
+            doc_with_metadata = DocumentSerializable(
+                page_content=doc.page_content, metadata=metadata
+            )
+            docs.append(doc_with_metadata)
+
+    created_vector = neurons.create_vector(docs)
+
+    brain_vector_service = BrainVectorService(brain_id)
+    for created_vector_id in created_vector:
+        brain_vector_service.create_brain_vector(
+            created_vector_id, metadata["file_sha1"]
         )
-
-        create_embedding_for_document.delay(
-            brain_id, doc_with_metadata.to_json(), file.file_sha1
-        )
+    database.set_file_sha_from_metadata(metadata["file_sha1"])
 
-    return len(file.documents)
+    if created_vector:
+        return len(created_vector)
+    else:
+        return 0
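
Reviewer note: the net effect of this diff is that `process_file` now collects every `DocumentSerializable` chunk into a single list and embeds the whole batch with one `add_documents` call, instead of fanning out one `create_embedding_for_document.delay(...)` Celery task per chunk. Below is a minimal sketch of that new control flow; the in-memory store, `process_chunks`, and all sample data are hypothetical stand-ins for the real vector store and `process_file`, with only the field names and the batched-call shape taken from the diff.

```python
import uuid
from dataclasses import dataclass


@dataclass
class DocumentSerializable:
    # Mirrors the two fields the diff uses; the real class lives in
    # repository/files/upload_file.py.
    page_content: str
    metadata: dict


class FakeVectorStore:
    """In-memory stand-in for get_documents_vector_store()."""

    def __init__(self):
        self.rows = {}

    def add_documents(self, docs):
        # One batched call returns one id per document, matching the
        # `sids` list that Neurons.create_vector now forwards.
        ids = [str(uuid.uuid4()) for _ in docs]
        self.rows.update(zip(ids, docs))
        return ids


def process_chunks(store, chunks, metadata):
    # Same shape as the new process_file body: build all docs first,
    # embed them in one round trip, then count the returned ids.
    docs = [DocumentSerializable(page_content=c, metadata=metadata) for c in chunks]
    created_vector = store.add_documents(docs)  # one call, not N tasks
    return len(created_vector) if created_vector else 0


store = FakeVectorStore()
meta = {"file_sha1": "abc123", "file_name": "notes.txt"}
assert process_chunks(store, ["chunk one", "chunk two"], meta) == 2
```

One design consequence worth flagging: because the returned id list maps one-to-one onto the submitted docs, `process_file` can link every vector to the brain and mark the file's sha1 in the same request, which is what lets the per-document Celery task (and `backend/celery_task.py` itself) be deleted.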