From ef53590b58fe3902eb6abc311ed8de6818b757d0 Mon Sep 17 00:00:00 2001
From: Stan Girard
Date: Mon, 4 Dec 2023 11:59:08 +0100
Subject: [PATCH] feat(embedding): now 100 times faster ⚡️🔥 (#1807)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Description

Embedding creation is now batched. `process_file` collects all of a file's
chunks into one list, and `Neurons.create_vector` hands that list to
`documents_vector_store.add_documents` in a single call, instead of
dispatching one `create_embedding_for_document` Celery task per chunk. The
per-document task (and `backend/celery_task.py` with it) is removed, and the
brain-vector bookkeeping now runs inline in `process_file`. Eliminating the
per-chunk task round trips is what drives the claimed 100x speedup.

## Checklist before requesting a review

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

---
 backend/celery_task.py                   | 19 ---------
 backend/packages/embeddings/vectors.py   |  6 +--
 backend/packages/files/parsers/common.py | 49 ++++++++++++++++--------
 3 files changed, 36 insertions(+), 38 deletions(-)
 delete mode 100644 backend/celery_task.py

diff --git a/backend/celery_task.py b/backend/celery_task.py
deleted file mode 100644
index f18f232c8..000000000
--- a/backend/celery_task.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from celery import shared_task
-from models.settings import get_supabase_db
-from modules.brain.service.brain_vector_service import BrainVectorService
-from packages.embeddings.vectors import Neurons
-from repository.files.upload_file import DocumentSerializable
-
-
-@shared_task
-def create_embedding_for_document(brain_id, doc_with_metadata, file_sha1):
-    neurons = Neurons()
-    doc = DocumentSerializable.from_json(doc_with_metadata)
-    created_vector = neurons.create_vector(doc)
-    database = get_supabase_db()
-    database.set_file_sha_from_metadata(file_sha1)
-
-    created_vector_id = created_vector[0]  # pyright: ignore reportPrivateUsage=none
-
-    brain_vector_service = BrainVectorService(brain_id)
-    brain_vector_service.create_brain_vector(created_vector_id, file_sha1)
diff --git a/backend/packages/embeddings/vectors.py b/backend/packages/embeddings/vectors.py
index 34cbc707c..6ffad44e8 100644
--- a/backend/packages/embeddings/vectors.py
+++ b/backend/packages/embeddings/vectors.py
@@ -11,13 +11,13 @@ logger = get_logger(__name__)
 
 
 # TODO: Create interface for embeddings and implement it for Supabase and OpenAI (current Quivr)
 class Neurons(BaseModel):
-    def create_vector(self, doc):
+    def create_vector(self, docs):
         documents_vector_store = get_documents_vector_store()
         logger.info("Creating vector for document")
-        logger.info(f"Document: {doc}")
+        logger.info(f"Document: {docs}")
         try:
-            sids = documents_vector_store.add_documents([doc])
+            sids = documents_vector_store.add_documents(docs)
             if sids and len(sids) > 0:
                 return sids
 
diff --git a/backend/packages/files/parsers/common.py b/backend/packages/files/parsers/common.py
index cfd7d1b68..5a3e88c4d 100644
--- a/backend/packages/files/parsers/common.py
+++ b/backend/packages/files/parsers/common.py
@@ -1,7 +1,9 @@
 import time
 
-from celery_task import create_embedding_for_document
 from models import File
+from models.settings import get_supabase_db
+from modules.brain.service.brain_vector_service import BrainVectorService
+from packages.embeddings.vectors import Neurons
 from repository.files.upload_file import DocumentSerializable
 
 
@@ -10,25 +12,40 @@ async def process_file(
     loader_class,
     brain_id,
 ):
+    database = get_supabase_db()
     dateshort = time.strftime("%Y%m%d")
+    neurons = Neurons()
 
     file.compute_documents(loader_class)
 
-    for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
-        metadata = {
-            "file_sha1": file.file_sha1,
-            "file_size": file.file_size,
-            "file_name": file.file_name,
-            "chunk_size": file.chunk_size,
-            "chunk_overlap": file.chunk_overlap,
-            "date": dateshort,
-        }
-        doc_with_metadata = DocumentSerializable(
-            page_content=doc.page_content, metadata=metadata
+    metadata = {
+        "file_sha1": file.file_sha1,
+        "file_size": file.file_size,
+        "file_name": file.file_name,
+        "chunk_size": file.chunk_size,
+        "chunk_overlap": file.chunk_overlap,
+        "date": dateshort,
+    }
+    docs = []
+
+    if file.documents is not None:
+        for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
+            doc_with_metadata = DocumentSerializable(
+                page_content=doc.page_content, metadata=metadata
+            )
+            docs.append(doc_with_metadata)
+
+    created_vector = neurons.create_vector(docs)
+
+    brain_vector_service = BrainVectorService(brain_id)
+    for created_vector_id in created_vector:
+        brain_vector_service.create_brain_vector(
+            created_vector_id, metadata["file_sha1"]
         )
-        create_embedding_for_document.delay(
-            brain_id, doc_with_metadata.to_json(), file.file_sha1
-        )
+    database.set_file_sha_from_metadata(metadata["file_sha1"])
 
-    return len(file.documents)
+    if created_vector:
+        return len(created_vector)
+    else:
+        return 0
 
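For reviewers, a note on where the speedup comes from: the old flow issued one `add_documents([doc])` call, each behind its own Celery task, per chunk, while the new flow issues a single batched `add_documents(docs)` call per file. The sketch below illustrates that before/after call pattern; `FakeVectorStore` and the chunk list are hypothetical stand-ins for illustration, not Quivr code.

```python
# Illustrative sketch only: FakeVectorStore is a hypothetical stand-in for
# the real documents_vector_store, used here to count store round trips.
from typing import List


class FakeVectorStore:
    def __init__(self) -> None:
        self.round_trips = 0

    def add_documents(self, docs: List[str]) -> List[int]:
        # One call == one embedding/store round trip, whatever the batch size.
        self.round_trips += 1
        return list(range(len(docs)))  # fake vector ids, like the real `sids`


chunks = [f"chunk {i}" for i in range(100)]

# Old flow: one Celery task per chunk, each calling add_documents([doc]).
old_store = FakeVectorStore()
for chunk in chunks:
    old_store.add_documents([chunk])

# New flow: process_file batches every chunk into a single call.
new_store = FakeVectorStore()
sids = new_store.add_documents(chunks)

print(old_store.round_trips, new_store.round_trips)  # 100 vs. 1
```

The same batching explains the signature change in `vectors.py`: `create_vector(self, docs)` now receives the whole list and returns every `sid` at once, which `process_file` then fans out to `BrainVectorService.create_brain_vector` in a simple loop.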