feat(embedding): now 100 times faster 🔥 (#1807)

# Description

Document ingestion previously queued one Celery task per chunk: each chunk was serialized to JSON, pushed through the broker, picked up by a worker, and embedded in its own vector-store call. This PR removes that per-chunk task (`create_embedding_for_document`) and batches instead: `process_file` now builds the full list of `DocumentSerializable` chunks up front and hands it to `Neurons.create_vector`, which forwards the whole list to `documents_vector_store.add_documents` in a single call. Dropping the per-chunk broker round-trips and store calls is what yields the headline speedup. As a side effect, `process_file` now returns the number of vectors actually created (0 if none) rather than the raw chunk count, and the shared `metadata` dict is built once per file instead of once per chunk.
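As a rough illustration of where the speedup comes from (the figures below are assumptions for the sketch, not measurements from this PR), per-task dispatch overhead used to be paid once per chunk and is now paid once per file:

```python
# Back-of-envelope cost model; every number here is an assumption, not a measurement.
n_chunks = 500               # chunks in one large file (assumed)
per_task_overhead_s = 0.2    # broker round-trip + worker pickup per Celery task (assumed)
embed_call_s = 0.3           # latency of one embedding/vector-store call (assumed)

before = n_chunks * (per_task_overhead_s + embed_call_s)  # one task + one call per chunk
after = embed_call_s + n_chunks * 0.001                   # one batched call, ~1 ms marginal per chunk (assumed)

print(f"before ≈ {before:.0f} s, after ≈ {after:.1f} s, speedup ≈ {before / after:.0f}x")
# before ≈ 250 s, after ≈ 0.8 s, speedup ≈ 312x
```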

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] Ideally, I have added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

Commit `ef53590b58` (parent `b47f411372`) by Stan Girard, 2023-12-04 11:59:08 +01:00, committed via GitHub.
3 changed files with 36 additions and 38 deletions

### Deleted file: the per-document Celery task

```diff
@@ -1,19 +0,0 @@
-from celery import shared_task
-from models.settings import get_supabase_db
-from modules.brain.service.brain_vector_service import BrainVectorService
-from packages.embeddings.vectors import Neurons
-from repository.files.upload_file import DocumentSerializable
-
-
-@shared_task
-def create_embedding_for_document(brain_id, doc_with_metadata, file_sha1):
-    neurons = Neurons()
-    doc = DocumentSerializable.from_json(doc_with_metadata)
-    created_vector = neurons.create_vector(doc)
-
-    database = get_supabase_db()
-    database.set_file_sha_from_metadata(file_sha1)
-
-    created_vector_id = created_vector[0]  # pyright: ignore reportPrivateUsage=none
-
-    brain_vector_service = BrainVectorService(brain_id)
-    brain_vector_service.create_brain_vector(created_vector_id, file_sha1)
```
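For contrast with the batched path, here is a minimal sketch of the pattern this deletion retires (hypothetical names, not the project's code): one queued task, and therefore one broker round-trip and one store call, per chunk.

```python
# Hypothetical sketch of the retired per-chunk pattern; not Quivr code.
from celery import shared_task

@shared_task
def embed_one_chunk(chunk_json: str) -> None:
    # deserialize the chunk, embed it, write a single vector row
    ...

def enqueue_all(chunks: list[str]) -> None:
    # N chunks => N broker round-trips, N worker pickups, N embedding calls
    for chunk in chunks:
        embed_one_chunk.delay(chunk)
```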

### Changed file: `Neurons.create_vector` accepts a batch of documents

```diff
@@ -11,13 +11,13 @@ logger = get_logger(__name__)
 
 # TODO: Create interface for embeddings and implement it for Supabase and OpenAI (current Quivr)
 class Neurons(BaseModel):
-    def create_vector(self, doc):
+    def create_vector(self, docs):
         documents_vector_store = get_documents_vector_store()
 
         logger.info("Creating vector for document")
-        logger.info(f"Document: {doc}")
+        logger.info(f"Document: {docs}")
         try:
-            sids = documents_vector_store.add_documents([doc])
+            sids = documents_vector_store.add_documents(docs)
             if sids and len(sids) > 0:
                 return sids
```
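The call site now passes the whole list at once. A usage sketch, assuming the LangChain-style `Document` and `add_documents` interface the surrounding code appears to use:

```python
# Sketch of the new call shape; assumes a LangChain-style vector store whose
# add_documents(docs) embeds the batch and returns one id per document.
from langchain.schema import Document

docs = [
    Document(page_content="chunk one", metadata={"file_sha1": "abc123"}),
    Document(page_content="chunk two", metadata={"file_sha1": "abc123"}),
]

# Before: create_vector(doc) wrapped each document as [doc] -- one store call per chunk.
# After:  create_vector(docs) forwards the list -- one store call for the whole file.
# ids = Neurons().create_vector(docs)   # e.g. ["id-1", "id-2"], one id per document
```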

### Changed file: `process_file` builds the batch and embeds it in one call

```diff
@@ -1,7 +1,9 @@
 import time
 
-from celery_task import create_embedding_for_document
 from models import File
 from models.settings import get_supabase_db
+from modules.brain.service.brain_vector_service import BrainVectorService
+from packages.embeddings.vectors import Neurons
 from repository.files.upload_file import DocumentSerializable
@@ -10,25 +12,40 @@ async def process_file(
     loader_class,
     brain_id,
 ):
     database = get_supabase_db()
     dateshort = time.strftime("%Y%m%d")
+    neurons = Neurons()
 
     file.compute_documents(loader_class)
 
-    for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
-        metadata = {
-            "file_sha1": file.file_sha1,
-            "file_size": file.file_size,
-            "file_name": file.file_name,
-            "chunk_size": file.chunk_size,
-            "chunk_overlap": file.chunk_overlap,
-            "date": dateshort,
-        }
-        doc_with_metadata = DocumentSerializable(
-            page_content=doc.page_content, metadata=metadata
-        )
-
-        create_embedding_for_document.delay(
-            brain_id, doc_with_metadata.to_json(), file.file_sha1
-        )
-
-    return len(file.documents)
+    metadata = {
+        "file_sha1": file.file_sha1,
+        "file_size": file.file_size,
+        "file_name": file.file_name,
+        "chunk_size": file.chunk_size,
+        "chunk_overlap": file.chunk_overlap,
+        "date": dateshort,
+    }
+    docs = []
+
+    if file.documents is not None:
+        for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
+            doc_with_metadata = DocumentSerializable(
+                page_content=doc.page_content, metadata=metadata
+            )
+            docs.append(doc_with_metadata)
+
+    created_vector = neurons.create_vector(docs)
+
+    brain_vector_service = BrainVectorService(brain_id)
+    for created_vector_id in created_vector:
+        brain_vector_service.create_brain_vector(
+            created_vector_id, metadata["file_sha1"]
+        )
+
+    database.set_file_sha_from_metadata(metadata["file_sha1"])
+
+    if created_vector:
+        return len(created_vector)
+    else:
+        return 0
```
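Putting the pieces together, a self-contained sketch of the new control flow (stand-in store and names, hypothetical; the real code wires in Supabase and `BrainVectorService`). It also shows the changed return contract: the count of vectors actually created, 0 when nothing was embedded, rather than the raw chunk count.

```python
# Self-contained sketch of the batched flow (hypothetical stand-ins, not Quivr code).
from dataclasses import dataclass, field

@dataclass
class Doc:
    page_content: str
    metadata: dict = field(default_factory=dict)

def add_documents_batch(docs: list[Doc]) -> list[str]:
    # stand-in for documents_vector_store.add_documents(docs): one batched call
    return [f"vec-{i}" for i in range(len(docs))]

def process_file_sketch(chunks: list[str], metadata: dict) -> int:
    docs = [Doc(page_content=c, metadata=metadata) for c in chunks]
    created = add_documents_batch(docs)   # single embed/store round-trip per file
    for vector_id in created:
        pass  # real code: brain_vector_service.create_brain_vector(vector_id, sha1)
    return len(created) if created else 0

assert process_file_sketch(["a", "b", "c"], {"file_sha1": "abc"}) == 3
assert process_file_sketch([], {"file_sha1": "abc"}) == 0
```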