Mirror of https://github.com/QuivrHQ/quivr.git, last synced 2024-12-15 01:21:48 +03:00.
feat(embedding): now 100 times faster ⚡️🔥 (#1807)
# Description This change speeds up embedding by batching all of a file's document chunks into a single `neurons.create_vector(docs)` call inside `process_file`, instead of dispatching one `create_embedding_for_document` Celery task per chunk; the now-unused per-document task module is deleted. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate):
This commit is contained in:
parent
b47f411372
commit
ef53590b58
@ -1,19 +0,0 @@
|
||||
from celery import shared_task
|
||||
from models.settings import get_supabase_db
|
||||
from modules.brain.service.brain_vector_service import BrainVectorService
|
||||
from packages.embeddings.vectors import Neurons
|
||||
from repository.files.upload_file import DocumentSerializable
|
||||
|
||||
|
||||
@shared_task
def create_embedding_for_document(brain_id, doc_with_metadata, file_sha1):
    """Celery task: embed one serialized document and register its vector.

    Args:
        brain_id: identifier of the brain the resulting vector belongs to.
        doc_with_metadata: JSON string produced by
            ``DocumentSerializable.to_json()``; deserialized here before
            embedding.
        file_sha1: SHA-1 of the source file, recorded in the database and
            attached to the brain/vector association.
    """
    neurons = Neurons()
    doc = DocumentSerializable.from_json(doc_with_metadata)
    created_vector = neurons.create_vector(doc)

    database = get_supabase_db()
    database.set_file_sha_from_metadata(file_sha1)

    # create_vector only returns ids when the vector store produced some;
    # on a None/empty result, indexing [0] would raise TypeError/IndexError,
    # so bail out instead of crashing the worker.
    if not created_vector:
        return

    created_vector_id = created_vector[0]  # pyright: ignore reportPrivateUsage=none

    brain_vector_service = BrainVectorService(brain_id)
    brain_vector_service.create_brain_vector(created_vector_id, file_sha1)
|
@ -11,13 +11,13 @@ logger = get_logger(__name__)
|
||||
|
||||
# TODO: Create interface for embeddings and implement it for Supabase and OpenAI (current Quivr)
|
||||
class Neurons(BaseModel):
|
||||
def create_vector(self, doc):
|
||||
def create_vector(self, docs):
|
||||
documents_vector_store = get_documents_vector_store()
|
||||
logger.info("Creating vector for document")
|
||||
logger.info(f"Document: {doc}")
|
||||
logger.info(f"Document: {docs}")
|
||||
|
||||
try:
|
||||
sids = documents_vector_store.add_documents([doc])
|
||||
sids = documents_vector_store.add_documents(docs)
|
||||
if sids and len(sids) > 0:
|
||||
return sids
|
||||
|
||||
|
@ -1,7 +1,9 @@
|
||||
import time
|
||||
|
||||
from celery_task import create_embedding_for_document
|
||||
from models import File
|
||||
from models.settings import get_supabase_db
|
||||
from modules.brain.service.brain_vector_service import BrainVectorService
|
||||
from packages.embeddings.vectors import Neurons
|
||||
from repository.files.upload_file import DocumentSerializable
|
||||
|
||||
|
||||
@ -10,25 +12,40 @@ async def process_file(
|
||||
loader_class,
|
||||
brain_id,
|
||||
):
|
||||
database = get_supabase_db()
|
||||
dateshort = time.strftime("%Y%m%d")
|
||||
neurons = Neurons()
|
||||
|
||||
file.compute_documents(loader_class)
|
||||
|
||||
for doc in file.documents: # pyright: ignore reportPrivateUsage=none
|
||||
metadata = {
|
||||
"file_sha1": file.file_sha1,
|
||||
"file_size": file.file_size,
|
||||
"file_name": file.file_name,
|
||||
"chunk_size": file.chunk_size,
|
||||
"chunk_overlap": file.chunk_overlap,
|
||||
"date": dateshort,
|
||||
}
|
||||
doc_with_metadata = DocumentSerializable(
|
||||
page_content=doc.page_content, metadata=metadata
|
||||
metadata = {
|
||||
"file_sha1": file.file_sha1,
|
||||
"file_size": file.file_size,
|
||||
"file_name": file.file_name,
|
||||
"chunk_size": file.chunk_size,
|
||||
"chunk_overlap": file.chunk_overlap,
|
||||
"date": dateshort,
|
||||
}
|
||||
docs = []
|
||||
|
||||
if file.documents is not None:
|
||||
for doc in file.documents: # pyright: ignore reportPrivateUsage=none
|
||||
doc_with_metadata = DocumentSerializable(
|
||||
page_content=doc.page_content, metadata=metadata
|
||||
)
|
||||
docs.append(doc_with_metadata)
|
||||
|
||||
created_vector = neurons.create_vector(docs)
|
||||
|
||||
brain_vector_service = BrainVectorService(brain_id)
|
||||
for created_vector_id in created_vector:
|
||||
brain_vector_service.create_brain_vector(
|
||||
created_vector_id, metadata["file_sha1"]
|
||||
)
|
||||
|
||||
create_embedding_for_document.delay(
|
||||
brain_id, doc_with_metadata.to_json(), file.file_sha1
|
||||
)
|
||||
database.set_file_sha_from_metadata(metadata["file_sha1"])
|
||||
|
||||
return len(file.documents)
|
||||
if created_vector:
|
||||
return len(created_vector)
|
||||
else:
|
||||
return 0
|
||||
|
Loading…
Reference in New Issue
Block a user