feat(upload): changed to task (#1178)

This commit is contained in:
Stan Girard 2023-09-15 15:52:06 +02:00 committed by GitHub
parent 223d3d9102
commit 980a704002
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 63 additions and 12 deletions

19
backend/celery_task.py Normal file
View File

@ -0,0 +1,19 @@
from celery import shared_task
from models.brains import Brain
from repository.files.upload_file import DocumentSerializable
from utils.vectors import Neurons
@shared_task
def create_embedding_for_document(
    brain_id, doc_with_metadata, user_openai_api_key, file_sha1
):
    """Create an embedding for one document and attach it to a brain.

    Registered as a Celery shared task.

    Args:
        brain_id: Identifier handed to ``Brain(id=...)``.
        doc_with_metadata: JSON string accepted by
            ``DocumentSerializable.from_json``.
        user_openai_api_key: Forwarded unchanged to ``Neurons.create_vector``.
        file_sha1: Passed to ``Brain.create_brain_vector`` together with the
            id of the created vector.
    """
    embedder = Neurons()
    document = DocumentSerializable.from_json(doc_with_metadata)
    vector_ids = embedder.create_vector(document, user_openai_api_key)
    # NOTE: usage tracking (the former ``add_usage`` call) is disabled here.

    # Only the first element returned by create_vector is linked to the brain.
    first_vector_id = vector_ids[0]  # pyright: ignore reportPrivateUsage=none
    Brain(id=brain_id).create_brain_vector(first_vector_id, file_sha1)

View File

@ -1,8 +1,8 @@
import time import time
from langchain.schema import Document from celery_task import create_embedding_for_document
from models import Brain, File from models import File
from utils.vectors import Neurons from repository.files.upload_file import DocumentSerializable
async def process_file( async def process_file(
@ -26,15 +26,12 @@ async def process_file(
"date": dateshort, "date": dateshort,
"summarization": "true" if enable_summarization else "false", "summarization": "true" if enable_summarization else "false",
} }
doc_with_metadata = Document(page_content=doc.page_content, metadata=metadata) doc_with_metadata = DocumentSerializable(
page_content=doc.page_content, metadata=metadata
)
neurons = Neurons() create_embedding_for_document.delay(
created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key) brain_id, doc_with_metadata.to_json(), user_openai_api_key, file.file_sha1
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap}) )
created_vector_id = created_vector[0] # pyright: ignore reportPrivateUsage=none
brain = Brain(id=brain_id)
brain.create_brain_vector(created_vector_id, file.file_sha1)
return return

View File

@ -1,6 +1,9 @@
import json
from multiprocessing import get_logger from multiprocessing import get_logger
from httpx import Response from httpx import Response
from langchain.pydantic_v1 import Field
from langchain.schema import Document
from models import get_supabase_client from models import get_supabase_client
from supabase.client import Client from supabase.client import Client
@ -19,3 +22,35 @@ def upload_file_storage(file, file_identifier: str) -> Response:
logger.error(e) logger.error(e)
print(e) print(e)
return response return response
class DocumentSerializable(Document):
    """Document that can be round-tripped through a JSON string.

    ``to_json`` and ``from_json`` carry exactly two fields: ``page_content``
    and ``metadata``.
    """

    page_content: str
    metadata: dict = Field(default_factory=dict)

    @property
    def lc_serializable(self) -> bool:
        """Return ``True`` unconditionally."""
        # NOTE(review): presumably the LangChain serialization flag — confirm
        # against the installed langchain version.
        return True

    def __repr__(self):
        """Return a preview with the first 50 characters of the content."""
        return f"Document(page_content='{self.page_content[:50]}...', metadata={self.metadata})"

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

    def to_json(self) -> str:
        """Convert the Document object to a JSON string."""
        return json.dumps(
            {
                "page_content": self.page_content,
                "metadata": self.metadata,
            }
        )

    @classmethod
    def from_json(cls, json_str: str) -> "DocumentSerializable":
        """Create a Document object from a JSON string.

        Args:
            json_str: JSON object with a ``page_content`` key and an optional
                ``metadata`` key.

        Raises:
            KeyError: If ``page_content`` is missing from the payload.
        """
        data = json.loads(json_str)
        # ``metadata`` is optional on the model (default_factory=dict), so a
        # payload that omits it or sets it to null falls back to an empty
        # dict instead of failing.
        metadata = data.get("metadata")
        if metadata is None:
            metadata = {}
        return cls(page_content=data["page_content"], metadata=metadata)