Mirror of https://github.com/StanGirard/quivr.git
feat(upload): changed to task (#1178)
parent 223d3d9102
commit 980a704002
backend/celery_task.py (new file, 19 lines)

@@ -0,0 +1,19 @@
+from celery import shared_task
+from models.brains import Brain
+from repository.files.upload_file import DocumentSerializable
+from utils.vectors import Neurons
+
+
+@shared_task
+def create_embedding_for_document(
+    brain_id, doc_with_metadata, user_openai_api_key, file_sha1
+):
+    neurons = Neurons()
+    doc = DocumentSerializable.from_json(doc_with_metadata)
+    created_vector = neurons.create_vector(doc, user_openai_api_key)
+    # add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
+
+    created_vector_id = created_vector[0]  # pyright: ignore reportPrivateUsage=none
+
+    brain = Brain(id=brain_id)
+    brain.create_brain_vector(created_vector_id, file_sha1)
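Note: `@shared_task` registers `create_embedding_for_document` with whatever Celery application the worker process initializes; the commit itself does not show that wiring. A minimal sketch of the kind of setup this assumes (module name, broker URL, and the choice of Redis are illustrative, not taken from this repo):

# celery_app.py -- hypothetical app wiring, not part of this commit
from celery import Celery

celery_app = Celery(
    "quivr",
    broker="redis://localhost:6379/0",  # assumed broker; any Celery-supported broker works
)
celery_app.conf.task_serializer = "json"  # task args must be JSON-friendly, hence to_json()

# Importing the module lets its @shared_task functions bind to this app.
import celery_task  # noqa: E402

A worker started with `celery -A celery_app worker` would then pick the task up from the queue.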
@@ -1,8 +1,8 @@
 import time

-from langchain.schema import Document
-from models import Brain, File
-from utils.vectors import Neurons
+from celery_task import create_embedding_for_document
+from models import File
+from repository.files.upload_file import DocumentSerializable


 async def process_file(
@@ -26,15 +26,12 @@ async def process_file(
         "date": dateshort,
         "summarization": "true" if enable_summarization else "false",
     }
-    doc_with_metadata = Document(page_content=doc.page_content, metadata=metadata)
+    doc_with_metadata = DocumentSerializable(
+        page_content=doc.page_content, metadata=metadata
+    )

-    neurons = Neurons()
-    created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
-    # add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
-
-    created_vector_id = created_vector[0]  # pyright: ignore reportPrivateUsage=none
-
-    brain = Brain(id=brain_id)
-    brain.create_brain_vector(created_vector_id, file.file_sha1)
+    create_embedding_for_document.delay(
+        brain_id, doc_with_metadata.to_json(), user_openai_api_key, file.file_sha1
+    )

     return
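The net effect: `process_file` no longer computes the embedding inline. `create_embedding_for_document.delay(...)` is Celery shorthand for `apply_async` with positional arguments; it enqueues the task and returns immediately, so the upload request finishes while a worker does the embedding. Because the arguments travel through the broker, the document is serialized with `to_json()` rather than passed as a live object. For illustration, the equivalent explicit call:

# Equivalent to the .delay(...) call above (illustrative only).
async_result = create_embedding_for_document.apply_async(
    args=(brain_id, doc_with_metadata.to_json(), user_openai_api_key, file.file_sha1)
)
# process_file ignores the AsyncResult: the worker persists the vector itself,
# so this is fire-and-forget from the request handler's point of view.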
@@ -1,6 +1,9 @@
+import json
 from multiprocessing import get_logger

 from httpx import Response
+from langchain.pydantic_v1 import Field
+from langchain.schema import Document
 from models import get_supabase_client
 from supabase.client import Client

@@ -19,3 +22,35 @@ def upload_file_storage(file, file_identifier: str) -> Response:
         logger.error(e)
         print(e)
     return response
+
+
+class DocumentSerializable(Document):
+    """Class for storing a piece of text and associated metadata."""
+
+    page_content: str
+    metadata: dict = Field(default_factory=dict)
+
+    @property
+    def lc_serializable(self) -> bool:
+        return True
+
+    def __repr__(self):
+        return f"Document(page_content='{self.page_content[:50]}...', metadata={self.metadata})"
+
+    def __str__(self):
+        return self.__repr__()
+
+    def to_json(self) -> str:
+        """Convert the Document object to a JSON string."""
+        return json.dumps(
+            {
+                "page_content": self.page_content,
+                "metadata": self.metadata,
+            }
+        )
+
+    @classmethod
+    def from_json(cls, json_str: str):
+        """Create a Document object from a JSON string."""
+        data = json.loads(json_str)
+        return cls(page_content=data["page_content"], metadata=data["metadata"])
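The `to_json`/`from_json` pair is what makes the hand-off in the two hunks above work: the producer serializes before `.delay(...)`, and the task deserializes before embedding. A quick round trip (illustrative values):

doc = DocumentSerializable(
    page_content="Quivr stores your documents as embeddings.",
    metadata={"file_name": "notes.txt", "chunk_size": 500},
)
payload = doc.to_json()  # a plain JSON string, safe to send through the broker
restored = DocumentSerializable.from_json(payload)
assert restored.page_content == doc.page_content
assert restored.metadata == doc.metadata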