mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-18 11:51:41 +03:00
31d3cce97f
This pull request updates the `docker-compose.dev.yml` and `Dockerfile.dev` files, along with related application code. The changes aim to improve performance and fix bugs. The updates include:
- Removing the unnecessary workers configuration in the `docker-compose.dev.yml` file.
- Updating the base image in `Dockerfile.dev` to use a slim version.
- Adjusting the schedule for a specific task in the code.
- Modifying the time interval for retrieving active syncs.
- Changing the loader class for processing PowerPoint files.
- Refactoring the file existence check logic.
- Adding debug logs for the file existence check and file removal.
- Adjusting the file synchronization logic.
These changes are intended to enhance the performance and stability of the application.
136 lines
4.2 KiB
Python
136 lines
4.2 KiB
Python
import json
|
|
import os
|
|
from multiprocessing import get_logger
|
|
|
|
from langchain.pydantic_v1 import Field
|
|
from langchain.schema import Document
|
|
from logger import get_logger
|
|
from models import get_supabase_client
|
|
from supabase.client import Client
|
|
|
|
# Module-level logger. `get_logger` is imported twice above (from
# `multiprocessing` and from `logger`); the later import wins, so this call
# resolves to `logger.get_logger`.
logger = get_logger(__name__)
|
|
|
|
|
|
# Content types used when uploading to storage, keyed by file extension
# (leading dot included). Lookups are exact-match on the key.
_OPENXML = "application/vnd.openxmlformats-officedocument"

mime_types = {
    # Plain text and markup
    ".txt": "text/plain",
    ".csv": "text/csv",
    ".md": "text/markdown",
    ".markdown": "text/markdown",
    ".telegram": "application/x-telegram",
    # Audio and video
    ".m4a": "audio/mp4",
    ".mp3": "audio/mpeg",
    ".webm": "audio/webm",
    ".mp4": "video/mp4",
    ".mpga": "audio/mpeg",
    ".wav": "audio/wav",
    ".mpeg": "video/mpeg",
    # Documents
    ".pdf": "application/pdf",
    ".html": "text/html",
    ".pptx": f"{_OPENXML}.presentationml.presentation",
    ".docx": f"{_OPENXML}.wordprocessingml.document",
    ".odt": "application/vnd.oasis.opendocument.text",
    ".xlsx": f"{_OPENXML}.spreadsheetml.sheet",
    ".xls": "application/vnd.ms-excel",
    ".epub": "application/epub+zip",
    # Source code and notebooks
    ".ipynb": "application/x-ipynb+json",
    ".py": "text/x-python",
}
|
|
|
|
|
|
def check_file_exists(brain_id: str, file_identifier: str) -> bool:
    """Report whether a file with the same base name is stored under a brain.

    Lists the objects under ``brain_id`` in the "quivr" storage bucket and
    compares each object's name to ``file_identifier`` with the final
    extension removed from both, so ``report.pdf`` and ``report.docx`` are
    treated as the same file.

    Args:
        brain_id: Path passed to the bucket's ``list`` call.
        file_identifier: File name to look for.

    Returns:
        True if a listed object shares the base name, False otherwise.
        Also True when the lookup raises: the error is logged and the file
        is reported as existing.
    """
    supabase_client: Client = get_supabase_client()
    try:
        logger.info(f"Checking if file {file_identifier} exists.")
        # This needs to be converted into a file_identifier that is safe for a URL
        # NOTE(review): storage list calls may be paginated; confirm a single
        # list() returns every object for large folders.
        response = supabase_client.storage.from_("quivr").list(brain_id)

        # Strip only the last extension. Cutting at the first dot would make
        # "report.v1.pdf" collide with "report.v2.pdf", and every dotfile
        # collide with every other dotfile.
        target_stem = os.path.splitext(file_identifier)[0]
        file_exists = any(
            os.path.splitext(file["name"])[0] == target_stem for file in response
        )
        logger.info(f"File identifier: {file_identifier}")
        logger.info(f"File exists: {file_exists}")
        if file_exists:
            logger.info(f"File {file_identifier} exists.")
            return True
        else:
            logger.info(f"File {file_identifier} does not exist.")
            return False
    except Exception as e:
        logger.error(f"An error occurred while checking the file: {e}")
        # Presumably deliberate so callers skip work when storage cannot be
        # queried -- TODO confirm with callers.
        return True
|
|
|
|
|
|
def upload_file_storage(file, file_identifier: str, upsert: str = "false"):
    """Upload ``file`` to the "quivr" storage bucket under ``file_identifier``.

    The content type is looked up in ``mime_types`` from the identifier's
    extension, falling back to ``"text/html"`` for unknown extensions. If the
    upload fails because the object already exists and ``upsert`` is
    ``"true"``, the existing object is updated instead.

    Args:
        file: Payload handed to the storage client as-is.
        file_identifier: Destination path in the bucket; its extension selects
            the content type.
        upsert: ``"true"`` or ``"false"`` (string), forwarded in the file
            options and used to decide whether to fall back to ``update``.

    Returns:
        The storage client's response from ``upload``, or from ``update`` when
        the upsert fallback was taken.

    Raises:
        Exception: Any error from the storage client other than the handled
            "already exists" case is re-raised unchanged.
    """
    supabase_client: Client = get_supabase_client()

    # Resolve the content type before the try block so the fallback branch
    # can never see it unbound.
    _, file_extension = os.path.splitext(file_identifier)
    mime_type = mime_types.get(file_extension, "text/html")
    file_options = {
        "content-type": mime_type,
        "upsert": upsert,
        "cache-control": "3600",
    }

    try:
        # Pass a copy: the options are reused below and the client is free to
        # consume entries from the dict it receives.
        return supabase_client.storage.from_("quivr").upload(
            file_identifier,
            file,
            file_options=file_options.copy(),
        )
    except Exception as e:
        if "The resource already exists" in str(e) and upsert == "true":
            # Return the update response; previously it was assigned but
            # dropped, so callers received None after a successful upsert.
            return supabase_client.storage.from_("quivr").update(
                file_identifier,
                file,
                file_options=file_options.copy(),
            )
        # Bare raise keeps the original traceback intact.
        raise
|
|
|
|
|
|
class DocumentSerializable(Document):
    """A ``Document`` (text plus metadata) with JSON round-trip helpers."""

    page_content: str
    metadata: dict = Field(default_factory=dict)

    @property
    def lc_serializable(self) -> bool:
        # Always reported as serializable.
        return True

    def __repr__(self):
        # Show only the first 50 characters of the text; the ellipsis is
        # appended unconditionally.
        preview = self.page_content[:50]
        return f"Document(page_content='{preview}...', metadata={self.metadata})"

    def __str__(self):
        # The string form is the same as the repr.
        return repr(self)

    def to_json(self) -> str:
        """Serialize the text and metadata to a JSON string."""
        payload = {
            "page_content": self.page_content,
            "metadata": self.metadata,
        }
        return json.dumps(payload)

    @classmethod
    def from_json(cls, json_str: str):
        """Build an instance from a JSON string produced by ``to_json``.

        Both the ``page_content`` and ``metadata`` keys must be present.
        """
        parsed = json.loads(json_str)
        return cls(
            page_content=parsed["page_content"],
            metadata=parsed["metadata"],
        )
|