quivr/backend/repository/files/upload_file.py

import json
import os
from multiprocessing import get_logger

from langchain.pydantic_v1 import Field
from langchain.schema import Document
from models import get_supabase_client
from supabase.client import Client

# Module-level logger.
# NOTE(review): get_logger is imported from multiprocessing at the top of this
# file, not from logging — confirm that is intended.
logger = get_logger()

# Mapping of file extensions to MIME types.
# Keys are lowercase and include the leading dot, i.e. the form returned by
# os.path.splitext(). upload_file_storage() sends the looked-up value as the
# "content-type" file option and falls back to "text/html" for extensions that
# are not listed here.
mime_types = {
    # Text-like formats
    ".txt": "text/plain",
    ".csv": "text/csv",
    ".md": "text/markdown",
    ".markdown": "text/markdown",
    ".telegram": "application/x-telegram",
    # Audio and video (note: ".webm" is mapped to an audio type here)
    ".m4a": "audio/mp4",
    ".mp3": "audio/mpeg",
    ".webm": "audio/webm",
    ".mp4": "video/mp4",
    ".mpga": "audio/mpeg",
    ".wav": "audio/wav",
    ".mpeg": "video/mpeg",
    # Documents, spreadsheets, e-books, notebooks and source files
    ".pdf": "application/pdf",
    ".html": "text/html",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".odt": "application/vnd.oasis.opendocument.text",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".xls": "application/vnd.ms-excel",
    ".epub": "application/epub+zip",
    ".ipynb": "application/x-ipynb+json",
    ".py": "text/x-python",
}


def upload_file_storage(file, file_identifier: str, upsert: str = "false"):
    """Upload ``file`` to the "quivr" storage bucket under ``file_identifier``.

    The content type is chosen from the extension of ``file_identifier``,
    matched case-insensitively against ``mime_types``; extensions that are not
    in the mapping fall back to "text/html".

    Args:
        file: Payload passed through unchanged to the storage client.
        file_identifier: Object path inside the bucket. Its extension selects
            the MIME type.
        upsert: The string "true" or "false", forwarded as the "upsert" file
            option. When it is "true" and the upload fails because the
            resource already exists, the existing object is updated instead.

    Returns:
        The storage client's response from the upload, or from the update when
        the upload was retried as an update.

    Raises:
        Exception: Any error raised by the storage client other than the
            handled "already exists" case is re-raised unchanged.
    """
    supabase_client: Client = get_supabase_client()

    # Resolve the MIME type before entering the try block so the name is
    # always bound when the fallback branch below needs it.
    _, file_extension = os.path.splitext(file_identifier)
    mime_type = mime_types.get(file_extension.lower(), "text/html")

    file_options = {
        "content-type": mime_type,
        "upsert": upsert,
        "cache-control": "3600",
    }

    try:
        # A fresh copy of the options is passed to each call in case the
        # client modifies the dict it receives.
        return supabase_client.storage.from_("quivr").upload(
            file_identifier,
            file,
            file_options=dict(file_options),
        )
    except Exception as e:
        if "The resource already exists" in str(e) and upsert == "true":
            # The object is already stored and the caller asked for upsert:
            # overwrite it and hand the update response back to the caller
            # (previously this response was dropped and None was returned).
            return supabase_client.storage.from_("quivr").update(
                file_identifier,
                file,
                file_options=dict(file_options),
            )
        # Bare raise keeps the original traceback intact.
        raise


class DocumentSerializable(Document):
    """A ``Document`` whose text and metadata can round-trip through JSON."""

    # Text body of the document.
    page_content: str
    # Free-form metadata; every instance gets its own empty dict by default.
    metadata: dict = Field(default_factory=dict)

    @property
    def lc_serializable(self) -> bool:
        # Always reports the document as serializable.
        return True

    def __repr__(self):
        # Only the first 50 characters of the content are shown.
        preview = self.page_content[:50]
        return f"Document(page_content='{preview}...', metadata={self.metadata})"

    def __str__(self):
        # The string form is the same as the repr.
        return repr(self)

    def to_json(self) -> str:
        """Serialize the page content and metadata to a JSON string."""
        payload = {
            "page_content": self.page_content,
            "metadata": self.metadata,
        }
        return json.dumps(payload)

    @classmethod
    def from_json(cls, json_str: str):
        """Build an instance from a JSON string as produced by ``to_json``."""
        parsed = json.loads(json_str)
        content = parsed["page_content"]
        meta = parsed["metadata"]
        return cls(page_content=content, metadata=meta)
feat(upload): changed to task (#1178) 2023-09-15 16:52:06 +03:00			`import json`
fix: 🐛 upload (#2112) now you can download & view pdf # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-01-29 08:35:10 +03:00			`import os`
feat(file-system): added queue and filesystem (#1159) * feat(queue): added * feat(crawling): added queue * fix(crawler): fixed github * feat(docker): simplified docker compose * feat(celery): added worker * feat(files): now uploaded * feat(files): missing routes * feat(delete): added * feat(storage): added policy and migrations * feat(sqs): implemented * feat(redis): added queue name variable * fix(task): updated * style(env): emoved unused env * ci(tests): removed broken tests 2023-09-14 12:56:59 +03:00			`from multiprocessing import get_logger`

feat(upload): changed to task (#1178) 2023-09-15 16:52:06 +03:00			`from langchain.pydantic_v1 import Field`
			`from langchain.schema import Document`
feat(file-system): added queue and filesystem (#1159) * feat(queue): added * feat(crawling): added queue * fix(crawler): fixed github * feat(docker): simplified docker compose * feat(celery): added worker * feat(files): now uploaded * feat(files): missing routes * feat(delete): added * feat(storage): added policy and migrations * feat(sqs): implemented * feat(redis): added queue name variable * fix(task): updated * style(env): emoved unused env * ci(tests): removed broken tests 2023-09-14 12:56:59 +03:00			`from models import get_supabase_client`
			`from supabase.client import Client`

			`logger = get_logger()`

fix: 🐛 upload (#2112) now you can download & view pdf # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-01-29 08:35:10 +03:00			`# Mapping of file extensions to MIME types`
			`mime_types = {`
			`".txt": "text/plain",`
			`".csv": "text/csv",`
			`".md": "text/markdown",`
			`".markdown": "text/markdown",`
			`".telegram": "application/x-telegram",`
			`".m4a": "audio/mp4",`
			`".mp3": "audio/mpeg",`
			`".webm": "audio/webm",`
			`".mp4": "video/mp4",`
			`".mpga": "audio/mpeg",`
			`".wav": "audio/wav",`
			`".mpeg": "video/mpeg",`
			`".pdf": "application/pdf",`
			`".html": "text/html",`
			`".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",`
			`".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",`
			`".odt": "application/vnd.oasis.opendocument.text",`
			`".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",`
			`".xls": "application/vnd.ms-excel",`
			`".epub": "application/epub+zip",`
			`".ipynb": "application/x-ipynb+json",`
			`".py": "text/x-python",`
			`}`

feat(file-system): added queue and filesystem (#1159) * feat(queue): added * feat(crawling): added queue * fix(crawler): fixed github * feat(docker): simplified docker compose * feat(celery): added worker * feat(files): now uploaded * feat(files): missing routes * feat(delete): added * feat(storage): added policy and migrations * feat(sqs): implemented * feat(redis): added queue name variable * fix(task): updated * style(env): emoved unused env * ci(tests): removed broken tests 2023-09-14 12:56:59 +03:00
feat(notion): added custom integration (#2268) This pull request adds a custom integration feature and sync functionality to the application. It includes the following changes: - Added a new integration entity for custom integrations. - Implemented the ability to load and poll the custom integration. - Added a task to sync the custom integration with the user's brain. - Updated the celery beat schedule to include the new task. Please review and merge this pull request. 2024-02-28 08:30:25 +03:00			`def upload_file_storage(file, file_identifier: str, upsert: str = "false"):`
feat(file-system): added queue and filesystem (#1159) * feat(queue): added * feat(crawling): added queue * fix(crawler): fixed github * feat(docker): simplified docker compose * feat(celery): added worker * feat(files): now uploaded * feat(files): missing routes * feat(delete): added * feat(storage): added policy and migrations * feat(sqs): implemented * feat(redis): added queue name variable * fix(task): updated * style(env): emoved unused env * ci(tests): removed broken tests 2023-09-14 12:56:59 +03:00			`supabase_client: Client = get_supabase_client()`
			`response = None`

			`try:`
fix: 🐛 upload (#2112) now you can download & view pdf # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-01-29 08:35:10 +03:00			`# Get the file extension`
			`_, file_extension = os.path.splitext(file_identifier)`

			`# Get the MIME type for the file extension`
			`mime_type = mime_types.get(file_extension, "text/html")`

			`response = supabase_client.storage.from_("quivr").upload(`
feat(notion): added custom integration (#2268) This pull request adds a custom integration feature and sync functionality to the application. It includes the following changes: - Added a new integration entity for custom integrations. - Implemented the ability to load and poll the custom integration. - Added a task to sync the custom integration with the user's brain. - Updated the celery beat schedule to include the new task. Please review and merge this pull request. 2024-02-28 08:30:25 +03:00			`file_identifier,`
			`file,`
			`file_options={`
			`"content-type": mime_type,`
			`"upsert": upsert,`
			`"cache-control": "3600",`
			`},`
fix: 🐛 upload (#2112) now you can download & view pdf # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2024-01-29 08:35:10 +03:00			`)`

feat(file-system): added queue and filesystem (#1159) * feat(queue): added * feat(crawling): added queue * fix(crawler): fixed github * feat(docker): simplified docker compose * feat(celery): added worker * feat(files): now uploaded * feat(files): missing routes * feat(delete): added * feat(storage): added policy and migrations * feat(sqs): implemented * feat(redis): added queue name variable * fix(task): updated * style(env): emoved unused env * ci(tests): removed broken tests 2023-09-14 12:56:59 +03:00			`return response`
			`except Exception as e:`
feat(notion): added custom integration (#2268) This pull request adds a custom integration feature and sync functionality to the application. It includes the following changes: - Added a new integration entity for custom integrations. - Implemented the ability to load and poll the custom integration. - Added a task to sync the custom integration with the user's brain. - Updated the celery beat schedule to include the new task. Please review and merge this pull request. 2024-02-28 08:30:25 +03:00			`if "The resource already exists" in str(e) and upsert == "true":`
			`response = supabase_client.storage.from_("quivr").update(`
			`file_identifier,`
			`file,`
			`file_options={`
			`"content-type": mime_type,`
			`"upsert": upsert,`
			`"cache-control": "3600",`
			`},`
			`)`
			`else:`
			`raise e`
feat(upload): changed to task (#1178) 2023-09-15 16:52:06 +03:00

			`class DocumentSerializable(Document):`
			`"""Class for storing a piece of text and associated metadata."""`

			`page_content: str`
			`metadata: dict = Field(default_factory=dict)`

			`@property`
			`def lc_serializable(self) -> bool:`
			`return True`

			`def __repr__(self):`
			`return f"Document(page_content='{self.page_content[:50]}...', metadata={self.metadata})"`

			`def __str__(self):`
			`return self.__repr__()`

			`def to_json(self) -> str:`
			`"""Convert the Document object to a JSON string."""`
			`return json.dumps(`
			`{`
			`"page_content": self.page_content,`
			`"metadata": self.metadata,`
			`}`
			`)`

			`@classmethod`
			`def from_json(cls, json_str: str):`
			`"""Create a Document object from a JSON string."""`
			`data = json.loads(json_str)`
			`return cls(page_content=data["page_content"], metadata=data["metadata"])`