quivr/backend/models/files.py

import os
import tempfile
from typing import Any, Optional
from uuid import UUID

from fastapi import UploadFile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from logger import get_logger
from models.brains import Brain
from models.databases.supabase.supabase import SupabaseDB
from models.settings import get_supabase_db
from packages.files.file import compute_sha1_from_file
from pydantic import BaseModel

logger = get_logger(__name__)


class File(BaseModel):
    """
    An uploaded file together with the state derived from it during processing.

    The model wraps the incoming `UploadFile`, caches its raw bytes, computes
    its sha1, splits it into documents and looks up / links the vectors that
    are stored for that sha1.
    """

    id: Optional[UUID] = None
    # The upload this model wraps; name, size and extension are copied from it
    file: Optional[UploadFile]
    file_name: Optional[str] = ""
    file_size: Optional[int] = None
    # Set by compute_file_sha1()
    file_sha1: Optional[str] = ""
    # Set by set_file_vectors_ids(); rows are indexed with ["id"] when linking
    vectors_ids: Optional[list] = []
    # Lower-cased extension of the uploaded filename, including the leading dot
    file_extension: Optional[str] = ""
    # Raw bytes of the upload, set by compute_file_sha1()
    content: Optional[Any] = None
    # Passed to RecursiveCharacterTextSplitter.from_tiktoken_encoder
    chunk_size: int = 500
    chunk_overlap: int = 0
    # Set by compute_documents()
    documents: Optional[Any] = None

    @property
    def supabase_db(self) -> SupabaseDB:
        """Return the database handle provided by `get_supabase_db()`."""
        return get_supabase_db()

    def __init__(self, **kwargs):
        """
        Build the model and, when an upload is given, copy its name, size and
        lower-cased extension into the corresponding fields.
        """
        super().__init__(**kwargs)

        if self.file:
            self.file_name = self.file.filename
            self.file_size = self.file.size  # pyright: ignore reportPrivateUsage=none
            # An upload may carry no filename; fall back to "" so that
            # splitext does not raise TypeError on None.
            self.file_extension = os.path.splitext(self.file.filename or "")[
                -1
            ].lower()

    async def compute_file_sha1(self):
        """
        Compute the sha1 of the file using a temporary file

        Reads the whole upload from its beginning, keeps the bytes in
        `content` and stores the digest in `file_sha1`. The temporary file is
        removed even when reading or hashing fails.
        """
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(
                delete=False,
                # Only the last path component: the filename comes from the
                # client and a path separator in the suffix would make the
                # temporary file creation fail.
                suffix=os.path.basename(
                    self.file.filename or ""  # pyright: ignore reportPrivateUsage=none
                ),
            ) as tmp_file:
                tmp_path = tmp_file.name
                await self.file.seek(0)  # pyright: ignore reportPrivateUsage=none
                self.content = (
                    await self.file.read()  # pyright: ignore reportPrivateUsage=none
                )
                tmp_file.write(self.content)
                # Flush so the helper sees the full content when it opens the path
                tmp_file.flush()
                self.file_sha1 = compute_sha1_from_file(tmp_path)
        finally:
            # delete=False means cleanup is our job, on success and on failure
            if tmp_path is not None and os.path.exists(tmp_path):
                os.remove(tmp_path)

    def compute_documents(self, loader_class):
        """
        Compute the documents from the file

        Writes `content` (populated by `compute_file_sha1`) to a temporary
        file, loads it with `loader_class`, splits the result according to
        `chunk_size` / `chunk_overlap` and stores the chunks in `documents`.
        The temporary file is removed even when loading fails.

        Args:
            loader_class (class): The class of the loader to use to load the file
        """
        logger.info(f"Computing documents from file {self.file_name}")

        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(
                delete=False,
                # See compute_file_sha1: keep only the last path component
                suffix=os.path.basename(
                    self.file.filename or ""  # pyright: ignore reportPrivateUsage=none
                ),
            ) as tmp_file:
                tmp_path = tmp_file.name
                tmp_file.write(self.content)  # pyright: ignore reportPrivateUsage=none
                # Flush so the loader sees the full content when it opens the path
                tmp_file.flush()
                loader = loader_class(tmp_path)
                documents = loader.load()
        finally:
            # delete=False means cleanup is our job, on success and on failure
            if tmp_path is not None and os.path.exists(tmp_path):
                os.remove(tmp_path)

        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )

        self.documents = text_splitter.split_documents(documents)

    def set_file_vectors_ids(self):
        """
        Set the vectors_ids property with the ids of the vectors
        that are associated with the file in the vectors table
        """
        self.vectors_ids = self.supabase_db.get_vectors_by_file_sha1(
            self.file_sha1
        ).data

    def file_already_exists(self):
        """
        Check if file already exists in vectors table

        Refreshes `vectors_ids` first.

        Returns:
            bool: True when at least one vector is stored for `file_sha1`
        """
        self.set_file_vectors_ids()

        # Truthiness covers both an empty result and None, which
        # link_file_to_brain already treats as "nothing to link".
        return bool(self.vectors_ids)

    def file_already_exists_in_brain(self, brain_id):
        """
        Check if file already exists in a brain

        Args:
            brain_id (str): Brain id

        Returns:
            bool: True when the lookup for this brain and `file_sha1` returns rows
        """
        response = self.supabase_db.get_brain_vectors_by_brain_id_and_file_sha1(
            brain_id, self.file_sha1  # type: ignore
        )

        return bool(response.data)

    def file_is_empty(self):
        """
        Check if file is empty, based on the size reported by the upload

        Returns:
            bool: True when the reported size is smaller than one byte
        """
        return self.file.size < 1  # pyright: ignore reportPrivateUsage=none

    def link_file_to_brain(self, brain: Brain):
        """
        Link every vector stored for this file to the given brain

        Refreshes `vectors_ids` and calls `brain.create_brain_vector` once
        per vector. Does nothing when no vectors are found.

        Args:
            brain (Brain): The brain the vectors are attached to
        """
        self.set_file_vectors_ids()

        if not self.vectors_ids:
            return

        for vector in self.vectors_ids:  # pyright: ignore reportPrivateUsage=none
            brain.create_brain_vector(vector["id"], self.file_sha1)
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`import os`
			`import tempfile`
			`from typing import Any, Optional`
			`from uuid import UUID`

			`from fastapi import UploadFile`
			`from langchain.text_splitter import RecursiveCharacterTextSplitter`
			`from logger import get_logger`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00			`from models.brains import Brain`
refactor: delete common_dependencies function (#843) * use function for get_documents_vector_store * use function for get_embeddings * use function for get_supabase_client * use function for get_supabase_db * delete lasts common_dependencies 2023-08-03 21:24:42 +03:00			`from models.databases.supabase.supabase import SupabaseDB`
			`from models.settings import get_supabase_db`
refactor: create "files" package (#1626) # Description Please include a summary of the changes and the related issue. Please also include relevant motivation and context. ## Checklist before requesting a review Please delete options that are not relevant. - [ ] My code follows the style guidelines of this project - [ ] I have performed a self-review of my code - [ ] I have commented hard-to-understand areas - [ ] I have ideally added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged ## Screenshots (if appropriate): 2023-11-14 11:52:44 +03:00			`from packages.files.file import compute_sha1_from_file`
feat(file-system): added queue and filesystem (#1159) * feat(queue): added * feat(crawling): added queue * fix(crawler): fixed github * feat(docker): simplified docker compose * feat(celery): added worker * feat(files): now uploaded * feat(files): missing routes * feat(delete): added * feat(storage): added policy and migrations * feat(sqs): implemented * feat(redis): added queue name variable * fix(task): updated * style(env): emoved unused env * ci(tests): removed broken tests 2023-09-14 12:56:59 +03:00			`from pydantic import BaseModel`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`logger = get_logger(__name__)`

Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`class File(BaseModel):`
			`id: Optional[UUID] = None`
			`file: Optional[UploadFile]`
			`file_name: Optional[str] = ""`
docs(backend): add docstrings (#590) 2023-07-10 20:28:38 +03:00			`file_size: Optional[int] = None`
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`file_sha1: Optional[str] = ""`
docs(backend): add docstrings (#590) 2023-07-10 20:28:38 +03:00			`vectors_ids: Optional[list] = []`
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`file_extension: Optional[str] = ""`
Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`content: Optional[Any] = None`
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`chunk_size: int = 500`
Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`chunk_overlap: int = 0`
			`documents: Optional[Any] = None`
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00
refactor: delete common_dependencies function (#843) * use function for get_documents_vector_store * use function for get_embeddings * use function for get_supabase_client * use function for get_supabase_db * delete lasts common_dependencies 2023-08-03 21:24:42 +03:00			`@property`
			`def supabase_db(self) -> SupabaseDB:`
			`return get_supabase_db()`
feat: Introduce repository pattern to prepare adding other database providers (#646) * add sqlalchemy models * add neon settings * add insert brain * abstract supabase from Brain class * abstract supabase from Brain class * abstract supabase from /models * update Database to Repository * update neon_tables to pg_tables * update chat, api-key and message * update vector class * update settings * update env vars for test * Update backend-tests.yml * fix test * fix fetch_user_requests_count() * fix fetch_user_requests_count() * fix increment_user_request_count * fix increment_user_request_count * fix asset upload_response message * fix pyright * fix brain_subscription * fix brain_subscription * fix brain_subscription * fix get user request stat * update create_brain_user * add delete brain vector and user * add delete brain vector and user * correctly call function --------- Co-authored-by: Noé Pion <noe.pion@onfido.com> Co-authored-by: raoufchebri <raouf@chebri.com> Co-authored-by: Stan Girard <girard.stanislas@gmail.com> 2023-08-02 00:03:47 +03:00
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`def __init__(self, **kwargs):`
			`super().__init__(**kwargs)`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`if self.file:`
			`self.file_name = self.file.filename`
feat(file-system): added queue and filesystem (#1159) * feat(queue): added * feat(crawling): added queue * fix(crawler): fixed github * feat(docker): simplified docker compose * feat(celery): added worker * feat(files): now uploaded * feat(files): missing routes * feat(delete): added * feat(storage): added policy and migrations * feat(sqs): implemented * feat(redis): added queue name variable * fix(task): updated * style(env): emoved unused env * ci(tests): removed broken tests 2023-09-14 12:56:59 +03:00			`self.file_size = self.file.size # pyright: ignore reportPrivateUsage=none`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00			`self.file_extension = os.path.splitext(`
			`self.file.filename # pyright: ignore reportPrivateUsage=none`
			`)[-1].lower()`

Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`async def compute_file_sha1(self):`
docs(backend): add docstrings (#590) 2023-07-10 20:28:38 +03:00			`"""`
			`Compute the sha1 of the file using a temporary file`
			`"""`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00			`with tempfile.NamedTemporaryFile(`
			`delete=False,`
			`suffix=self.file.filename, # pyright: ignore reportPrivateUsage=none`
			`) as tmp_file:`
			`await self.file.seek(0) # pyright: ignore reportPrivateUsage=none`
			`self.content = (`
			`await self.file.read() # pyright: ignore reportPrivateUsage=none`
			`)`
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`tmp_file.write(self.content)`
			`tmp_file.flush()`
			`self.file_sha1 = compute_sha1_from_file(tmp_file.name)`

			`os.remove(tmp_file.name)`

			`def compute_documents(self, loader_class):`
docs(backend): add docstrings (#590) 2023-07-10 20:28:38 +03:00			`"""`
			`Compute the documents from the file`

			`Args:`
			`loader_class (class): The class of the loader to use to load the file`
			`"""`
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`logger.info(f"Computing documents from file {self.file_name}")`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`documents = []`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00			`with tempfile.NamedTemporaryFile(`
			`delete=False,`
			`suffix=self.file.filename, # pyright: ignore reportPrivateUsage=none`
			`) as tmp_file:`
			`tmp_file.write(self.content) # pyright: ignore reportPrivateUsage=none`
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`tmp_file.flush()`
			`loader = loader_class(tmp_file.name)`
			`documents = loader.load()`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`os.remove(tmp_file.name)`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(`
			`chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap`
			`)`

			`self.documents = text_splitter.split_documents(documents)`

			`def set_file_vectors_ids(self):`
Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`"""`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00			`Set the vectors_ids property with the ids of the vectors`
Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`that are associated with the file in the vectors table`
			`"""`
refactor: delete common_dependencies function (#843) * use function for get_documents_vector_store * use function for get_embeddings * use function for get_supabase_client * use function for get_supabase_db * delete lasts common_dependencies 2023-08-03 21:24:42 +03:00			`self.vectors_ids = self.supabase_db.get_vectors_by_file_sha1(`
			`self.file_sha1`
			`).data`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`def file_already_exists(self):`
			`"""`
			`Check if file already exists in vectors table`
			`"""`
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`self.set_file_vectors_ids()`

Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`# if the file does not exist in vectors then no need to go check in brains_vectors`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00			`if len(self.vectors_ids) == 0: # pyright: ignore reportPrivateUsage=none`
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`return False`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`return True`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`def file_already_exists_in_brain(self, brain_id):`
docs(backend): add docstrings (#590) 2023-07-10 20:28:38 +03:00			`"""`
			`Check if file already exists in a brain`

			`Args:`
			`brain_id (str): Brain id`
			`"""`
refactor: delete common_dependencies function (#843) * use function for get_documents_vector_store * use function for get_embeddings * use function for get_supabase_client * use function for get_supabase_db * delete lasts common_dependencies 2023-08-03 21:24:42 +03:00			`response = self.supabase_db.get_brain_vectors_by_brain_id_and_file_sha1(`
test: skip failing linter tests (#1036) 2023-08-25 13:03:13 +03:00			`brain_id, self.file_sha1 # type: ignore`
refactor: delete common_dependencies function (#843) * use function for get_documents_vector_store * use function for get_embeddings * use function for get_supabase_client * use function for get_supabase_db * delete lasts common_dependencies 2023-08-03 21:24:42 +03:00			`)`
feat: Introduce repository pattern to prepare adding other database providers (#646) * add sqlalchemy models * add neon settings * add insert brain * abstract supabase from Brain class * abstract supabase from Brain class * abstract supabase from /models * update Database to Repository * update neon_tables to pg_tables * update chat, api-key and message * update vector class * update settings * update env vars for test * Update backend-tests.yml * fix test * fix fetch_user_requests_count() * fix fetch_user_requests_count() * fix increment_user_request_count * fix increment_user_request_count * fix asset upload_response message * fix pyright * fix brain_subscription * fix brain_subscription * fix brain_subscription * fix get user request stat * update create_brain_user * add delete brain vector and user * add delete brain vector and user * correctly call function --------- Co-authored-by: Noé Pion <noe.pion@onfido.com> Co-authored-by: raoufchebri <raouf@chebri.com> Co-authored-by: Stan Girard <girard.stanislas@gmail.com> 2023-08-02 00:03:47 +03:00
Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`if len(response.data) == 0:`
			`return False`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`return True`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Feat/multiple brains files (#361) 2023-06-28 20:39:27 +03:00			`def file_is_empty(self):`
docs(backend): add docstrings (#590) 2023-07-10 20:28:38 +03:00			`"""`
			`Check if file is empty by checking if the file pointer is at the beginning of the file`
			`"""`
feat(file-system): added queue and filesystem (#1159) * feat(queue): added * feat(crawling): added queue * fix(crawler): fixed github * feat(docker): simplified docker compose * feat(celery): added worker * feat(files): now uploaded * feat(files): missing routes * feat(delete): added * feat(storage): added policy and migrations * feat(sqs): implemented * feat(redis): added queue name variable * fix(task): updated * style(env): emoved unused env * ci(tests): removed broken tests 2023-09-14 12:56:59 +03:00			`return self.file.size < 1 # pyright: ignore reportPrivateUsage=none`
Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00
Fix/file upload explore (#412) 2023-06-29 19:26:03 +03:00			`def link_file_to_brain(self, brain: Brain):`
			`self.set_file_vectors_ids()`

docs(backend): add docstrings (#590) 2023-07-10 20:28:38 +03:00			`if self.vectors_ids is None:`
			`return`

Feat/static analysis (#582) * feat: add static analysis * chore: update Makefile add static analysis script * chore: add vscode extensions recommandations 2023-07-10 15:27:49 +03:00			`for vector_id in self.vectors_ids: # pyright: ignore reportPrivateUsage=none`
			`brain.create_brain_vector(vector_id["id"], self.file_sha1)`