quivr/backend/worker/quivr_worker/files.py
Stan Girard 282fa0e3f8
feat(assistants): mock api (#3195)
# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
2024-09-18 03:30:48 -07:00

107 lines
3.0 KiB
Python

import hashlib
import time
from contextlib import contextmanager
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Any
from uuid import UUID
from quivr_api.logger import get_logger
from quivr_core.files.file import FileExtension, QuivrFile
from quivr_worker.utils.utils import get_tmp_name
# Module-level logger obtained from quivr_api's get_logger under the name "celery_worker".
logger = get_logger("celery_worker")
def compute_sha1(content: bytes) -> str:
m = hashlib.sha1()
m.update(content)
return m.hexdigest()
@contextmanager
def build_file(
    file_data: bytes,
    knowledge_id: UUID,
    file_name: str,
    original_file_name: str | None = None,
):
    """Write ``file_data`` to a temporary file and yield a ``File`` describing it.

    Context manager: the temporary file exists for the duration of the ``with``
    body and is closed when the body exits, whether normally or through an
    exception. ``NamedTemporaryFile`` is created with its default settings, so
    closing it also removes it from disk.

    Args:
        file_data: Raw content of the file; written to disk in full and used
            to compute the size and SHA-1 of the yielded ``File``.
        knowledge_id: Identifier stored as the ``File``'s ``id``.
        file_name: Name passed to ``get_tmp_name`` to derive the temporary-file
            suffix, the base file name and the extension.
        original_file_name: Optional original name; when falsy (``None`` or
            empty) the base file name from ``get_tmp_name`` is used instead.

    Yields:
        A ``File`` whose ``tmp_file_path`` points at the temporary file.
    """
    # TODO(@aminediro) : Maybe use fsspec file to be agnostic to where files are stored :?
    # We are reading the whole file to memory, which doesn't scale
    tmp_name, base_file_name, file_extension = get_tmp_name(file_name)

    # Let the ``with`` block own the temporary file. Cleanup then only runs once
    # the file really exists, so a failure in ``get_tmp_name`` or in
    # ``NamedTemporaryFile`` itself surfaces as-is and is not masked by an
    # ``UnboundLocalError`` raised from a ``finally`` clause.
    with NamedTemporaryFile(
        suffix="_" + tmp_name,  # pyright: ignore reportPrivateUsage=none
    ) as tmp_file:
        tmp_file.write(file_data)
        # Flush so that readers opening the path see the full content.
        tmp_file.flush()

        yield File(
            knowledge_id=knowledge_id,
            file_name=base_file_name,
            original_file_name=original_file_name or base_file_name,
            tmp_file_path=Path(tmp_file.name),
            file_size=len(file_data),
            file_extension=file_extension,
            file_sha1=compute_sha1(file_data),
        )
class File:
    """Description of a file that has been written to a temporary path.

    Instances only carry metadata (identifier, names, path, size, extension,
    SHA-1); the content itself lives at ``tmp_file_path``.
    """

    __slots__ = [
        "id",
        "file_name",
        "tmp_file_path",
        "file_size",
        "file_extension",
        "file_sha1",
        "original_file_name",
    ]

    def __init__(
        self,
        knowledge_id: UUID,
        file_name: str,
        tmp_file_path: Path,
        file_size: int,
        file_extension: str,
        file_sha1: str,
        original_file_name: str,
    ):
        """Store the given metadata on the instance.

        Args:
            knowledge_id: Identifier stored as ``self.id``.
            file_name: Name stored as ``self.file_name``.
            tmp_file_path: Path of the file on disk.
            file_size: Size of the content in bytes.
            file_extension: Extension string, converted with ``FileExtension``.
            file_sha1: SHA-1 hex digest of the content.
            original_file_name: Name stored as ``self.original_file_name``.
        """
        self.id = knowledge_id
        self.file_name = file_name
        self.tmp_file_path = tmp_file_path
        self.file_size = file_size
        self.file_sha1 = file_sha1
        # NOTE(review): presumably raises for an extension FileExtension does
        # not know about - confirm against quivr_core.
        self.file_extension = FileExtension(file_extension)
        self.original_file_name = original_file_name

    def is_empty(self) -> bool:
        """Return ``True`` when the recorded file size is below one byte."""
        return self.file_size < 1  # pyright: ignore reportPrivateUsage=none

    def to_qfile(
        self, brain_id: UUID, metadata: dict[str, Any] | None = None
    ) -> QuivrFile:
        """Build a ``QuivrFile`` from this file for the given brain.

        Args:
            brain_id: Identifier passed through as the ``QuivrFile``'s ``brain_id``.
            metadata: Optional extra metadata. Its entries are merged last, so
                they override the built-in ``date``, ``file_name``,
                ``original_file_name`` and ``knowledge_id`` keys on collision.
                ``None`` (the default) means no extra metadata.

        Returns:
            A ``QuivrFile`` pointing at ``tmp_file_path``.
        """
        # ``None`` default instead of a shared mutable ``{}`` default.
        extra_metadata = metadata if metadata is not None else {}
        return QuivrFile(
            id=self.id,
            # NOTE(review): passes ``file_name`` rather than
            # ``original_file_name`` here; the latter is only exposed through
            # the metadata dict. Confirm this is intended.
            original_filename=self.file_name,
            path=self.tmp_file_path,
            brain_id=brain_id,
            file_sha1=self.file_sha1,
            file_extension=self.file_extension,
            file_size=self.file_size,
            metadata={
                # Current local date formatted as YYYYMMDD.
                "date": time.strftime("%Y%m%d"),
                "file_name": self.file_name,
                "original_file_name": self.original_file_name,
                "knowledge_id": self.id,
                **extra_metadata,
            },
        )