mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-15 01:21:48 +03:00
282fa0e3f8
# Description

Please include a summary of the changes and the related issue. Please also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
107 lines
3.0 KiB
Python
107 lines
3.0 KiB
Python
import hashlib
|
|
import time
|
|
from contextlib import contextmanager
|
|
from pathlib import Path
|
|
from tempfile import NamedTemporaryFile
|
|
from typing import Any
|
|
from uuid import UUID
|
|
|
|
from quivr_api.logger import get_logger
|
|
from quivr_core.files.file import FileExtension, QuivrFile
|
|
|
|
from quivr_worker.utils.utils import get_tmp_name
|
|
|
|
logger = get_logger("celery_worker")
|
|
|
|
|
|
def compute_sha1(content: bytes) -> str:
    """Return the SHA-1 digest of ``content`` as a lowercase hex string.

    The digest serves as a file fingerprint (it is what ``build_file``
    stores as ``file_sha1``), not as a security primitive. The hash object
    is therefore created with ``usedforsecurity=False`` so the function
    keeps working on Python builds where SHA-1 is restricted for security
    use (e.g. FIPS mode). The resulting digest is unaffected by that flag.

    Args:
        content: Raw bytes to hash.

    Returns:
        The 40-character hexadecimal SHA-1 digest of ``content``.
    """
    return hashlib.sha1(content, usedforsecurity=False).hexdigest()
|
|
|
|
|
|
@contextmanager
def build_file(
    file_data: bytes,
    knowledge_id: UUID,
    file_name: str,
    original_file_name: str | None = None,
):
    """Yield a ``File`` backed by a temporary on-disk copy of ``file_data``.

    The bytes are written to a ``NamedTemporaryFile`` whose suffix is derived
    from ``get_tmp_name(file_name)``. The temporary file is closed when the
    ``with`` block using this context manager exits, whether normally or via
    an exception; with ``NamedTemporaryFile``'s default settings that also
    removes it from disk, so the yielded ``tmp_file_path`` must not be used
    afterwards.

    Args:
        file_data: Full content of the file, held in memory.
        knowledge_id: Identifier stored on the yielded ``File`` as ``id``.
        file_name: Name handed to ``get_tmp_name`` to derive the temporary
            suffix, the base file name and the file extension.
        original_file_name: Name to record as the original one; when falsy
            (``None`` or empty) the base file name is used instead.

    Yields:
        A ``File`` describing the temporary copy (path, size, extension,
        SHA-1 of ``file_data``).
    """
    # TODO(@aminediro) : Maybe use fsspec file to be agnostic to where files are stored :?
    # We are reading the whole file to memory, which doesn't scale
    tmp_name, base_file_name, file_extension = get_tmp_name(file_name)

    # `with` guarantees the temp file is closed on every exit path, and --
    # unlike a `finally: tmp_file.close()` -- cannot hit an unbound
    # `tmp_file` (masking the real error) when name resolution or file
    # creation itself fails.
    with NamedTemporaryFile(
        suffix="_" + tmp_name,  # pyright: ignore reportPrivateUsage=none
    ) as tmp_file:
        tmp_file.write(file_data)
        # Flush so that readers opening the file by path see all the bytes.
        tmp_file.flush()

        yield File(
            knowledge_id=knowledge_id,
            file_name=base_file_name,
            original_file_name=original_file_name or base_file_name,
            tmp_file_path=Path(tmp_file.name),
            file_size=len(file_data),
            file_extension=file_extension,
            file_sha1=compute_sha1(file_data),
        )
|
|
|
|
|
|
class File:
|
|
__slots__ = [
|
|
"id",
|
|
"file_name",
|
|
"tmp_file_path",
|
|
"file_size",
|
|
"file_extension",
|
|
"file_sha1",
|
|
"original_file_name",
|
|
]
|
|
|
|
def __init__(
|
|
self,
|
|
knowledge_id: UUID,
|
|
file_name: str,
|
|
tmp_file_path: Path,
|
|
file_size: int,
|
|
file_extension: str,
|
|
file_sha1: str,
|
|
original_file_name: str,
|
|
):
|
|
self.id = knowledge_id
|
|
self.file_name = file_name
|
|
self.tmp_file_path = tmp_file_path
|
|
self.file_size = file_size
|
|
self.file_sha1 = file_sha1
|
|
self.file_extension = FileExtension(file_extension)
|
|
self.original_file_name = original_file_name
|
|
|
|
def is_empty(self):
|
|
return self.file_size < 1 # pyright: ignore reportPrivateUsage=none
|
|
|
|
def to_qfile(self, brain_id: UUID, metadata: dict[str, Any] = {}) -> QuivrFile:
|
|
return QuivrFile(
|
|
id=self.id,
|
|
original_filename=self.file_name,
|
|
path=self.tmp_file_path,
|
|
brain_id=brain_id,
|
|
file_sha1=self.file_sha1,
|
|
file_extension=self.file_extension,
|
|
file_size=self.file_size,
|
|
metadata={
|
|
"date": time.strftime("%Y%m%d"),
|
|
"file_name": self.file_name,
|
|
"original_file_name": self.original_file_name,
|
|
"knowledge_id": self.id,
|
|
**metadata,
|
|
},
|
|
)
|