quivr/backend/worker/quivr_worker/files.py
Stan Girard 380cf82706
feat: quivr core 0.1 (#2970)
# Description


# Testing backend 

## Docker setup
1. Copy `.env.example` to `.env`. Some env variables were added :
EMBEDDING_DIM
2. Apply supabase migrations : 
```sh
supabase stop
supabase db reset
supabase start
```
3. Start backend containers
```
make dev
```
## Local setup 
You can also run backend without docker.
1. Install [`rye`](https://rye.astral.sh/guide/installation/). Choose
the managed python version and set the version to 3.11
2. Run the following: 
```
cd quivr/backend
rye sync
```
3. Source `.venv` virtual env : `source .venv/bin/activate`
4. Run the backend, make sure you are running redis and supabase
API: 
```
LOG_LEVEL=debug uvicorn quivr_api.main:app --log-level debug --reload --host 0.0.0.0 --port 5050 --workers 1
```
Worker: 
```
LOG_LEVEL=debug celery -A quivr_worker.celery_worker worker -l info -E --concurrency 1
```
Notifier: 
```
LOG_LEVEL=debug python worker/quivr_worker/celery_monitor.py
```

---------

Co-authored-by: chloedia <chloedaems0@gmail.com>
Co-authored-by: aminediro <aminedirhoussi1@gmail.com>
Co-authored-by: Antoine Dewez <44063631+Zewed@users.noreply.github.com>
Co-authored-by: Chloé Daems <73901882+chloedia@users.noreply.github.com>
Co-authored-by: Zewed <dewez.antoine2@gmail.com>
2024-09-02 10:20:53 +02:00

107 lines
3.0 KiB
Python

import hashlib
import time
from contextlib import contextmanager
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Any
from uuid import UUID
from quivr_api.logger import get_logger
from quivr_core.files.file import FileExtension, QuivrFile
from quivr_worker.utils import get_tmp_name
logger = get_logger("celery_worker")
def compute_sha1(content: bytes) -> str:
    """Return the hex-encoded SHA-1 digest of *content*."""
    return hashlib.sha1(content).hexdigest()
@contextmanager
def build_file(
    file_data: bytes,
    knowledge_id: UUID,
    file_name: str,
    original_file_name: str | None = None,
):
    """Spill *file_data* to a named temporary file and yield a ``File`` descriptor.

    The temporary file lives only for the duration of the ``with`` block; it
    is closed (and therefore deleted by ``NamedTemporaryFile``) on exit, even
    if the body raises.

    Args:
        file_data: Raw file contents held entirely in memory.
        knowledge_id: Identifier used as the ``File.id``.
        file_name: Name used to derive the temp-file suffix and base name.
        original_file_name: Optional display name; falls back to the base
            name derived from ``file_name`` when falsy.

    Yields:
        File: Descriptor pointing at the temporary file on disk.
    """
    # TODO(@aminediro) : Maybe use fsspec file to be agnostic to where files are stored :?
    # We are reading the whole file to memory, which doesn't scale
    tmp_name, base_file_name, file_extension = get_tmp_name(file_name)
    # BUGFIX: the previous try/finally called `tmp_file.close()` even when
    # get_tmp_name() or NamedTemporaryFile() raised before `tmp_file` was
    # bound, turning the original error into an UnboundLocalError. The
    # `with` statement scopes cleanup to a successfully created file only.
    with NamedTemporaryFile(
        suffix="_" + tmp_name,  # pyright: ignore reportPrivateUsage=none
    ) as tmp_file:
        tmp_file.write(file_data)
        tmp_file.flush()

        file_instance = File(
            knowledge_id=knowledge_id,
            file_name=base_file_name,
            original_file_name=(
                original_file_name if original_file_name else base_file_name
            ),
            tmp_file_path=Path(tmp_file.name),
            file_size=len(file_data),
            file_extension=file_extension,
            file_sha1=compute_sha1(file_data),
        )
        yield file_instance
class File:
__slots__ = [
"id",
"file_name",
"tmp_file_path",
"file_size",
"file_extension",
"file_sha1",
"original_file_name",
]
def __init__(
self,
knowledge_id: UUID,
file_name: str,
tmp_file_path: Path,
file_size: int,
file_extension: str,
file_sha1: str,
original_file_name: str,
):
self.id = knowledge_id
self.file_name = file_name
self.tmp_file_path = tmp_file_path
self.file_size = file_size
self.file_sha1 = file_sha1
self.file_extension = FileExtension(file_extension)
self.original_file_name = original_file_name
def is_empty(self):
return self.file_size < 1 # pyright: ignore reportPrivateUsage=none
def to_qfile(self, brain_id: UUID, metadata: dict[str, Any] = {}) -> QuivrFile:
return QuivrFile(
id=self.id,
original_filename=self.file_name,
path=self.tmp_file_path,
brain_id=brain_id,
file_sha1=self.file_sha1,
file_extension=self.file_extension,
file_size=self.file_size,
metadata={
"date": time.strftime("%Y%m%d"),
"file_name": self.file_name,
"original_file_name": self.original_file_name,
"knowledge_id": self.id,
**metadata,
},
)