Back/refacto files (#240)

* feat(docker): added docker for prod

* feat(refacto): moved to modules
Stan Girard 2023-06-03 23:12:42 +02:00 committed by GitHub
parent 59c02228b6
commit a3ca7ecb37
19 changed files with 230 additions and 123 deletions

Makefile (new file, +7)

@@ -0,0 +1,7 @@
dev:
	docker compose -f docker-compose.dev.yml up --build

prod:
	docker compose -f docker-compose.yml up --build

README.md

@@ -110,11 +110,17 @@ cp .frontend_env.example frontend/.env
 - **Step 5**: Launch the app
 ```bash
-docker compose build && docker compose up
+docker compose -f docker-compose.yml up --build
 ```
 - **Step 6**: Navigate to `localhost:3000` in your browser
+- ** Step 7**: Want to contribute to the project?
+```
+docker compose -f docker-compose.dev.yml up --build
+```
 ## Contributors ✨

backend/Dockerfile

@@ -11,4 +11,4 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt --timeout 100
 COPY . /code/
-CMD ["uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]
+CMD ["uvicorn", "main:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]

backend/auth/auth_bearer.py

@@ -1,10 +1,11 @@
 import os
 from typing import Optional
-from auth_handler import decode_access_token
 from fastapi import HTTPException, Request
 from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
+from .auth_handler import decode_access_token
 class JWTBearer(HTTPBearer):
     def __init__(self, auto_error: bool = True):

backend/llm/qa.py

@@ -10,8 +10,8 @@ from langchain.llms import VertexAI
 from langchain.memory import ConversationBufferMemory
 from langchain.vectorstores import SupabaseVectorStore
 from llm import LANGUAGE_PROMPT
+from models.chats import ChatMessage
 from supabase import Client, create_client
-from utils import ChatMessage
 class CustomSupabaseVectorStore(SupabaseVectorStore):


@@ -4,51 +4,28 @@ import time
 from tempfile import SpooledTemporaryFile
 import pypandoc
-from auth_bearer import JWTBearer
+from auth.auth_bearer import JWTBearer
 from crawl.crawler import CrawlWebsite
 from fastapi import Depends, FastAPI, UploadFile
-from fastapi.middleware.cors import CORSMiddleware
 from llm.qa import get_qa_llm
 from llm.summarization import llm_evaluate_summaries
 from logger import get_logger
-from parsers.audio import process_audio
-from parsers.common import file_already_exists
-from parsers.csv import process_csv
-from parsers.docx import process_docx
-from parsers.epub import process_epub
-from parsers.html import process_html
-from parsers.markdown import process_markdown
-from parsers.notebook import process_ipnyb
-from parsers.odt import process_odt
-from parsers.pdf import process_pdf
-from parsers.powerpoint import process_powerpoint
-from parsers.txt import process_txt
+from middlewares.cors import add_cors_middleware
+from models.chats import ChatMessage
+from models.users import User
 from pydantic import BaseModel
 from supabase import Client
-from utils import (ChatMessage, CommonsDep, convert_bytes, create_user,
-                   get_file_size, similarity_search, update_user_request_count)
+from utils.file import convert_bytes, get_file_size
+from utils.processors import filter_file
+from utils.vectors import (CommonsDep, create_user, similarity_search,
+                           update_user_request_count)
 logger = get_logger(__name__)
 app = FastAPI()
-origins = [
-    "http://localhost",
-    "http://localhost:3000",
-    "https://quivr.app",
-    "https://www.quivr.app",
-    "http://quivr.app",
-    "http://www.quivr.app",
-    "*"
-]
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=origins,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+add_cors_middleware(app)
@@ -57,45 +34,6 @@ async def startup_event():
     pypandoc.download_pandoc()
-file_processors = {
-    ".txt": process_txt,
-    ".csv": process_csv,
-    ".md": process_markdown,
-    ".markdown": process_markdown,
-    ".m4a": process_audio,
-    ".mp3": process_audio,
-    ".webm": process_audio,
-    ".mp4": process_audio,
-    ".mpga": process_audio,
-    ".wav": process_audio,
-    ".mpeg": process_audio,
-    ".pdf": process_pdf,
-    ".html": process_html,
-    ".pptx": process_powerpoint,
-    ".docx": process_docx,
-    ".odt": process_odt,
-    ".epub": process_epub,
-    ".ipynb": process_ipnyb,
-}
-class User (BaseModel):
-    email: str
-async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
-    if await file_already_exists(supabase_client, file, user):
-        return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
-    elif file.file._file.tell() < 1:
-        return {"message": f"{file.filename} is empty.", "type": "error"}
-    else:
-        file_extension = os.path.splitext(file.filename)[-1].lower()  # Convert file extension to lowercase
-        if file_extension in file_processors:
-            await file_processors[file_extension](file, enable_summarization, user)
-            return {"message": f"{file.filename} has been uploaded.", "type": "success"}
-        else:
-            return {"message": f"{file.filename} is not supported.", "type": "error"}
 @app.post("/upload", dependencies=[Depends(JWTBearer())])
@@ -221,7 +159,7 @@ async def delete_endpoint(commons: CommonsDep, file_name: str, credentials: dict
 async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dict = Depends(JWTBearer()) ):
     user = User(email=credentials.get('email', 'none'))
     response = commons['supabase'].table("vectors").select(
-        "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
+        "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url", "content").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
     documents = response.data
     # Returns all documents with the same file name
     return {"documents": documents}
@@ -229,4 +167,4 @@ async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dic
 @app.get("/")
 async def root():
-    return {"message": "Hello World"}
+    return {"status": "OK"}

backend/middlewares/cors.py (new file, +21)

@@ -0,0 +1,21 @@
from fastapi.middleware.cors import CORSMiddleware

origins = [
    "http://localhost",
    "http://localhost:3000",
    "https://quivr.app",
    "https://www.quivr.app",
    "http://quivr.app",
    "http://www.quivr.app",
    "*"
]


def add_cors_middleware(app):
    app.add_middleware(
        CORSMiddleware,
        allow_origins=origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
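For orientation, this helper replaces the inline CORSMiddleware block removed from the API entrypoint above; a minimal sketch of the wiring, mirroring the `add_cors_middleware(app)` call shown in that diff:

```python
from fastapi import FastAPI

from middlewares.cors import add_cors_middleware

app = FastAPI()
add_cors_middleware(app)  # registers CORSMiddleware with the origins list defined above


@app.get("/")
async def root():
    # Same health-style response the refactored root endpoint now returns.
    return {"status": "OK"}
```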

backend/models/chats.py (new file, +13)

@@ -0,0 +1,13 @@
from typing import List, Tuple

from pydantic import BaseModel


class ChatMessage(BaseModel):
    model: str = "gpt-3.5-turbo"
    question: str
    # A list of tuples where each tuple is (speaker, text)
    history: List[Tuple[str, str]]
    temperature: float = 0.0
    max_tokens: int = 256
    use_summarization: bool = False
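With the request schema extracted into its own module, a caller or test can build the chat payload directly; a small usage sketch (the question and history values below are illustrative, not taken from the repository):

```python
from models.chats import ChatMessage

message = ChatMessage(
    question="What does this file say about deployment?",  # illustrative value
    history=[("user", "Hi"), ("assistant", "Hello! Ask me about your documents.")],
)

# The unset fields fall back to the defaults declared on the model.
print(message.model, message.temperature, message.max_tokens)  # gpt-3.5-turbo 0.0 256
```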

backend/models/users.py (new file, +5)

@@ -0,0 +1,5 @@
from pydantic import BaseModel


class User (BaseModel):
    email: str

backend/parsers/audio.py

@@ -10,7 +10,8 @@ from langchain.document_loaders import TextLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from utils import compute_sha1_from_content, documents_vector_store
+from utils.file import compute_sha1_from_content
+from utils.vectors import documents_vector_store
 # # Create a function to transcribe audio using Whisper
 # def _transcribe_audio(api_key, audio_file, stats_db):

backend/parsers/common.py

@@ -8,8 +8,8 @@ from typing import Optional
 from fastapi import UploadFile
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from utils import (compute_sha1_from_content, compute_sha1_from_file,
-                   create_summary, create_vector, documents_vector_store)
+from utils.file import compute_sha1_from_content, compute_sha1_from_file
+from utils.vectors import create_summary, create_vector, documents_vector_store
 async def process_file(file: UploadFile, loader_class, file_suffix, enable_summarization, user):
@@ -52,6 +52,8 @@ async def process_file(file: UploadFile, loader_class, file_suffix, enable_summa
         doc_with_metadata = Document(
             page_content=doc.page_content, metadata=metadata)
         create_vector(user.email, doc_with_metadata)
+        # add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
         if enable_summarization and ids and len(ids) > 0:
             create_summary(ids[0], doc.page_content, metadata)
     return

backend/utils/file.py (new file, +37)

@@ -0,0 +1,37 @@
import hashlib

from fastapi import UploadFile


def convert_bytes(bytes, precision=2):
    """Converts bytes into a human-friendly format."""
    abbreviations = ['B', 'KB', 'MB']
    if bytes <= 0:
        return '0 B'
    size = bytes
    index = 0
    while size >= 1024 and index < len(abbreviations) - 1:
        size /= 1024
        index += 1
    return f'{size:.{precision}f} {abbreviations[index]}'


def get_file_size(file: UploadFile):
    # move the cursor to the end of the file
    file.file._file.seek(0, 2)
    file_size = file.file._file.tell()  # Getting the size of the file
    # move the cursor back to the beginning of the file
    file.file.seek(0)
    return file_size


def compute_sha1_from_file(file_path):
    with open(file_path, "rb") as file:
        bytes = file.read()
        readable_hash = compute_sha1_from_content(bytes)
        return readable_hash


def compute_sha1_from_content(content):
    readable_hash = hashlib.sha1(content).hexdigest()
    return readable_hash
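For reference, the two pure helpers can be exercised on their own; a usage sketch assuming the backend package is on the import path (the inputs are made up):

```python
from utils.file import compute_sha1_from_content, convert_bytes

print(convert_bytes(512))            # 512.00 B
print(convert_bytes(2048))           # 2.00 KB
print(convert_bytes(5 * 1024 ** 2))  # 5.00 MB

# SHA-1 of the raw bytes, returned as a 40-character hex digest.
print(compute_sha1_from_content(b"hello"))  # aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d
```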

backend/utils/processors.py (new file, +55)

@@ -0,0 +1,55 @@
import os

from fastapi import Depends, FastAPI, UploadFile
from models.users import User
from parsers.audio import process_audio
from parsers.common import file_already_exists
from parsers.csv import process_csv
from parsers.docx import process_docx
from parsers.epub import process_epub
from parsers.html import process_html
from parsers.markdown import process_markdown
from parsers.notebook import process_ipnyb
from parsers.odt import process_odt
from parsers.pdf import process_pdf
from parsers.powerpoint import process_powerpoint
from parsers.txt import process_txt
from supabase import Client

file_processors = {
    ".txt": process_txt,
    ".csv": process_csv,
    ".md": process_markdown,
    ".markdown": process_markdown,
    ".m4a": process_audio,
    ".mp3": process_audio,
    ".webm": process_audio,
    ".mp4": process_audio,
    ".mpga": process_audio,
    ".wav": process_audio,
    ".mpeg": process_audio,
    ".pdf": process_pdf,
    ".html": process_html,
    ".pptx": process_powerpoint,
    ".docx": process_docx,
    ".odt": process_odt,
    ".epub": process_epub,
    ".ipynb": process_ipnyb,
}


async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
    if await file_already_exists(supabase_client, file, user):
        return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
    elif file.file._file.tell() < 1:
        return {"message": f"{file.filename} is empty.", "type": "error"}
    else:
        file_extension = os.path.splitext(file.filename)[-1].lower()  # Convert file extension to lowercase
        if file_extension in file_processors:
            await file_processors[file_extension](file, enable_summarization, user)
            return {"message": f"{file.filename} has been uploaded.", "type": "success"}
        else:
            return {"message": f"{file.filename} is not supported.", "type": "error"}

backend/utils/vectors.py

@@ -1,4 +1,3 @@
-import hashlib
 import os
 from typing import Annotated, List, Tuple
@@ -9,7 +8,6 @@ from langchain.vectorstores import SupabaseVectorStore
 from llm.summarization import llm_summerize
 from logger import get_logger
 from pydantic import BaseModel
 from supabase import Client, create_client
 logger = get_logger(__name__)
@@ -27,17 +25,8 @@ summaries_vector_store = SupabaseVectorStore(
     supabase_client, embeddings, table_name="summaries")
-def compute_sha1_from_file(file_path):
-    with open(file_path, "rb") as file:
-        bytes = file.read()
-        readable_hash = compute_sha1_from_content(bytes)
-        return readable_hash
-def compute_sha1_from_content(content):
-    readable_hash = hashlib.sha1(content).hexdigest()
-    return readable_hash
 def common_dependencies():
     return {
@@ -51,14 +40,6 @@ def common_dependencies():
 CommonsDep = Annotated[dict, Depends(common_dependencies)]
-class ChatMessage(BaseModel):
-    model: str = "gpt-3.5-turbo"
-    question: str
-    # A list of tuples where each tuple is (speaker, text)
-    history: List[Tuple[str, str]]
-    temperature: float = 0.0
-    max_tokens: int = 256
-    use_summarization: bool = False
 def create_summary(document_id, content, metadata):
@@ -107,23 +88,5 @@ def similarity_search(query, table='match_summaries', top_k=5, threshold=0.5):
     ).execute()
     return summaries.data
-def get_file_size(file: UploadFile):
-    # move the cursor to the end of the file
-    file.file._file.seek(0, 2)
-    file_size = file.file._file.tell()  # Getting the size of the file
-    # move the cursor back to the beginning of the file
-    file.file.seek(0)
-    return file_size
-def convert_bytes(bytes, precision=2):
-    """Converts bytes into a human-friendly format."""
-    abbreviations = ['B', 'KB', 'MB']
-    if bytes <= 0:
-        return '0 B'
-    size = bytes
-    index = 0
-    while size >= 1024 and index < len(abbreviations) - 1:
-        size /= 1024
-        index += 1
-    return f'{size:.{precision}f} {abbreviations[index]}'

docker-compose.dev.yml (new file, +30)

@@ -0,0 +1,30 @@
version: "3"
services:
  frontend:
    env_file:
      - ./frontend/.env
    build:
      context: frontend
      dockerfile: Dockerfile.dev
    container_name: web
    restart: always
    volumes:
      - ./frontend/:/app
      - /app/node_modules
      - /app/.next
    ports:
      - 3000:3000
  backend:
    env_file:
      - ./backend/.env
    build:
      context: backend
      dockerfile: Dockerfile
    container_name: backend
    restart: always
    volumes:
      - ./backend/:/code/
      - ~/.config/gcloud:/root/.config/gcloud
    ports:
      - 5050:5050

docker-compose.yml

@@ -9,10 +9,6 @@ services:
       dockerfile: Dockerfile
     container_name: web
     restart: always
-    volumes:
-      - ./frontend/:/app
-      - /app/node_modules
-      - /app/.next
     ports:
       - 3000:3000
   backend:

frontend/Dockerfile

@@ -1,4 +1,5 @@
 FROM node:18-alpine
 # Install Python and essential build tools
 RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
 RUN python3 -m ensurepip
@@ -21,8 +22,11 @@ RUN yarn install
 # Copy the rest of our Next.js folder into /app
 COPY . .
+# Build the Next.js application
+RUN yarn build
 # Ensure port 3000 is accessible to our system
 EXPOSE 3000
-# Run yarn dev, as we would via the command line
-CMD ["yarn", "dev"]
+# Run yarn start, as we would via the command line
+CMD ["yarn", "start"]

frontend/Dockerfile.dev (new file, +28)

@@ -0,0 +1,28 @@
FROM node:18-alpine
# Install Python and essential build tools
RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
RUN python3 -m ensurepip
RUN pip3 install --no-cache --upgrade pip setuptools
# Create the directory on the node image
# where our Next.js app will live
RUN mkdir -p /app
# Set /app as the working directory
WORKDIR /app
# Copy package.json and yarn.lock
# to the /app working directory
COPY package*.json yarn.lock ./
# Install dependencies in /app
RUN yarn install
# Copy the rest of our Next.js folder into /app
COPY . .
# Ensure port 3000 is accessible to our system
EXPOSE 3000
# Run yarn dev, as we would via the command line
CMD ["yarn", "dev"]