Back/refacto files (#240)

* feat(docker): added docker for prod

* feat(refacto): moved to modules
Stan Girard 2023-06-03 23:12:42 +02:00 committed by GitHub
parent 59c02228b6
commit a3ca7ecb37
19 changed files with 230 additions and 123 deletions

Makefile (new file, +7 lines)

@@ -0,0 +1,7 @@
dev:
docker compose -f docker-compose.dev.yml up --build
prod:
docker compose -f docker-compose.yml up --build


@@ -110,11 +110,17 @@ cp .frontend_env.example frontend/.env
- **Step 5**: Launch the app
```bash
docker compose build && docker compose up
docker compose -f docker-compose.yml up --build
```
- **Step 6**: Navigate to `localhost:3000` in your browser
- **Step 7**: Want to contribute to the project?
```
docker compose -f docker-compose.dev.yml up --build
```
## Contributors ✨


@@ -11,4 +11,4 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt --timeout 100
COPY . /code/
CMD ["uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]
CMD ["uvicorn", "main:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]


@@ -1,10 +1,11 @@
import os
from typing import Optional
from auth_handler import decode_access_token
from fastapi import HTTPException, Request
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from .auth_handler import decode_access_token
class JWTBearer(HTTPBearer):
def __init__(self, auto_error: bool = True):


@@ -10,8 +10,8 @@ from langchain.llms import VertexAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import SupabaseVectorStore
from llm import LANGUAGE_PROMPT
from models.chats import ChatMessage
from supabase import Client, create_client
from utils import ChatMessage
class CustomSupabaseVectorStore(SupabaseVectorStore):


@@ -4,51 +4,28 @@ import time
from tempfile import SpooledTemporaryFile
import pypandoc
from auth_bearer import JWTBearer
from auth.auth_bearer import JWTBearer
from crawl.crawler import CrawlWebsite
from fastapi import Depends, FastAPI, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from llm.qa import get_qa_llm
from llm.summarization import llm_evaluate_summaries
from logger import get_logger
from parsers.audio import process_audio
from parsers.common import file_already_exists
from parsers.csv import process_csv
from parsers.docx import process_docx
from parsers.epub import process_epub
from parsers.html import process_html
from parsers.markdown import process_markdown
from parsers.notebook import process_ipnyb
from parsers.odt import process_odt
from parsers.pdf import process_pdf
from parsers.powerpoint import process_powerpoint
from parsers.txt import process_txt
from middlewares.cors import add_cors_middleware
from models.chats import ChatMessage
from models.users import User
from pydantic import BaseModel
from supabase import Client
from utils import (ChatMessage, CommonsDep, convert_bytes, create_user,
get_file_size, similarity_search, update_user_request_count)
from utils.file import convert_bytes, get_file_size
from utils.processors import filter_file
from utils.vectors import (CommonsDep, create_user, similarity_search,
update_user_request_count)
logger = get_logger(__name__)
app = FastAPI()
origins = [
"http://localhost",
"http://localhost:3000",
"https://quivr.app",
"https://www.quivr.app",
"http://quivr.app",
"http://www.quivr.app",
"*"
]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
add_cors_middleware(app)
@@ -57,45 +34,6 @@ async def startup_event():
pypandoc.download_pandoc()
file_processors = {
".txt": process_txt,
".csv": process_csv,
".md": process_markdown,
".markdown": process_markdown,
".m4a": process_audio,
".mp3": process_audio,
".webm": process_audio,
".mp4": process_audio,
".mpga": process_audio,
".wav": process_audio,
".mpeg": process_audio,
".pdf": process_pdf,
".html": process_html,
".pptx": process_powerpoint,
".docx": process_docx,
".odt": process_odt,
".epub": process_epub,
".ipynb": process_ipnyb,
}
class User (BaseModel):
email: str
async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
if await file_already_exists(supabase_client, file, user):
return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
elif file.file._file.tell() < 1:
return {"message": f"{file.filename} is empty.", "type": "error"}
else:
file_extension = os.path.splitext(file.filename)[-1].lower() # Convert file extension to lowercase
if file_extension in file_processors:
await file_processors[file_extension](file, enable_summarization, user)
return {"message": f"{file.filename} has been uploaded.", "type": "success"}
else:
return {"message": f"{file.filename} is not supported.", "type": "error"}
@app.post("/upload", dependencies=[Depends(JWTBearer())])
@@ -221,7 +159,7 @@ async def delete_endpoint(commons: CommonsDep, file_name: str, credentials: dict
async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dict = Depends(JWTBearer()) ):
user = User(email=credentials.get('email', 'none'))
response = commons['supabase'].table("vectors").select(
"metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
"metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url", "content").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
documents = response.data
# Returns all documents with the same file name
return {"documents": documents}
@@ -229,4 +167,4 @@ async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dic
@app.get("/")
async def root():
return {"message": "Hello World"}
return {"status": "OK"}


@@ -0,0 +1,21 @@
from fastapi.middleware.cors import CORSMiddleware
origins = [
"http://localhost",
"http://localhost:3000",
"https://quivr.app",
"https://www.quivr.app",
"http://quivr.app",
"http://www.quivr.app",
"*"
]
def add_cors_middleware(app):
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
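This new `middlewares.cors` helper extracts the CORS setup that previously lived inline in `main.py`; the app now registers it with a single call. A minimal sketch of the resulting wiring, mirroring the `add_cors_middleware(app)` call shown in the `main.py` diff above:

```python
# Minimal wiring sketch: mirrors the import and call now used in backend/main.py.
from fastapi import FastAPI

from middlewares.cors import add_cors_middleware

app = FastAPI()
add_cors_middleware(app)  # attaches CORSMiddleware with the shared `origins` list
```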

backend/models/chats.py (new file, +13 lines)

@@ -0,0 +1,13 @@
from typing import List, Tuple
from pydantic import BaseModel
class ChatMessage(BaseModel):
model: str = "gpt-3.5-turbo"
question: str
# A list of tuples where each tuple is (speaker, text)
history: List[Tuple[str, str]]
temperature: float = 0.0
max_tokens: int = 256
use_summarization: bool = False
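`ChatMessage` moves out of the old monolithic `utils` module into its own Pydantic model. A small illustrative sketch of constructing one; the question and history values are invented for the example:

```python
# Illustrative only: sample values are made up to show the fields and defaults.
from models.chats import ChatMessage

msg = ChatMessage(
    question="What does this repository do?",
    history=[("user", "Hi"), ("assistant", "Hello!")],  # (speaker, text) tuples
)
print(msg.model, msg.temperature, msg.max_tokens)  # gpt-3.5-turbo 0.0 256
```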

backend/models/users.py (new file, +5 lines)

@@ -0,0 +1,5 @@
from pydantic import BaseModel
class User (BaseModel):
email: str


@@ -10,7 +10,8 @@ from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import compute_sha1_from_content, documents_vector_store
from utils.file import compute_sha1_from_content
from utils.vectors import documents_vector_store
# # Create a function to transcribe audio using Whisper
# def _transcribe_audio(api_key, audio_file, stats_db):


@@ -8,8 +8,8 @@ from typing import Optional
from fastapi import UploadFile
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import (compute_sha1_from_content, compute_sha1_from_file,
create_summary, create_vector, documents_vector_store)
from utils.file import compute_sha1_from_content, compute_sha1_from_file
from utils.vectors import create_summary, create_vector, documents_vector_store
async def process_file(file: UploadFile, loader_class, file_suffix, enable_summarization, user):
@@ -52,6 +52,8 @@ async def process_file(file: UploadFile, loader_class, file_suffix, enable_summa
doc_with_metadata = Document(
page_content=doc.page_content, metadata=metadata)
create_vector(user.email, doc_with_metadata)
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
if enable_summarization and ids and len(ids) > 0:
create_summary(ids[0], doc.page_content, metadata)
return

backend/utils/file.py (new file, +37 lines)

@@ -0,0 +1,37 @@
import hashlib
from fastapi import UploadFile
def convert_bytes(bytes, precision=2):
"""Converts bytes into a human-friendly format."""
abbreviations = ['B', 'KB', 'MB']
if bytes <= 0:
return '0 B'
size = bytes
index = 0
while size >= 1024 and index < len(abbreviations) - 1:
size /= 1024
index += 1
return f'{size:.{precision}f} {abbreviations[index]}'
def get_file_size(file: UploadFile):
# move the cursor to the end of the file
file.file._file.seek(0, 2)
file_size = file.file._file.tell() # Getting the size of the file
# move the cursor back to the beginning of the file
file.file.seek(0)
return file_size
def compute_sha1_from_file(file_path):
with open(file_path, "rb") as file:
bytes = file.read()
readable_hash = compute_sha1_from_content(bytes)
return readable_hash
def compute_sha1_from_content(content):
readable_hash = hashlib.sha1(content).hexdigest()
return readable_hash
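A quick illustration of the helpers in the new `utils/file.py`; the byte counts are arbitrary example values:

```python
# Example values chosen only to demonstrate the helpers above.
from utils.file import compute_sha1_from_content, convert_bytes

print(convert_bytes(1536))                   # '1.50 KB'
print(convert_bytes(0))                      # '0 B'
print(compute_sha1_from_content(b"hello"))   # 40-character hex SHA-1 digest
```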


@@ -0,0 +1,55 @@
import os
from fastapi import Depends, FastAPI, UploadFile
from models.users import User
from parsers.audio import process_audio
from parsers.common import file_already_exists
from parsers.csv import process_csv
from parsers.docx import process_docx
from parsers.epub import process_epub
from parsers.html import process_html
from parsers.markdown import process_markdown
from parsers.notebook import process_ipnyb
from parsers.odt import process_odt
from parsers.pdf import process_pdf
from parsers.powerpoint import process_powerpoint
from parsers.txt import process_txt
from supabase import Client
file_processors = {
".txt": process_txt,
".csv": process_csv,
".md": process_markdown,
".markdown": process_markdown,
".m4a": process_audio,
".mp3": process_audio,
".webm": process_audio,
".mp4": process_audio,
".mpga": process_audio,
".wav": process_audio,
".mpeg": process_audio,
".pdf": process_pdf,
".html": process_html,
".pptx": process_powerpoint,
".docx": process_docx,
".odt": process_odt,
".epub": process_epub,
".ipynb": process_ipnyb,
}
async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
if await file_already_exists(supabase_client, file, user):
return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
elif file.file._file.tell() < 1:
return {"message": f"{file.filename} is empty.", "type": "error"}
else:
file_extension = os.path.splitext(file.filename)[-1].lower() # Convert file extension to lowercase
if file_extension in file_processors:
await file_processors[file_extension](file, enable_summarization, user)
return {"message": f"{file.filename} has been uploaded.", "type": "success"}
else:
return {"message": f"{file.filename} is not supported.", "type": "error"}
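Judging by the `from utils.processors import filter_file` import added to `main.py`, this new module is `utils/processors.py`, and the upload endpoint now delegates to `filter_file`. A hedged sketch of such a call; the function name and the `commons` plumbing are assumptions for illustration:

```python
# Hedged sketch: the handler name and the commons/client plumbing are assumptions.
from fastapi import UploadFile

from models.users import User
from utils.processors import filter_file
from utils.vectors import CommonsDep

async def upload_file(commons: CommonsDep, file: UploadFile, email: str):
    user = User(email=email)
    return await filter_file(
        file,
        enable_summarization=False,
        supabase_client=commons["supabase"],
        user=user,
    )  # e.g. {"message": "... has been uploaded.", "type": "success"}
```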


@@ -1,4 +1,3 @@
import hashlib
import os
from typing import Annotated, List, Tuple
@@ -9,7 +8,6 @@ from langchain.vectorstores import SupabaseVectorStore
from llm.summarization import llm_summerize
from logger import get_logger
from pydantic import BaseModel
from supabase import Client, create_client
logger = get_logger(__name__)
@@ -27,17 +25,8 @@ summaries_vector_store = SupabaseVectorStore(
supabase_client, embeddings, table_name="summaries")
def compute_sha1_from_file(file_path):
with open(file_path, "rb") as file:
bytes = file.read()
readable_hash = compute_sha1_from_content(bytes)
return readable_hash
def compute_sha1_from_content(content):
readable_hash = hashlib.sha1(content).hexdigest()
return readable_hash
def common_dependencies():
return {
@@ -51,14 +40,6 @@ def common_dependencies():
CommonsDep = Annotated[dict, Depends(common_dependencies)]
class ChatMessage(BaseModel):
model: str = "gpt-3.5-turbo"
question: str
# A list of tuples where each tuple is (speaker, text)
history: List[Tuple[str, str]]
temperature: float = 0.0
max_tokens: int = 256
use_summarization: bool = False
def create_summary(document_id, content, metadata):
@@ -107,23 +88,5 @@ def similarity_search(query, table='match_summaries', top_k=5, threshold=0.5):
).execute()
return summaries.data
def get_file_size(file: UploadFile):
# move the cursor to the end of the file
file.file._file.seek(0, 2)
file_size = file.file._file.tell() # Getting the size of the file
# move the cursor back to the beginning of the file
file.file.seek(0)
return file_size
def convert_bytes(bytes, precision=2):
"""Converts bytes into a human-friendly format."""
abbreviations = ['B', 'KB', 'MB']
if bytes <= 0:
return '0 B'
size = bytes
index = 0
while size >= 1024 and index < len(abbreviations) - 1:
size /= 1024
index += 1
return f'{size:.{precision}f} {abbreviations[index]}'

docker-compose.dev.yml (new file, +30 lines)

@@ -0,0 +1,30 @@
version: "3"
services:
frontend:
env_file:
- ./frontend/.env
build:
context: frontend
dockerfile: Dockerfile.dev
container_name: web
restart: always
volumes:
- ./frontend/:/app
- /app/node_modules
- /app/.next
ports:
- 3000:3000
backend:
env_file:
- ./backend/.env
build:
context: backend
dockerfile: Dockerfile
container_name: backend
restart: always
volumes:
- ./backend/:/code/
- ~/.config/gcloud:/root/.config/gcloud
ports:
- 5050:5050


@@ -9,10 +9,6 @@ services:
dockerfile: Dockerfile
container_name: web
restart: always
volumes:
- ./frontend/:/app
- /app/node_modules
- /app/.next
ports:
- 3000:3000
backend:


@@ -1,4 +1,5 @@
FROM node:18-alpine
# Install Python and essential build tools
RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
RUN python3 -m ensurepip
@@ -21,8 +22,11 @@ RUN yarn install
# Copy the rest of our Next.js folder into /app
COPY . .
# Build the Next.js application
RUN yarn build
# Ensure port 3000 is accessible to our system
EXPOSE 3000
# Run yarn dev, as we would via the command line
CMD ["yarn", "dev"]
# Run yarn start, as we would via the command line
CMD ["yarn", "start"]

frontend/Dockerfile.dev (new file, +28 lines)

@@ -0,0 +1,28 @@
FROM node:18-alpine
# Install Python and essential build tools
RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
RUN python3 -m ensurepip
RUN pip3 install --no-cache --upgrade pip setuptools
# Create the directory on the node image
# where our Next.js app will live
RUN mkdir -p /app
# Set /app as the working directory
WORKDIR /app
# Copy package.json and yarn.lock
# to the /app working directory
COPY package*.json yarn.lock ./
# Install dependencies in /app
RUN yarn install
# Copy the rest of our Next.js folder into /app
COPY . .
# Ensure port 3000 is accessible to our system
EXPOSE 3000
# Run yarn dev, as we would via the command line
CMD ["yarn", "dev"]