From a3ca7ecb3746cb15bcb7c507cfb809cdbda6fa95 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Sat, 3 Jun 2023 23:12:42 +0200 Subject: [PATCH] Back/refacto files (#240) * feat(docker): added docker for prod * feat(refacto): moved to modules --- Makefile | 7 +++ README.md | 8 ++- backend/Dockerfile | 2 +- backend/{ => auth}/auth_bearer.py | 3 +- backend/{ => auth}/auth_handler.py | 0 backend/llm/qa.py | 2 +- backend/{api.py => main.py} | 84 ++++---------------------- backend/middlewares/cors.py | 21 +++++++ backend/models/chats.py | 13 ++++ backend/models/users.py | 5 ++ backend/parsers/audio.py | 3 +- backend/parsers/common.py | 6 +- backend/utils/file.py | 37 ++++++++++++ backend/utils/processors.py | 55 +++++++++++++++++ backend/{utils.py => utils/vectors.py} | 37 ------------ docker-compose.dev.yml | 30 +++++++++ docker-compose.yml | 4 -- frontend/Dockerfile | 8 ++- frontend/Dockerfile.dev | 28 +++++++++ 19 files changed, 230 insertions(+), 123 deletions(-) create mode 100644 Makefile rename backend/{ => auth}/auth_bearer.py (95%) rename backend/{ => auth}/auth_handler.py (100%) rename backend/{api.py => main.py} (72%) create mode 100644 backend/middlewares/cors.py create mode 100644 backend/models/chats.py create mode 100644 backend/models/users.py create mode 100644 backend/utils/file.py create mode 100644 backend/utils/processors.py rename backend/{utils.py => utils/vectors.py} (71%) create mode 100644 docker-compose.dev.yml create mode 100644 frontend/Dockerfile.dev diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..7fe8f3c0e --- /dev/null +++ b/Makefile @@ -0,0 +1,7 @@ + + +dev: + docker compose -f docker-compose.dev.yml up --build + +prod: + docker compose -f docker-compose.yml up --build \ No newline at end of file diff --git a/README.md b/README.md index bc68029a6..e069a6557 100644 --- a/README.md +++ b/README.md @@ -110,11 +110,17 @@ cp .frontend_env.example frontend/.env - **Step 5**: Launch the app ```bash -docker compose build && 
docker compose up +docker compose -f docker-compose.yml up --build ``` - **Step 6**: Navigate to `localhost:3000` in your browser +- **Step 7**: Want to contribute to the project? + +``` +docker compose -f docker-compose.dev.yml up --build +``` + ## Contributors ✨ diff --git a/backend/Dockerfile b/backend/Dockerfile index f47a37363..01c652f49 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -11,4 +11,4 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt --timeout 100 COPY . /code/ -CMD ["uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "5050"] \ No newline at end of file +CMD ["uvicorn", "main:app", "--reload", "--host", "0.0.0.0", "--port", "5050"] \ No newline at end of file diff --git a/backend/auth_bearer.py b/backend/auth/auth_bearer.py similarity index 95% rename from backend/auth_bearer.py rename to backend/auth/auth_bearer.py index e58e40082..1a4e784bc 100644 --- a/backend/auth_bearer.py +++ b/backend/auth/auth_bearer.py @@ -1,10 +1,11 @@ import os from typing import Optional -from auth_handler import decode_access_token from fastapi import HTTPException, Request from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer +from .auth_handler import decode_access_token + class JWTBearer(HTTPBearer): def __init__(self, auto_error: bool = True): diff --git a/backend/auth_handler.py b/backend/auth/auth_handler.py similarity index 100% rename from backend/auth_handler.py rename to backend/auth/auth_handler.py diff --git a/backend/llm/qa.py b/backend/llm/qa.py index 9575f3560..e4f321700 100644 --- a/backend/llm/qa.py +++ b/backend/llm/qa.py @@ -10,8 +10,8 @@ from langchain.llms import VertexAI from langchain.memory import ConversationBufferMemory from langchain.vectorstores import SupabaseVectorStore from llm import LANGUAGE_PROMPT +from models.chats import ChatMessage from supabase import Client, create_client -from utils import ChatMessage class CustomSupabaseVectorStore(SupabaseVectorStore): diff --git 
a/backend/api.py b/backend/main.py similarity index 72% rename from backend/api.py rename to backend/main.py index 9e70b4eb4..e20ea7023 100644 --- a/backend/api.py +++ b/backend/main.py @@ -4,51 +4,28 @@ import time from tempfile import SpooledTemporaryFile import pypandoc -from auth_bearer import JWTBearer +from auth.auth_bearer import JWTBearer from crawl.crawler import CrawlWebsite from fastapi import Depends, FastAPI, UploadFile -from fastapi.middleware.cors import CORSMiddleware from llm.qa import get_qa_llm from llm.summarization import llm_evaluate_summaries from logger import get_logger -from parsers.audio import process_audio -from parsers.common import file_already_exists -from parsers.csv import process_csv -from parsers.docx import process_docx -from parsers.epub import process_epub -from parsers.html import process_html -from parsers.markdown import process_markdown -from parsers.notebook import process_ipnyb -from parsers.odt import process_odt -from parsers.pdf import process_pdf -from parsers.powerpoint import process_powerpoint -from parsers.txt import process_txt +from middlewares.cors import add_cors_middleware +from models.chats import ChatMessage +from models.users import User from pydantic import BaseModel from supabase import Client -from utils import (ChatMessage, CommonsDep, convert_bytes, create_user, - get_file_size, similarity_search, update_user_request_count) +from utils.file import convert_bytes, get_file_size +from utils.processors import filter_file +from utils.vectors import (CommonsDep, create_user, similarity_search, + update_user_request_count) logger = get_logger(__name__) app = FastAPI() -origins = [ - "http://localhost", - "http://localhost:3000", - "https://quivr.app", - "https://www.quivr.app", - "http://quivr.app", - "http://www.quivr.app", - "*" -] -app.add_middleware( - CORSMiddleware, - allow_origins=origins, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) +add_cors_middleware(app) @@ -57,45 
+34,6 @@ async def startup_event(): pypandoc.download_pandoc() -file_processors = { - ".txt": process_txt, - ".csv": process_csv, - ".md": process_markdown, - ".markdown": process_markdown, - ".m4a": process_audio, - ".mp3": process_audio, - ".webm": process_audio, - ".mp4": process_audio, - ".mpga": process_audio, - ".wav": process_audio, - ".mpeg": process_audio, - ".pdf": process_pdf, - ".html": process_html, - ".pptx": process_powerpoint, - ".docx": process_docx, - ".odt": process_odt, - ".epub": process_epub, - ".ipynb": process_ipnyb, -} - - -class User (BaseModel): - email: str - - -async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User): - if await file_already_exists(supabase_client, file, user): - return {"message": f"🤔 {file.filename} already exists.", "type": "warning"} - elif file.file._file.tell() < 1: - return {"message": f"❌ {file.filename} is empty.", "type": "error"} - else: - file_extension = os.path.splitext(file.filename)[-1].lower() # Convert file extension to lowercase - if file_extension in file_processors: - await file_processors[file_extension](file, enable_summarization, user) - return {"message": f"✅ {file.filename} has been uploaded.", "type": "success"} - else: - return {"message": f"❌ {file.filename} is not supported.", "type": "error"} - @app.post("/upload", dependencies=[Depends(JWTBearer())]) @@ -221,7 +159,7 @@ async def delete_endpoint(commons: CommonsDep, file_name: str, credentials: dict async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dict = Depends(JWTBearer()) ): user = User(email=credentials.get('email', 'none')) response = commons['supabase'].table("vectors").select( - "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url").match({"metadata->>file_name": file_name, "user_id": user.email}).execute() + "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url", 
"content").match({"metadata->>file_name": file_name, "user_id": user.email}).execute() documents = response.data # Returns all documents with the same file name return {"documents": documents} @@ -229,4 +167,4 @@ async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dic @app.get("/") async def root(): - return {"message": "Hello World"} + return {"status": "OK"} diff --git a/backend/middlewares/cors.py b/backend/middlewares/cors.py new file mode 100644 index 000000000..b5fb52372 --- /dev/null +++ b/backend/middlewares/cors.py @@ -0,0 +1,21 @@ +from fastapi.middleware.cors import CORSMiddleware + +origins = [ + "http://localhost", + "http://localhost:3000", + "https://quivr.app", + "https://www.quivr.app", + "http://quivr.app", + "http://www.quivr.app", + "*" +] + + +def add_cors_middleware(app): + app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) diff --git a/backend/models/chats.py b/backend/models/chats.py new file mode 100644 index 000000000..929eb2dea --- /dev/null +++ b/backend/models/chats.py @@ -0,0 +1,13 @@ +from typing import List, Tuple + +from pydantic import BaseModel + + +class ChatMessage(BaseModel): + model: str = "gpt-3.5-turbo" + question: str + # A list of tuples where each tuple is (speaker, text) + history: List[Tuple[str, str]] + temperature: float = 0.0 + max_tokens: int = 256 + use_summarization: bool = False diff --git a/backend/models/users.py b/backend/models/users.py new file mode 100644 index 000000000..6b95efdca --- /dev/null +++ b/backend/models/users.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class User (BaseModel): + email: str diff --git a/backend/parsers/audio.py b/backend/parsers/audio.py index 8200d266d..e28e52b72 100644 --- a/backend/parsers/audio.py +++ b/backend/parsers/audio.py @@ -10,7 +10,8 @@ from langchain.document_loaders import TextLoader from langchain.embeddings.openai import OpenAIEmbeddings 
from langchain.schema import Document from langchain.text_splitter import RecursiveCharacterTextSplitter -from utils import compute_sha1_from_content, documents_vector_store +from utils.file import compute_sha1_from_content +from utils.vectors import documents_vector_store # # Create a function to transcribe audio using Whisper # def _transcribe_audio(api_key, audio_file, stats_db): diff --git a/backend/parsers/common.py b/backend/parsers/common.py index d62aea720..860a92bca 100644 --- a/backend/parsers/common.py +++ b/backend/parsers/common.py @@ -8,8 +8,8 @@ from typing import Optional from fastapi import UploadFile from langchain.schema import Document from langchain.text_splitter import RecursiveCharacterTextSplitter -from utils import (compute_sha1_from_content, compute_sha1_from_file, - create_summary, create_vector, documents_vector_store) +from utils.file import compute_sha1_from_content, compute_sha1_from_file +from utils.vectors import create_summary, create_vector, documents_vector_store async def process_file(file: UploadFile, loader_class, file_suffix, enable_summarization, user): @@ -52,6 +52,8 @@ async def process_file(file: UploadFile, loader_class, file_suffix, enable_summa doc_with_metadata = Document( page_content=doc.page_content, metadata=metadata) create_vector(user.email, doc_with_metadata) + # add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap}) + if enable_summarization and ids and len(ids) > 0: create_summary(ids[0], doc.page_content, metadata) return diff --git a/backend/utils/file.py b/backend/utils/file.py new file mode 100644 index 000000000..0ba9de8f7 --- /dev/null +++ b/backend/utils/file.py @@ -0,0 +1,37 @@ +import hashlib + +from fastapi import UploadFile + + +def convert_bytes(bytes, precision=2): + """Converts bytes into a human-friendly format.""" + abbreviations = ['B', 'KB', 'MB'] + if bytes <= 0: + return '0 B' + size = 
bytes + index = 0 + while size >= 1024 and index < len(abbreviations) - 1: + size /= 1024 + index += 1 + return f'{size:.{precision}f} {abbreviations[index]}' + +def get_file_size(file: UploadFile): + # move the cursor to the end of the file + file.file._file.seek(0, 2) + file_size = file.file._file.tell() # Getting the size of the file + # move the cursor back to the beginning of the file + file.file.seek(0) + + return file_size + + +def compute_sha1_from_file(file_path): + with open(file_path, "rb") as file: + bytes = file.read() + readable_hash = compute_sha1_from_content(bytes) + return readable_hash + + +def compute_sha1_from_content(content): + readable_hash = hashlib.sha1(content).hexdigest() + return readable_hash \ No newline at end of file diff --git a/backend/utils/processors.py b/backend/utils/processors.py new file mode 100644 index 000000000..28e9076cf --- /dev/null +++ b/backend/utils/processors.py @@ -0,0 +1,55 @@ +import os + +from fastapi import Depends, FastAPI, UploadFile +from models.users import User +from parsers.audio import process_audio +from parsers.common import file_already_exists +from parsers.csv import process_csv +from parsers.docx import process_docx +from parsers.epub import process_epub +from parsers.html import process_html +from parsers.markdown import process_markdown +from parsers.notebook import process_ipnyb +from parsers.odt import process_odt +from parsers.pdf import process_pdf +from parsers.powerpoint import process_powerpoint +from parsers.txt import process_txt +from supabase import Client + +file_processors = { + ".txt": process_txt, + ".csv": process_csv, + ".md": process_markdown, + ".markdown": process_markdown, + ".m4a": process_audio, + ".mp3": process_audio, + ".webm": process_audio, + ".mp4": process_audio, + ".mpga": process_audio, + ".wav": process_audio, + ".mpeg": process_audio, + ".pdf": process_pdf, + ".html": process_html, + ".pptx": process_powerpoint, + ".docx": process_docx, + ".odt": process_odt, + 
".epub": process_epub, + ".ipynb": process_ipnyb, +} + + + + +async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User): + if await file_already_exists(supabase_client, file, user): + return {"message": f"🤔 {file.filename} already exists.", "type": "warning"} + elif file.file._file.tell() < 1: + return {"message": f"❌ {file.filename} is empty.", "type": "error"} + else: + file_extension = os.path.splitext(file.filename)[-1].lower() # Convert file extension to lowercase + if file_extension in file_processors: + await file_processors[file_extension](file, enable_summarization, user) + return {"message": f"✅ {file.filename} has been uploaded.", "type": "success"} + else: + return {"message": f"❌ {file.filename} is not supported.", "type": "error"} + diff --git a/backend/utils.py b/backend/utils/vectors.py similarity index 71% rename from backend/utils.py rename to backend/utils/vectors.py index dca8a6055..4f00464f6 100644 --- a/backend/utils.py +++ b/backend/utils/vectors.py @@ -1,4 +1,3 @@ -import hashlib import os from typing import Annotated, List, Tuple @@ -9,7 +8,6 @@ from langchain.vectorstores import SupabaseVectorStore from llm.summarization import llm_summerize from logger import get_logger from pydantic import BaseModel - from supabase import Client, create_client logger = get_logger(__name__) @@ -27,17 +25,8 @@ summaries_vector_store = SupabaseVectorStore( supabase_client, embeddings, table_name="summaries") -def compute_sha1_from_file(file_path): - with open(file_path, "rb") as file: - bytes = file.read() - readable_hash = compute_sha1_from_content(bytes) - return readable_hash -def compute_sha1_from_content(content): - readable_hash = hashlib.sha1(content).hexdigest() - return readable_hash - def common_dependencies(): return { @@ -51,14 +40,6 @@ def common_dependencies(): CommonsDep = Annotated[dict, Depends(common_dependencies)] -class ChatMessage(BaseModel): - model: str = "gpt-3.5-turbo" - question: str - 
# A list of tuples where each tuple is (speaker, text) - history: List[Tuple[str, str]] - temperature: float = 0.0 - max_tokens: int = 256 - use_summarization: bool = False def create_summary(document_id, content, metadata): @@ -107,23 +88,5 @@ def similarity_search(query, table='match_summaries', top_k=5, threshold=0.5): ).execute() return summaries.data -def get_file_size(file: UploadFile): - # move the cursor to the end of the file - file.file._file.seek(0, 2) - file_size = file.file._file.tell() # Getting the size of the file - # move the cursor back to the beginning of the file - file.file.seek(0) - return file_size -def convert_bytes(bytes, precision=2): - """Converts bytes into a human-friendly format.""" - abbreviations = ['B', 'KB', 'MB'] - if bytes <= 0: - return '0 B' - size = bytes - index = 0 - while size >= 1024 and index < len(abbreviations) - 1: - size /= 1024 - index += 1 - return f'{size:.{precision}f} {abbreviations[index]}' diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 000000000..6da95e7ac --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,30 @@ +version: "3" + +services: + frontend: + env_file: + - ./frontend/.env + build: + context: frontend + dockerfile: Dockerfile.dev + container_name: web + restart: always + volumes: + - ./frontend/:/app + - /app/node_modules + - /app/.next + ports: + - 3000:3000 + backend: + env_file: + - ./backend/.env + build: + context: backend + dockerfile: Dockerfile + container_name: backend + restart: always + volumes: + - ./backend/:/code/ + - ~/.config/gcloud:/root/.config/gcloud + ports: + - 5050:5050 \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index c565f5105..59dca0b8a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,10 +9,6 @@ services: dockerfile: Dockerfile container_name: web restart: always - volumes: - - ./frontend/:/app - - /app/node_modules - - /app/.next ports: - 3000:3000 backend: diff --git 
a/frontend/Dockerfile b/frontend/Dockerfile index 91dc306af..9750eabc7 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,4 +1,5 @@ FROM node:18-alpine + # Install Python and essential build tools RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python RUN python3 -m ensurepip @@ -21,8 +22,11 @@ RUN yarn install # Copy the rest of our Next.js folder into /app COPY . . +# Build the Next.js application +RUN yarn build + # Ensure port 3000 is accessible to our system EXPOSE 3000 -# Run yarn dev, as we would via the command line -CMD ["yarn", "dev"] +# Run yarn start, as we would via the command line +CMD ["yarn", "start"] \ No newline at end of file diff --git a/frontend/Dockerfile.dev b/frontend/Dockerfile.dev new file mode 100644 index 000000000..91dc306af --- /dev/null +++ b/frontend/Dockerfile.dev @@ -0,0 +1,28 @@ +FROM node:18-alpine +# Install Python and essential build tools +RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python +RUN python3 -m ensurepip +RUN pip3 install --no-cache --upgrade pip setuptools + +# Create the directory on the node image +# where our Next.js app will live +RUN mkdir -p /app + +# Set /app as the working directory +WORKDIR /app + +# Copy package.json and yarn.lock +# to the /app working directory +COPY package*.json yarn.lock ./ + +# Install dependencies in /app +RUN yarn install + +# Copy the rest of our Next.js folder into /app +COPY . . + +# Ensure port 3000 is accessible to our system +EXPOSE 3000 + +# Run yarn dev, as we would via the command line +CMD ["yarn", "dev"]