From a3ca7ecb3746cb15bcb7c507cfb809cdbda6fa95 Mon Sep 17 00:00:00 2001 From: Stan Girard Date: Sat, 3 Jun 2023 23:12:42 +0200 Subject: [PATCH] Back/refacto files (#240) * feat(docker): added docker for prod * feat(refacto): moved to modules --- Makefile | 7 +++ README.md | 8 ++- backend/Dockerfile | 2 +- backend/{ => auth}/auth_bearer.py | 3 +- backend/{ => auth}/auth_handler.py | 0 backend/llm/qa.py | 2 +- backend/{api.py => main.py} | 84 ++++---------------------- backend/middlewares/cors.py | 21 +++++++ backend/models/chats.py | 13 ++++ backend/models/users.py | 5 ++ backend/parsers/audio.py | 3 +- backend/parsers/common.py | 6 +- backend/utils/file.py | 37 ++++++++++++ backend/utils/processors.py | 55 +++++++++++++++++ backend/{utils.py => utils/vectors.py} | 37 ------------ docker-compose.dev.yml | 30 +++++++++ docker-compose.yml | 4 -- frontend/Dockerfile | 8 ++- frontend/Dockerfile.dev | 28 +++++++++ 19 files changed, 230 insertions(+), 123 deletions(-) create mode 100644 Makefile rename backend/{ => auth}/auth_bearer.py (95%) rename backend/{ => auth}/auth_handler.py (100%) rename backend/{api.py => main.py} (72%) create mode 100644 backend/middlewares/cors.py create mode 100644 backend/models/chats.py create mode 100644 backend/models/users.py create mode 100644 backend/utils/file.py create mode 100644 backend/utils/processors.py rename backend/{utils.py => utils/vectors.py} (71%) create mode 100644 docker-compose.dev.yml create mode 100644 frontend/Dockerfile.dev diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..7fe8f3c0e --- /dev/null +++ b/Makefile @@ -0,0 +1,7 @@ + + +dev: + docker compose -f docker-compose.dev.yml up --build + +prod: + docker compose -f docker-compose.yml up --build \ No newline at end of file diff --git a/README.md b/README.md index bc68029a6..e069a6557 100644 --- a/README.md +++ b/README.md @@ -110,11 +110,17 @@ cp .frontend_env.example frontend/.env - **Step 5**: Launch the app ```bash -docker compose build && 
docker compose up +docker compose -f docker-compose.yml up --build ``` - **Step 6**: Navigate to `localhost:3000` in your browser +- **Step 7**: Want to contribute to the project? + +``` +docker compose -f docker-compose.dev.yml up --build +``` + ## Contributors ✨ diff --git a/backend/Dockerfile b/backend/Dockerfile index f47a37363..01c652f49 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -11,4 +11,4 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt --timeout 100 COPY . /code/ -CMD ["uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "5050"] \ No newline at end of file +CMD ["uvicorn", "main:app", "--reload", "--host", "0.0.0.0", "--port", "5050"] \ No newline at end of file diff --git a/backend/auth_bearer.py b/backend/auth/auth_bearer.py similarity index 95% rename from backend/auth_bearer.py rename to backend/auth/auth_bearer.py index e58e40082..1a4e784bc 100644 --- a/backend/auth_bearer.py +++ b/backend/auth/auth_bearer.py @@ -1,10 +1,11 @@ import os from typing import Optional -from auth_handler import decode_access_token from fastapi import HTTPException, Request from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer +from .auth_handler import decode_access_token + class JWTBearer(HTTPBearer): def __init__(self, auto_error: bool = True): diff --git a/backend/auth_handler.py b/backend/auth/auth_handler.py similarity index 100% rename from backend/auth_handler.py rename to backend/auth/auth_handler.py diff --git a/backend/llm/qa.py b/backend/llm/qa.py index 9575f3560..e4f321700 100644 --- a/backend/llm/qa.py +++ b/backend/llm/qa.py @@ -10,8 +10,8 @@ from langchain.llms import VertexAI from langchain.memory import ConversationBufferMemory from langchain.vectorstores import SupabaseVectorStore from llm import LANGUAGE_PROMPT +from models.chats import ChatMessage from supabase import Client, create_client -from utils import ChatMessage class CustomSupabaseVectorStore(SupabaseVectorStore): diff --git 
a/backend/api.py b/backend/main.py similarity index 72% rename from backend/api.py rename to backend/main.py index 9e70b4eb4..e20ea7023 100644 --- a/backend/api.py +++ b/backend/main.py @@ -4,51 +4,28 @@ import time from tempfile import SpooledTemporaryFile import pypandoc -from auth_bearer import JWTBearer +from auth.auth_bearer import JWTBearer from crawl.crawler import CrawlWebsite from fastapi import Depends, FastAPI, UploadFile -from fastapi.middleware.cors import CORSMiddleware from llm.qa import get_qa_llm from llm.summarization import llm_evaluate_summaries from logger import get_logger -from parsers.audio import process_audio -from parsers.common import file_already_exists -from parsers.csv import process_csv -from parsers.docx import process_docx -from parsers.epub import process_epub -from parsers.html import process_html -from parsers.markdown import process_markdown -from parsers.notebook import process_ipnyb -from parsers.odt import process_odt -from parsers.pdf import process_pdf -from parsers.powerpoint import process_powerpoint -from parsers.txt import process_txt +from middlewares.cors import add_cors_middleware +from models.chats import ChatMessage +from models.users import User from pydantic import BaseModel from supabase import Client -from utils import (ChatMessage, CommonsDep, convert_bytes, create_user, - get_file_size, similarity_search, update_user_request_count) +from utils.file import convert_bytes, get_file_size +from utils.processors import filter_file +from utils.vectors import (CommonsDep, create_user, similarity_search, + update_user_request_count) logger = get_logger(__name__) app = FastAPI() -origins = [ - "http://localhost", - "http://localhost:3000", - "https://quivr.app", - "https://www.quivr.app", - "http://quivr.app", - "http://www.quivr.app", - "*" -] -app.add_middleware( - CORSMiddleware, - allow_origins=origins, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) +add_cors_middleware(app) @@ -57,45 
+34,6 @@ async def startup_event(): pypandoc.download_pandoc() -file_processors = { - ".txt": process_txt, - ".csv": process_csv, - ".md": process_markdown, - ".markdown": process_markdown, - ".m4a": process_audio, - ".mp3": process_audio, - ".webm": process_audio, - ".mp4": process_audio, - ".mpga": process_audio, - ".wav": process_audio, - ".mpeg": process_audio, - ".pdf": process_pdf, - ".html": process_html, - ".pptx": process_powerpoint, - ".docx": process_docx, - ".odt": process_odt, - ".epub": process_epub, - ".ipynb": process_ipnyb, -} - - -class User (BaseModel): - email: str - - -async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User): - if await file_already_exists(supabase_client, file, user): - return {"message": f"🤔 {file.filename} already exists.", "type": "warning"} - elif file.file._file.tell() < 1: - return {"message": f"❌ {file.filename} is empty.", "type": "error"} - else: - file_extension = os.path.splitext(file.filename)[-1].lower() # Convert file extension to lowercase - if file_extension in file_processors: - await file_processors[file_extension](file, enable_summarization, user) - return {"message": f"✅ {file.filename} has been uploaded.", "type": "success"} - else: - return {"message": f"❌ {file.filename} is not supported.", "type": "error"} - @app.post("/upload", dependencies=[Depends(JWTBearer())]) @@ -221,7 +159,7 @@ async def delete_endpoint(commons: CommonsDep, file_name: str, credentials: dict async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dict = Depends(JWTBearer()) ): user = User(email=credentials.get('email', 'none')) response = commons['supabase'].table("vectors").select( - "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url").match({"metadata->>file_name": file_name, "user_id": user.email}).execute() + "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url", 
"content").match({"metadata->>file_name": file_name, "user_id": user.email}).execute() documents = response.data # Returns all documents with the same file name return {"documents": documents} @@ -229,4 +167,4 @@ async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dic @app.get("/") async def root(): - return {"message": "Hello World"} + return {"status": "OK"} diff --git a/backend/middlewares/cors.py b/backend/middlewares/cors.py new file mode 100644 index 000000000..b5fb52372 --- /dev/null +++ b/backend/middlewares/cors.py @@ -0,0 +1,21 @@ +from fastapi.middleware.cors import CORSMiddleware + +origins = [ + "http://localhost", + "http://localhost:3000", + "https://quivr.app", + "https://www.quivr.app", + "http://quivr.app", + "http://www.quivr.app", + "*" +] + + +def add_cors_middleware(app): + app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) diff --git a/backend/models/chats.py b/backend/models/chats.py new file mode 100644 index 000000000..929eb2dea --- /dev/null +++ b/backend/models/chats.py @@ -0,0 +1,13 @@ +from typing import List, Tuple + +from pydantic import BaseModel + + +class ChatMessage(BaseModel): + model: str = "gpt-3.5-turbo" + question: str + # A list of tuples where each tuple is (speaker, text) + history: List[Tuple[str, str]] + temperature: float = 0.0 + max_tokens: int = 256 + use_summarization: bool = False diff --git a/backend/models/users.py b/backend/models/users.py new file mode 100644 index 000000000..6b95efdca --- /dev/null +++ b/backend/models/users.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class User (BaseModel): + email: str diff --git a/backend/parsers/audio.py b/backend/parsers/audio.py index 8200d266d..e28e52b72 100644 --- a/backend/parsers/audio.py +++ b/backend/parsers/audio.py @@ -10,7 +10,8 @@ from langchain.document_loaders import TextLoader from langchain.embeddings.openai import OpenAIEmbeddings 
from langchain.schema import Document from langchain.text_splitter import RecursiveCharacterTextSplitter -from utils import compute_sha1_from_content, documents_vector_store +from utils.file import compute_sha1_from_content +from utils.vectors import documents_vector_store # # Create a function to transcribe audio using Whisper # def _transcribe_audio(api_key, audio_file, stats_db): diff --git a/backend/parsers/common.py b/backend/parsers/common.py index d62aea720..860a92bca 100644 --- a/backend/parsers/common.py +++ b/backend/parsers/common.py @@ -8,8 +8,8 @@ from typing import Optional from fastapi import UploadFile from langchain.schema import Document from langchain.text_splitter import RecursiveCharacterTextSplitter -from utils import (compute_sha1_from_content, compute_sha1_from_file, - create_summary, create_vector, documents_vector_store) +from utils.file import compute_sha1_from_content, compute_sha1_from_file +from utils.vectors import create_summary, create_vector, documents_vector_store async def process_file(file: UploadFile, loader_class, file_suffix, enable_summarization, user): @@ -52,6 +52,8 @@ async def process_file(file: UploadFile, loader_class, file_suffix, enable_summa doc_with_metadata = Document( page_content=doc.page_content, metadata=metadata) create_vector(user.email, doc_with_metadata) + # add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap}) + if enable_summarization and ids and len(ids) > 0: create_summary(ids[0], doc.page_content, metadata) return diff --git a/backend/utils/file.py b/backend/utils/file.py new file mode 100644 index 000000000..0ba9de8f7 --- /dev/null +++ b/backend/utils/file.py @@ -0,0 +1,37 @@ +import hashlib + +from fastapi import UploadFile + + +def convert_bytes(bytes, precision=2): + """Converts bytes into a human-friendly format.""" + abbreviations = ['B', 'KB', 'MB'] + if bytes <= 0: + return '0 B' + size = 
bytes + index = 0 + while size >= 1024 and index < len(abbreviations) - 1: + size /= 1024 + index += 1 + return f'{size:.{precision}f} {abbreviations[index]}' + +def get_file_size(file: UploadFile): + # move the cursor to the end of the file + file.file._file.seek(0, 2) + file_size = file.file._file.tell() # Getting the size of the file + # move the cursor back to the beginning of the file + file.file.seek(0) + + return file_size + + +def compute_sha1_from_file(file_path): + with open(file_path, "rb") as file: + bytes = file.read() + readable_hash = compute_sha1_from_content(bytes) + return readable_hash + + +def compute_sha1_from_content(content): + readable_hash = hashlib.sha1(content).hexdigest() + return readable_hash \ No newline at end of file diff --git a/backend/utils/processors.py b/backend/utils/processors.py new file mode 100644 index 000000000..28e9076cf --- /dev/null +++ b/backend/utils/processors.py @@ -0,0 +1,55 @@ +import os + +from fastapi import Depends, FastAPI, UploadFile +from models.users import User +from parsers.audio import process_audio +from parsers.common import file_already_exists +from parsers.csv import process_csv +from parsers.docx import process_docx +from parsers.epub import process_epub +from parsers.html import process_html +from parsers.markdown import process_markdown +from parsers.notebook import process_ipnyb +from parsers.odt import process_odt +from parsers.pdf import process_pdf +from parsers.powerpoint import process_powerpoint +from parsers.txt import process_txt +from supabase import Client + +file_processors = { + ".txt": process_txt, + ".csv": process_csv, + ".md": process_markdown, + ".markdown": process_markdown, + ".m4a": process_audio, + ".mp3": process_audio, + ".webm": process_audio, + ".mp4": process_audio, + ".mpga": process_audio, + ".wav": process_audio, + ".mpeg": process_audio, + ".pdf": process_pdf, + ".html": process_html, + ".pptx": process_powerpoint, + ".docx": process_docx, + ".odt": process_odt, + 
".epub": process_epub, + ".ipynb": process_ipnyb, +} + + + + +async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User): + if await file_already_exists(supabase_client, file, user): + return {"message": f"🤔 {file.filename} already exists.", "type": "warning"} + elif file.file._file.tell() < 1: + return {"message": f"❌ {file.filename} is empty.", "type": "error"} + else: + file_extension = os.path.splitext(file.filename)[-1].lower() # Convert file extension to lowercase + if file_extension in file_processors: + await file_processors[file_extension](file, enable_summarization, user) + return {"message": f"✅ {file.filename} has been uploaded.", "type": "success"} + else: + return {"message": f"❌ {file.filename} is not supported.", "type": "error"} + diff --git a/backend/utils.py b/backend/utils/vectors.py similarity index 71% rename from backend/utils.py rename to backend/utils/vectors.py index dca8a6055..4f00464f6 100644 --- a/backend/utils.py +++ b/backend/utils/vectors.py @@ -1,4 +1,3 @@ -import hashlib import os from typing import Annotated, List, Tuple @@ -9,7 +8,6 @@ from langchain.vectorstores import SupabaseVectorStore from llm.summarization import llm_summerize from logger import get_logger from pydantic import BaseModel - from supabase import Client, create_client logger = get_logger(__name__) @@ -27,17 +25,8 @@ summaries_vector_store = SupabaseVectorStore( supabase_client, embeddings, table_name="summaries") -def compute_sha1_from_file(file_path): - with open(file_path, "rb") as file: - bytes = file.read() - readable_hash = compute_sha1_from_content(bytes) - return readable_hash -def compute_sha1_from_content(content): - readable_hash = hashlib.sha1(content).hexdigest() - return readable_hash - def common_dependencies(): return { @@ -51,14 +40,6 @@ def common_dependencies(): CommonsDep = Annotated[dict, Depends(common_dependencies)] -class ChatMessage(BaseModel): - model: str = "gpt-3.5-turbo" - question: str - 
# A list of tuples where each tuple is (speaker, text) - history: List[Tuple[str, str]] - temperature: float = 0.0 - max_tokens: int = 256 - use_summarization: bool = False def create_summary(document_id, content, metadata): @@ -107,23 +88,5 @@ def similarity_search(query, table='match_summaries', top_k=5, threshold=0.5): ).execute() return summaries.data -def get_file_size(file: UploadFile): - # move the cursor to the end of the file - file.file._file.seek(0, 2) - file_size = file.file._file.tell() # Getting the size of the file - # move the cursor back to the beginning of the file - file.file.seek(0) - return file_size -def convert_bytes(bytes, precision=2): - """Converts bytes into a human-friendly format.""" - abbreviations = ['B', 'KB', 'MB'] - if bytes <= 0: - return '0 B' - size = bytes - index = 0 - while size >= 1024 and index < len(abbreviations) - 1: - size /= 1024 - index += 1 - return f'{size:.{precision}f} {abbreviations[index]}' diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 000000000..6da95e7ac --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,30 @@ +version: "3" + +services: + frontend: + env_file: + - ./frontend/.env + build: + context: frontend + dockerfile: Dockerfile.dev + container_name: web + restart: always + volumes: + - ./frontend/:/app + - /app/node_modules + - /app/.next + ports: + - 3000:3000 + backend: + env_file: + - ./backend/.env + build: + context: backend + dockerfile: Dockerfile + container_name: backend + restart: always + volumes: + - ./backend/:/code/ + - ~/.config/gcloud:/root/.config/gcloud + ports: + - 5050:5050 \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index c565f5105..59dca0b8a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,10 +9,6 @@ services: dockerfile: Dockerfile container_name: web restart: always - volumes: - - ./frontend/:/app - - /app/node_modules - - /app/.next ports: - 3000:3000 backend: diff --git 
a/frontend/Dockerfile b/frontend/Dockerfile index 91dc306af..9750eabc7 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -1,4 +1,5 @@ FROM node:18-alpine + # Install Python and essential build tools RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python RUN python3 -m ensurepip @@ -21,8 +22,11 @@ RUN yarn install # Copy the rest of our Next.js folder into /app COPY . . +# Build the Next.js application +RUN yarn build + # Ensure port 3000 is accessible to our system EXPOSE 3000 -# Run yarn dev, as we would via the command line -CMD ["yarn", "dev"] +# Run yarn start, as we would via the command line +CMD ["yarn", "start"] \ No newline at end of file diff --git a/frontend/Dockerfile.dev b/frontend/Dockerfile.dev new file mode 100644 index 000000000..91dc306af --- /dev/null +++ b/frontend/Dockerfile.dev @@ -0,0 +1,28 @@ +FROM node:18-alpine +# Install Python and essential build tools +RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python +RUN python3 -m ensurepip +RUN pip3 install --no-cache --upgrade pip setuptools + +# Create the directory on the node image +# where our Next.js app will live +RUN mkdir -p /app + +# Set /app as the working directory +WORKDIR /app + +# Copy package.json and yarn.lock +# to the /app working directory +COPY package*.json yarn.lock ./ + +# Install dependencies in /app +RUN yarn install + +# Copy the rest of our Next.js folder into /app +COPY . . + +# Ensure port 3000 is accessible to our system +EXPOSE 3000 + +# Run yarn dev, as we would via the command line +CMD ["yarn", "dev"]