Back/refacto files (#240)

* feat(docker): added docker for prod

* feat(refacto): moved to modules
Stan Girard 2023-06-03 23:12:42 +02:00 committed by GitHub
parent 59c02228b6
commit a3ca7ecb37
19 changed files with 230 additions and 123 deletions

Makefile (new file, +7)

@@ -0,0 +1,7 @@
dev:
	docker compose -f docker-compose.dev.yml up --build

prod:
	docker compose -f docker-compose.yml up --build

README.md

@@ -110,11 +110,17 @@ cp .frontend_env.example frontend/.env
 - **Step 5**: Launch the app
 ```bash
-docker compose build && docker compose up
+docker compose -f docker-compose.yml up --build
 ```
 - **Step 6**: Navigate to `localhost:3000` in your browser
+- ** Step 7**: Want to contribute to the project?
+```
+docker compose -f docker-compose.dev.yml up --build
+```
 ## Contributors ✨

backend/Dockerfile

@@ -11,4 +11,4 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt --timeout 100
 COPY . /code/
-CMD ["uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]
+CMD ["uvicorn", "main:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]

backend/auth/auth_bearer.py

@@ -1,10 +1,11 @@
 import os
 from typing import Optional
-from auth_handler import decode_access_token
 from fastapi import HTTPException, Request
 from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
+from .auth_handler import decode_access_token
 class JWTBearer(HTTPBearer):
     def __init__(self, auto_error: bool = True):

backend/llm/qa.py

@@ -10,8 +10,8 @@ from langchain.llms import VertexAI
 from langchain.memory import ConversationBufferMemory
 from langchain.vectorstores import SupabaseVectorStore
 from llm import LANGUAGE_PROMPT
+from models.chats import ChatMessage
 from supabase import Client, create_client
-from utils import ChatMessage
 class CustomSupabaseVectorStore(SupabaseVectorStore):


@@ -4,51 +4,28 @@ import time
 from tempfile import SpooledTemporaryFile
 import pypandoc
-from auth_bearer import JWTBearer
+from auth.auth_bearer import JWTBearer
 from crawl.crawler import CrawlWebsite
 from fastapi import Depends, FastAPI, UploadFile
-from fastapi.middleware.cors import CORSMiddleware
 from llm.qa import get_qa_llm
 from llm.summarization import llm_evaluate_summaries
 from logger import get_logger
-from parsers.audio import process_audio
-from parsers.common import file_already_exists
-from parsers.csv import process_csv
-from parsers.docx import process_docx
-from parsers.epub import process_epub
-from parsers.html import process_html
-from parsers.markdown import process_markdown
-from parsers.notebook import process_ipnyb
-from parsers.odt import process_odt
-from parsers.pdf import process_pdf
-from parsers.powerpoint import process_powerpoint
-from parsers.txt import process_txt
+from middlewares.cors import add_cors_middleware
+from models.chats import ChatMessage
+from models.users import User
 from pydantic import BaseModel
 from supabase import Client
-from utils import (ChatMessage, CommonsDep, convert_bytes, create_user,
-                   get_file_size, similarity_search, update_user_request_count)
+from utils.file import convert_bytes, get_file_size
+from utils.processors import filter_file
+from utils.vectors import (CommonsDep, create_user, similarity_search,
+                           update_user_request_count)
 logger = get_logger(__name__)
 app = FastAPI()
-origins = [
-    "http://localhost",
-    "http://localhost:3000",
-    "https://quivr.app",
-    "https://www.quivr.app",
-    "http://quivr.app",
-    "http://www.quivr.app",
-    "*"
-]
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=origins,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+add_cors_middleware(app)
@@ -57,45 +34,6 @@ async def startup_event():
     pypandoc.download_pandoc()
-file_processors = {
-    ".txt": process_txt,
-    ".csv": process_csv,
-    ".md": process_markdown,
-    ".markdown": process_markdown,
-    ".m4a": process_audio,
-    ".mp3": process_audio,
-    ".webm": process_audio,
-    ".mp4": process_audio,
-    ".mpga": process_audio,
-    ".wav": process_audio,
-    ".mpeg": process_audio,
-    ".pdf": process_pdf,
-    ".html": process_html,
-    ".pptx": process_powerpoint,
-    ".docx": process_docx,
-    ".odt": process_odt,
-    ".epub": process_epub,
-    ".ipynb": process_ipnyb,
-}
-class User (BaseModel):
-    email: str
-async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
-    if await file_already_exists(supabase_client, file, user):
-        return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
-    elif file.file._file.tell() < 1:
-        return {"message": f"{file.filename} is empty.", "type": "error"}
-    else:
-        file_extension = os.path.splitext(file.filename)[-1].lower()  # Convert file extension to lowercase
-        if file_extension in file_processors:
-            await file_processors[file_extension](file, enable_summarization, user)
-            return {"message": f"{file.filename} has been uploaded.", "type": "success"}
-        else:
-            return {"message": f"{file.filename} is not supported.", "type": "error"}
 @app.post("/upload", dependencies=[Depends(JWTBearer())])
@@ -221,7 +159,7 @@ async def delete_endpoint(commons: CommonsDep, file_name: str, credentials: dict
 async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dict = Depends(JWTBearer()) ):
     user = User(email=credentials.get('email', 'none'))
     response = commons['supabase'].table("vectors").select(
-        "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
+        "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url", "content").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
     documents = response.data
     # Returns all documents with the same file name
     return {"documents": documents}
@@ -229,4 +167,4 @@ async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dic
 @app.get("/")
 async def root():
-    return {"message": "Hello World"}
+    return {"status": "OK"}

backend/middlewares/cors.py (new file, +21)

@@ -0,0 +1,21 @@
from fastapi.middleware.cors import CORSMiddleware

origins = [
    "http://localhost",
    "http://localhost:3000",
    "https://quivr.app",
    "https://www.quivr.app",
    "http://quivr.app",
    "http://www.quivr.app",
    "*"
]


def add_cors_middleware(app):
    app.add_middleware(
        CORSMiddleware,
        allow_origins=origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
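For orientation, this helper replaces the inline CORSMiddleware block removed from the API entrypoint above; a minimal sketch of the wiring, mirroring the `add_cors_middleware(app)` call shown in that diff:

```python
from fastapi import FastAPI

from middlewares.cors import add_cors_middleware

app = FastAPI()
add_cors_middleware(app)  # registers CORSMiddleware with the origins list defined above


@app.get("/")
async def root():
    # Same health-style response the refactored root endpoint now returns.
    return {"status": "OK"}
```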

backend/models/chats.py (new file, +13)

@@ -0,0 +1,13 @@
from typing import List, Tuple

from pydantic import BaseModel


class ChatMessage(BaseModel):
    model: str = "gpt-3.5-turbo"
    question: str
    # A list of tuples where each tuple is (speaker, text)
    history: List[Tuple[str, str]]
    temperature: float = 0.0
    max_tokens: int = 256
    use_summarization: bool = False
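With the request schema extracted into its own module, a caller or test can build the chat payload directly; a small usage sketch (the question and history values below are illustrative, not taken from the repository):

```python
from models.chats import ChatMessage

message = ChatMessage(
    question="What does this file say about deployment?",  # illustrative value
    history=[("user", "Hi"), ("assistant", "Hello! Ask me about your documents.")],
)

# The unset fields fall back to the defaults declared on the model.
print(message.model, message.temperature, message.max_tokens)  # gpt-3.5-turbo 0.0 256
```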

backend/models/users.py (new file, +5)

@@ -0,0 +1,5 @@
from pydantic import BaseModel


class User (BaseModel):
    email: str

backend/parsers/audio.py

@@ -10,7 +10,8 @@ from langchain.document_loaders import TextLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from utils import compute_sha1_from_content, documents_vector_store
+from utils.file import compute_sha1_from_content
+from utils.vectors import documents_vector_store
 # # Create a function to transcribe audio using Whisper
 # def _transcribe_audio(api_key, audio_file, stats_db):

backend/parsers/common.py

@@ -8,8 +8,8 @@ from typing import Optional
 from fastapi import UploadFile
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from utils import (compute_sha1_from_content, compute_sha1_from_file,
-                   create_summary, create_vector, documents_vector_store)
+from utils.file import compute_sha1_from_content, compute_sha1_from_file
+from utils.vectors import create_summary, create_vector, documents_vector_store
 async def process_file(file: UploadFile, loader_class, file_suffix, enable_summarization, user):
@@ -52,6 +52,8 @@ async def process_file(file: UploadFile, loader_class, file_suffix, enable_summa
         doc_with_metadata = Document(
             page_content=doc.page_content, metadata=metadata)
         create_vector(user.email, doc_with_metadata)
+        # add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
         if enable_summarization and ids and len(ids) > 0:
             create_summary(ids[0], doc.page_content, metadata)
     return

backend/utils/file.py (new file, +37)

@@ -0,0 +1,37 @@
import hashlib

from fastapi import UploadFile


def convert_bytes(bytes, precision=2):
    """Converts bytes into a human-friendly format."""
    abbreviations = ['B', 'KB', 'MB']
    if bytes <= 0:
        return '0 B'
    size = bytes
    index = 0
    while size >= 1024 and index < len(abbreviations) - 1:
        size /= 1024
        index += 1
    return f'{size:.{precision}f} {abbreviations[index]}'


def get_file_size(file: UploadFile):
    # move the cursor to the end of the file
    file.file._file.seek(0, 2)
    file_size = file.file._file.tell()  # Getting the size of the file
    # move the cursor back to the beginning of the file
    file.file.seek(0)
    return file_size


def compute_sha1_from_file(file_path):
    with open(file_path, "rb") as file:
        bytes = file.read()
        readable_hash = compute_sha1_from_content(bytes)
        return readable_hash


def compute_sha1_from_content(content):
    readable_hash = hashlib.sha1(content).hexdigest()
    return readable_hash
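For reference, the two pure helpers can be exercised on their own; a usage sketch assuming the backend package is on the import path (the inputs are made up):

```python
from utils.file import compute_sha1_from_content, convert_bytes

print(convert_bytes(512))            # 512.00 B
print(convert_bytes(2048))           # 2.00 KB
print(convert_bytes(5 * 1024 ** 2))  # 5.00 MB

# SHA-1 of the raw bytes, returned as a 40-character hex digest.
print(compute_sha1_from_content(b"hello"))  # aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d
```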

backend/utils/processors.py (new file, +55)

@@ -0,0 +1,55 @@
import os

from fastapi import Depends, FastAPI, UploadFile
from models.users import User
from parsers.audio import process_audio
from parsers.common import file_already_exists
from parsers.csv import process_csv
from parsers.docx import process_docx
from parsers.epub import process_epub
from parsers.html import process_html
from parsers.markdown import process_markdown
from parsers.notebook import process_ipnyb
from parsers.odt import process_odt
from parsers.pdf import process_pdf
from parsers.powerpoint import process_powerpoint
from parsers.txt import process_txt
from supabase import Client

file_processors = {
    ".txt": process_txt,
    ".csv": process_csv,
    ".md": process_markdown,
    ".markdown": process_markdown,
    ".m4a": process_audio,
    ".mp3": process_audio,
    ".webm": process_audio,
    ".mp4": process_audio,
    ".mpga": process_audio,
    ".wav": process_audio,
    ".mpeg": process_audio,
    ".pdf": process_pdf,
    ".html": process_html,
    ".pptx": process_powerpoint,
    ".docx": process_docx,
    ".odt": process_odt,
    ".epub": process_epub,
    ".ipynb": process_ipnyb,
}


async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
    if await file_already_exists(supabase_client, file, user):
        return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
    elif file.file._file.tell() < 1:
        return {"message": f"{file.filename} is empty.", "type": "error"}
    else:
        file_extension = os.path.splitext(file.filename)[-1].lower()  # Convert file extension to lowercase
        if file_extension in file_processors:
            await file_processors[file_extension](file, enable_summarization, user)
            return {"message": f"{file.filename} has been uploaded.", "type": "success"}
        else:
            return {"message": f"{file.filename} is not supported.", "type": "error"}

backend/utils/vectors.py

@@ -1,4 +1,3 @@
-import hashlib
 import os
 from typing import Annotated, List, Tuple
@@ -9,7 +8,6 @@ from langchain.vectorstores import SupabaseVectorStore
 from llm.summarization import llm_summerize
 from logger import get_logger
 from pydantic import BaseModel
 from supabase import Client, create_client
 logger = get_logger(__name__)
@@ -27,17 +25,8 @@ summaries_vector_store = SupabaseVectorStore(
     supabase_client, embeddings, table_name="summaries")
-def compute_sha1_from_file(file_path):
-    with open(file_path, "rb") as file:
-        bytes = file.read()
-        readable_hash = compute_sha1_from_content(bytes)
-        return readable_hash
-def compute_sha1_from_content(content):
-    readable_hash = hashlib.sha1(content).hexdigest()
-    return readable_hash
 def common_dependencies():
     return {
@@ -51,14 +40,6 @@ def common_dependencies():
 CommonsDep = Annotated[dict, Depends(common_dependencies)]
-class ChatMessage(BaseModel):
-    model: str = "gpt-3.5-turbo"
-    question: str
-    # A list of tuples where each tuple is (speaker, text)
-    history: List[Tuple[str, str]]
-    temperature: float = 0.0
-    max_tokens: int = 256
-    use_summarization: bool = False
 def create_summary(document_id, content, metadata):
@@ -107,23 +88,5 @@ def similarity_search(query, table='match_summaries', top_k=5, threshold=0.5):
     ).execute()
     return summaries.data
-def get_file_size(file: UploadFile):
-    # move the cursor to the end of the file
-    file.file._file.seek(0, 2)
-    file_size = file.file._file.tell()  # Getting the size of the file
-    # move the cursor back to the beginning of the file
-    file.file.seek(0)
-    return file_size
-def convert_bytes(bytes, precision=2):
-    """Converts bytes into a human-friendly format."""
-    abbreviations = ['B', 'KB', 'MB']
-    if bytes <= 0:
-        return '0 B'
-    size = bytes
-    index = 0
-    while size >= 1024 and index < len(abbreviations) - 1:
-        size /= 1024
-        index += 1
-    return f'{size:.{precision}f} {abbreviations[index]}'

docker-compose.dev.yml (new file, +30)

@@ -0,0 +1,30 @@
version: "3"
services:
  frontend:
    env_file:
      - ./frontend/.env
    build:
      context: frontend
      dockerfile: Dockerfile.dev
    container_name: web
    restart: always
    volumes:
      - ./frontend/:/app
      - /app/node_modules
      - /app/.next
    ports:
      - 3000:3000
  backend:
    env_file:
      - ./backend/.env
    build:
      context: backend
      dockerfile: Dockerfile
    container_name: backend
    restart: always
    volumes:
      - ./backend/:/code/
      - ~/.config/gcloud:/root/.config/gcloud
    ports:
      - 5050:5050

docker-compose.yml

@@ -9,10 +9,6 @@ services:
       dockerfile: Dockerfile
     container_name: web
     restart: always
-    volumes:
-      - ./frontend/:/app
-      - /app/node_modules
-      - /app/.next
     ports:
       - 3000:3000
   backend:

frontend/Dockerfile

@@ -1,4 +1,5 @@
 FROM node:18-alpine
 # Install Python and essential build tools
 RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
 RUN python3 -m ensurepip
@@ -21,8 +22,11 @@ RUN yarn install
 # Copy the rest of our Next.js folder into /app
 COPY . .
+# Build the Next.js application
+RUN yarn build
 # Ensure port 3000 is accessible to our system
 EXPOSE 3000
-# Run yarn dev, as we would via the command line
-CMD ["yarn", "dev"]
+# Run yarn start, as we would via the command line
+CMD ["yarn", "start"]

frontend/Dockerfile.dev (new file, +28)

@@ -0,0 +1,28 @@
FROM node:18-alpine
# Install Python and essential build tools
RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
RUN python3 -m ensurepip
RUN pip3 install --no-cache --upgrade pip setuptools
# Create the directory on the node image
# where our Next.js app will live
RUN mkdir -p /app
# Set /app as the working directory
WORKDIR /app
# Copy package.json and yarn.lock
# to the /app working directory
COPY package*.json yarn.lock ./
# Install dependencies in /app
RUN yarn install
# Copy the rest of our Next.js folder into /app
COPY . .
# Ensure port 3000 is accessible to our system
EXPOSE 3000
# Run yarn dev, as we would via the command line
CMD ["yarn", "dev"]