Back/refacto files (#240)

* feat(docker): added docker for prod

* feat(refacto): moved to modules
Stan Girard 2023-06-03 23:12:42 +02:00 committed by GitHub
parent 59c02228b6
commit a3ca7ecb37
19 changed files with 230 additions and 123 deletions

Makefile (new file, +7 lines)

@@ -0,0 +1,7 @@
dev:
docker compose -f docker-compose.dev.yml up --build
prod:
docker compose -f docker-compose.yml up --build


@@ -110,11 +110,17 @@ cp .frontend_env.example frontend/.env
- **Step 5**: Launch the app
```bash
docker compose build && docker compose up
docker compose -f docker-compose.yml up --build
```
- **Step 6**: Navigate to `localhost:3000` in your browser
- **Step 7**: Want to contribute to the project?
```
docker compose -f docker-compose.dev.yml up --build
```
## Contributors ✨


@@ -11,4 +11,4 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt --timeout 100
COPY . /code/
CMD ["uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]
CMD ["uvicorn", "main:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]


@@ -1,10 +1,11 @@
import os
from typing import Optional
from auth_handler import decode_access_token
from fastapi import HTTPException, Request
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from .auth_handler import decode_access_token
class JWTBearer(HTTPBearer):
def __init__(self, auto_error: bool = True):


@@ -10,8 +10,8 @@ from langchain.llms import VertexAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import SupabaseVectorStore
from llm import LANGUAGE_PROMPT
from models.chats import ChatMessage
from supabase import Client, create_client
from utils import ChatMessage
class CustomSupabaseVectorStore(SupabaseVectorStore):


@@ -4,51 +4,28 @@ import time
from tempfile import SpooledTemporaryFile
import pypandoc
from auth_bearer import JWTBearer
from auth.auth_bearer import JWTBearer
from crawl.crawler import CrawlWebsite
from fastapi import Depends, FastAPI, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from llm.qa import get_qa_llm
from llm.summarization import llm_evaluate_summaries
from logger import get_logger
from parsers.audio import process_audio
from parsers.common import file_already_exists
from parsers.csv import process_csv
from parsers.docx import process_docx
from parsers.epub import process_epub
from parsers.html import process_html
from parsers.markdown import process_markdown
from parsers.notebook import process_ipnyb
from parsers.odt import process_odt
from parsers.pdf import process_pdf
from parsers.powerpoint import process_powerpoint
from parsers.txt import process_txt
from middlewares.cors import add_cors_middleware
from models.chats import ChatMessage
from models.users import User
from pydantic import BaseModel
from supabase import Client
from utils import (ChatMessage, CommonsDep, convert_bytes, create_user,
get_file_size, similarity_search, update_user_request_count)
from utils.file import convert_bytes, get_file_size
from utils.processors import filter_file
from utils.vectors import (CommonsDep, create_user, similarity_search,
update_user_request_count)
logger = get_logger(__name__)
app = FastAPI()
origins = [
"http://localhost",
"http://localhost:3000",
"https://quivr.app",
"https://www.quivr.app",
"http://quivr.app",
"http://www.quivr.app",
"*"
]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
add_cors_middleware(app)
@@ -57,45 +34,6 @@ async def startup_event():
pypandoc.download_pandoc()
file_processors = {
".txt": process_txt,
".csv": process_csv,
".md": process_markdown,
".markdown": process_markdown,
".m4a": process_audio,
".mp3": process_audio,
".webm": process_audio,
".mp4": process_audio,
".mpga": process_audio,
".wav": process_audio,
".mpeg": process_audio,
".pdf": process_pdf,
".html": process_html,
".pptx": process_powerpoint,
".docx": process_docx,
".odt": process_odt,
".epub": process_epub,
".ipynb": process_ipnyb,
}
class User (BaseModel):
email: str
async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
if await file_already_exists(supabase_client, file, user):
return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
elif file.file._file.tell() < 1:
return {"message": f"{file.filename} is empty.", "type": "error"}
else:
file_extension = os.path.splitext(file.filename)[-1].lower() # Convert file extension to lowercase
if file_extension in file_processors:
await file_processors[file_extension](file, enable_summarization, user)
return {"message": f"{file.filename} has been uploaded.", "type": "success"}
else:
return {"message": f"{file.filename} is not supported.", "type": "error"}
@app.post("/upload", dependencies=[Depends(JWTBearer())])
@@ -221,7 +159,7 @@ async def delete_endpoint(commons: CommonsDep, file_name: str, credentials: dict
async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dict = Depends(JWTBearer()) ):
user = User(email=credentials.get('email', 'none'))
response = commons['supabase'].table("vectors").select(
"metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
"metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url", "content").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
documents = response.data
# Returns all documents with the same file name
return {"documents": documents}
@@ -229,4 +167,4 @@ async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dic
@app.get("/")
async def root():
return {"message": "Hello World"}
return {"status": "OK"}


@@ -0,0 +1,21 @@
from fastapi.middleware.cors import CORSMiddleware
origins = [
"http://localhost",
"http://localhost:3000",
"https://quivr.app",
"https://www.quivr.app",
"http://quivr.app",
"http://www.quivr.app",
"*"
]
def add_cors_middleware(app):
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
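This new `middlewares.cors` helper extracts the CORS setup that previously lived inline in `main.py`; the app now registers it with a single call. A minimal sketch of the resulting wiring, mirroring the `add_cors_middleware(app)` call shown in the `main.py` diff above:

```python
# Minimal wiring sketch: mirrors the import and call now used in backend/main.py.
from fastapi import FastAPI

from middlewares.cors import add_cors_middleware

app = FastAPI()
add_cors_middleware(app)  # attaches CORSMiddleware with the shared `origins` list
```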

backend/models/chats.py (new file, +13 lines)

@@ -0,0 +1,13 @@
from typing import List, Tuple
from pydantic import BaseModel
class ChatMessage(BaseModel):
model: str = "gpt-3.5-turbo"
question: str
# A list of tuples where each tuple is (speaker, text)
history: List[Tuple[str, str]]
temperature: float = 0.0
max_tokens: int = 256
use_summarization: bool = False
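`ChatMessage` moves out of the old monolithic `utils` module into its own Pydantic model. A small illustrative sketch of constructing one; the question and history values are invented for the example:

```python
# Illustrative only: sample values are made up to show the fields and defaults.
from models.chats import ChatMessage

msg = ChatMessage(
    question="What does this repository do?",
    history=[("user", "Hi"), ("assistant", "Hello!")],  # (speaker, text) tuples
)
print(msg.model, msg.temperature, msg.max_tokens)  # gpt-3.5-turbo 0.0 256
```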

backend/models/users.py (new file, +5 lines)

@@ -0,0 +1,5 @@
from pydantic import BaseModel
class User (BaseModel):
email: str


@@ -10,7 +10,8 @@ from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import compute_sha1_from_content, documents_vector_store
from utils.file import compute_sha1_from_content
from utils.vectors import documents_vector_store
# # Create a function to transcribe audio using Whisper
# def _transcribe_audio(api_key, audio_file, stats_db):


@@ -8,8 +8,8 @@ from typing import Optional
from fastapi import UploadFile
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import (compute_sha1_from_content, compute_sha1_from_file,
create_summary, create_vector, documents_vector_store)
from utils.file import compute_sha1_from_content, compute_sha1_from_file
from utils.vectors import create_summary, create_vector, documents_vector_store
async def process_file(file: UploadFile, loader_class, file_suffix, enable_summarization, user):
@@ -52,6 +52,8 @@ async def process_file(file: UploadFile, loader_class, file_suffix, enable_summa
doc_with_metadata = Document(
page_content=doc.page_content, metadata=metadata)
create_vector(user.email, doc_with_metadata)
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
if enable_summarization and ids and len(ids) > 0:
create_summary(ids[0], doc.page_content, metadata)
return

backend/utils/file.py (new file, +37 lines)

@@ -0,0 +1,37 @@
import hashlib
from fastapi import UploadFile
def convert_bytes(bytes, precision=2):
"""Converts bytes into a human-friendly format."""
abbreviations = ['B', 'KB', 'MB']
if bytes <= 0:
return '0 B'
size = bytes
index = 0
while size >= 1024 and index < len(abbreviations) - 1:
size /= 1024
index += 1
return f'{size:.{precision}f} {abbreviations[index]}'
def get_file_size(file: UploadFile):
# move the cursor to the end of the file
file.file._file.seek(0, 2)
file_size = file.file._file.tell() # Getting the size of the file
# move the cursor back to the beginning of the file
file.file.seek(0)
return file_size
def compute_sha1_from_file(file_path):
with open(file_path, "rb") as file:
bytes = file.read()
readable_hash = compute_sha1_from_content(bytes)
return readable_hash
def compute_sha1_from_content(content):
readable_hash = hashlib.sha1(content).hexdigest()
return readable_hash
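A quick illustration of the helpers in the new `utils/file.py`; the byte counts are arbitrary example values:

```python
# Example values chosen only to demonstrate the helpers above.
from utils.file import compute_sha1_from_content, convert_bytes

print(convert_bytes(1536))                   # '1.50 KB'
print(convert_bytes(0))                      # '0 B'
print(compute_sha1_from_content(b"hello"))   # 40-character hex SHA-1 digest
```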


@@ -0,0 +1,55 @@
import os
from fastapi import Depends, FastAPI, UploadFile
from models.users import User
from parsers.audio import process_audio
from parsers.common import file_already_exists
from parsers.csv import process_csv
from parsers.docx import process_docx
from parsers.epub import process_epub
from parsers.html import process_html
from parsers.markdown import process_markdown
from parsers.notebook import process_ipnyb
from parsers.odt import process_odt
from parsers.pdf import process_pdf
from parsers.powerpoint import process_powerpoint
from parsers.txt import process_txt
from supabase import Client
file_processors = {
".txt": process_txt,
".csv": process_csv,
".md": process_markdown,
".markdown": process_markdown,
".m4a": process_audio,
".mp3": process_audio,
".webm": process_audio,
".mp4": process_audio,
".mpga": process_audio,
".wav": process_audio,
".mpeg": process_audio,
".pdf": process_pdf,
".html": process_html,
".pptx": process_powerpoint,
".docx": process_docx,
".odt": process_odt,
".epub": process_epub,
".ipynb": process_ipnyb,
}
async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
if await file_already_exists(supabase_client, file, user):
return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
elif file.file._file.tell() < 1:
return {"message": f"{file.filename} is empty.", "type": "error"}
else:
file_extension = os.path.splitext(file.filename)[-1].lower() # Convert file extension to lowercase
if file_extension in file_processors:
await file_processors[file_extension](file, enable_summarization, user)
return {"message": f"{file.filename} has been uploaded.", "type": "success"}
else:
return {"message": f"{file.filename} is not supported.", "type": "error"}
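Judging by the `from utils.processors import filter_file` import added to `main.py`, this new module is `utils/processors.py`, and the upload endpoint now delegates to `filter_file`. A hedged sketch of such a call; the function name and the `commons` plumbing are assumptions for illustration:

```python
# Hedged sketch: the handler name and the commons/client plumbing are assumptions.
from fastapi import UploadFile

from models.users import User
from utils.processors import filter_file
from utils.vectors import CommonsDep

async def upload_file(commons: CommonsDep, file: UploadFile, email: str):
    user = User(email=email)
    return await filter_file(
        file,
        enable_summarization=False,
        supabase_client=commons["supabase"],
        user=user,
    )  # e.g. {"message": "... has been uploaded.", "type": "success"}
```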


@@ -1,4 +1,3 @@
import hashlib
import os
from typing import Annotated, List, Tuple
@@ -9,7 +8,6 @@ from langchain.vectorstores import SupabaseVectorStore
from llm.summarization import llm_summerize
from logger import get_logger
from pydantic import BaseModel
from supabase import Client, create_client
logger = get_logger(__name__)
@@ -27,17 +25,8 @@ summaries_vector_store = SupabaseVectorStore(
supabase_client, embeddings, table_name="summaries")
def compute_sha1_from_file(file_path):
with open(file_path, "rb") as file:
bytes = file.read()
readable_hash = compute_sha1_from_content(bytes)
return readable_hash
def compute_sha1_from_content(content):
readable_hash = hashlib.sha1(content).hexdigest()
return readable_hash
def common_dependencies():
return {
@@ -51,14 +40,6 @@ def common_dependencies():
CommonsDep = Annotated[dict, Depends(common_dependencies)]
class ChatMessage(BaseModel):
model: str = "gpt-3.5-turbo"
question: str
# A list of tuples where each tuple is (speaker, text)
history: List[Tuple[str, str]]
temperature: float = 0.0
max_tokens: int = 256
use_summarization: bool = False
def create_summary(document_id, content, metadata):
@@ -107,23 +88,5 @@ def similarity_search(query, table='match_summaries', top_k=5, threshold=0.5):
).execute()
return summaries.data
def get_file_size(file: UploadFile):
# move the cursor to the end of the file
file.file._file.seek(0, 2)
file_size = file.file._file.tell() # Getting the size of the file
# move the cursor back to the beginning of the file
file.file.seek(0)
return file_size
def convert_bytes(bytes, precision=2):
"""Converts bytes into a human-friendly format."""
abbreviations = ['B', 'KB', 'MB']
if bytes <= 0:
return '0 B'
size = bytes
index = 0
while size >= 1024 and index < len(abbreviations) - 1:
size /= 1024
index += 1
return f'{size:.{precision}f} {abbreviations[index]}'

docker-compose.dev.yml (new file, +30 lines)

@@ -0,0 +1,30 @@
version: "3"
services:
frontend:
env_file:
- ./frontend/.env
build:
context: frontend
dockerfile: Dockerfile.dev
container_name: web
restart: always
volumes:
- ./frontend/:/app
- /app/node_modules
- /app/.next
ports:
- 3000:3000
backend:
env_file:
- ./backend/.env
build:
context: backend
dockerfile: Dockerfile
container_name: backend
restart: always
volumes:
- ./backend/:/code/
- ~/.config/gcloud:/root/.config/gcloud
ports:
- 5050:5050


@@ -9,10 +9,6 @@ services:
dockerfile: Dockerfile
container_name: web
restart: always
volumes:
- ./frontend/:/app
- /app/node_modules
- /app/.next
ports:
- 3000:3000
backend:


@@ -1,4 +1,5 @@
FROM node:18-alpine
# Install Python and essential build tools
RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
RUN python3 -m ensurepip
@@ -21,8 +22,11 @@ RUN yarn install
# Copy the rest of our Next.js folder into /app
COPY . .
# Build the Next.js application
RUN yarn build
# Ensure port 3000 is accessible to our system
EXPOSE 3000
# Run yarn dev, as we would via the command line
CMD ["yarn", "dev"]
# Run yarn start, as we would via the command line
CMD ["yarn", "start"]

frontend/Dockerfile.dev (new file, +28 lines)

@@ -0,0 +1,28 @@
FROM node:18-alpine
# Install Python and essential build tools
RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
RUN python3 -m ensurepip
RUN pip3 install --no-cache --upgrade pip setuptools
# Create the directory on the node image
# where our Next.js app will live
RUN mkdir -p /app
# Set /app as the working directory
WORKDIR /app
# Copy package.json and yarn.lock
# to the /app working directory
COPY package*.json yarn.lock ./
# Install dependencies in /app
RUN yarn install
# Copy the rest of our Next.js folder into /app
COPY . .
# Ensure port 3000 is accessible to our system
EXPOSE 3000
# Run yarn dev, as we would via the command line
CMD ["yarn", "dev"]