mirror of
https://github.com/StanGirard/quivr.git
synced 2024-12-23 03:12:24 +03:00
Back/refacto files (#240)
* feat(docker): added docker for prod * feat(refacto): moved to modules
This commit is contained in:
parent
59c02228b6
commit
a3ca7ecb37
7
Makefile
Normal file
7
Makefile
Normal file
@ -0,0 +1,7 @@
|
||||
|
||||
|
||||
# Both targets run commands and produce no file of the same name; declaring
# them phony keeps a stray file called "dev" or "prod" from making make
# consider them up to date and skip the recipe.
.PHONY: dev prod

# Build and start the services described in docker-compose.dev.yml.
dev:
	docker compose -f docker-compose.dev.yml up --build

# Build and start the services described in docker-compose.yml.
prod:
	docker compose -f docker-compose.yml up --build
|
@ -110,11 +110,17 @@ cp .frontend_env.example frontend/.env
|
||||
- **Step 5**: Launch the app
|
||||
|
||||
```bash
|
||||
docker compose build && docker compose up
|
||||
docker compose -f docker-compose.yml up --build
|
||||
```
|
||||
|
||||
- **Step 6**: Navigate to `localhost:3000` in your browser
|
||||
|
||||
- **Step 7**: Want to contribute to the project? Launch the development stack instead:
|
||||
|
||||
```bash
|
||||
docker compose -f docker-compose.dev.yml up --build
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Contributors ✨
|
||||
|
@ -11,4 +11,4 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt --timeout 100
|
||||
|
||||
COPY . /code/
|
||||
|
||||
CMD ["uvicorn", "api:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]
|
||||
CMD ["uvicorn", "main:app", "--reload", "--host", "0.0.0.0", "--port", "5050"]
|
@ -1,10 +1,11 @@
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from auth_handler import decode_access_token
|
||||
from fastapi import HTTPException, Request
|
||||
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
||||
|
||||
from .auth_handler import decode_access_token
|
||||
|
||||
|
||||
class JWTBearer(HTTPBearer):
|
||||
def __init__(self, auto_error: bool = True):
|
@ -10,8 +10,8 @@ from langchain.llms import VertexAI
|
||||
from langchain.memory import ConversationBufferMemory
|
||||
from langchain.vectorstores import SupabaseVectorStore
|
||||
from llm import LANGUAGE_PROMPT
|
||||
from models.chats import ChatMessage
|
||||
from supabase import Client, create_client
|
||||
from utils import ChatMessage
|
||||
|
||||
|
||||
class CustomSupabaseVectorStore(SupabaseVectorStore):
|
||||
|
@ -4,51 +4,28 @@ import time
|
||||
from tempfile import SpooledTemporaryFile
|
||||
|
||||
import pypandoc
|
||||
from auth_bearer import JWTBearer
|
||||
from auth.auth_bearer import JWTBearer
|
||||
from crawl.crawler import CrawlWebsite
|
||||
from fastapi import Depends, FastAPI, UploadFile
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from llm.qa import get_qa_llm
|
||||
from llm.summarization import llm_evaluate_summaries
|
||||
from logger import get_logger
|
||||
from parsers.audio import process_audio
|
||||
from parsers.common import file_already_exists
|
||||
from parsers.csv import process_csv
|
||||
from parsers.docx import process_docx
|
||||
from parsers.epub import process_epub
|
||||
from parsers.html import process_html
|
||||
from parsers.markdown import process_markdown
|
||||
from parsers.notebook import process_ipnyb
|
||||
from parsers.odt import process_odt
|
||||
from parsers.pdf import process_pdf
|
||||
from parsers.powerpoint import process_powerpoint
|
||||
from parsers.txt import process_txt
|
||||
from middlewares.cors import add_cors_middleware
|
||||
from models.chats import ChatMessage
|
||||
from models.users import User
|
||||
from pydantic import BaseModel
|
||||
from supabase import Client
|
||||
from utils import (ChatMessage, CommonsDep, convert_bytes, create_user,
|
||||
get_file_size, similarity_search, update_user_request_count)
|
||||
from utils.file import convert_bytes, get_file_size
|
||||
from utils.processors import filter_file
|
||||
from utils.vectors import (CommonsDep, create_user, similarity_search,
|
||||
update_user_request_count)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
origins = [
|
||||
"http://localhost",
|
||||
"http://localhost:3000",
|
||||
"https://quivr.app",
|
||||
"https://www.quivr.app",
|
||||
"http://quivr.app",
|
||||
"http://www.quivr.app",
|
||||
"*"
|
||||
]
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=origins,
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
add_cors_middleware(app)
|
||||
|
||||
|
||||
|
||||
@ -57,45 +34,6 @@ async def startup_event():
|
||||
pypandoc.download_pandoc()
|
||||
|
||||
|
||||
file_processors = {
|
||||
".txt": process_txt,
|
||||
".csv": process_csv,
|
||||
".md": process_markdown,
|
||||
".markdown": process_markdown,
|
||||
".m4a": process_audio,
|
||||
".mp3": process_audio,
|
||||
".webm": process_audio,
|
||||
".mp4": process_audio,
|
||||
".mpga": process_audio,
|
||||
".wav": process_audio,
|
||||
".mpeg": process_audio,
|
||||
".pdf": process_pdf,
|
||||
".html": process_html,
|
||||
".pptx": process_powerpoint,
|
||||
".docx": process_docx,
|
||||
".odt": process_odt,
|
||||
".epub": process_epub,
|
||||
".ipynb": process_ipnyb,
|
||||
}
|
||||
|
||||
|
||||
class User (BaseModel):
|
||||
email: str
|
||||
|
||||
|
||||
async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
|
||||
if await file_already_exists(supabase_client, file, user):
|
||||
return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
|
||||
elif file.file._file.tell() < 1:
|
||||
return {"message": f"❌ {file.filename} is empty.", "type": "error"}
|
||||
else:
|
||||
file_extension = os.path.splitext(file.filename)[-1].lower() # Convert file extension to lowercase
|
||||
if file_extension in file_processors:
|
||||
await file_processors[file_extension](file, enable_summarization, user)
|
||||
return {"message": f"✅ {file.filename} has been uploaded.", "type": "success"}
|
||||
else:
|
||||
return {"message": f"❌ {file.filename} is not supported.", "type": "error"}
|
||||
|
||||
|
||||
|
||||
@app.post("/upload", dependencies=[Depends(JWTBearer())])
|
||||
@ -221,7 +159,7 @@ async def delete_endpoint(commons: CommonsDep, file_name: str, credentials: dict
|
||||
async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dict = Depends(JWTBearer()) ):
|
||||
user = User(email=credentials.get('email', 'none'))
|
||||
response = commons['supabase'].table("vectors").select(
|
||||
"metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
|
||||
"metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url", "content").match({"metadata->>file_name": file_name, "user_id": user.email}).execute()
|
||||
documents = response.data
|
||||
# Returns all documents with the same file name
|
||||
return {"documents": documents}
|
||||
@ -229,4 +167,4 @@ async def download_endpoint(commons: CommonsDep, file_name: str,credentials: dic
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
return {"message": "Hello World"}
|
||||
return {"status": "OK"}
|
21
backend/middlewares/cors.py
Normal file
21
backend/middlewares/cors.py
Normal file
@ -0,0 +1,21 @@
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
# Origins passed to CORSMiddleware as ``allow_origins``.
# NOTE(review): the final "*" entry is presumably a match-all wildcard, which
# would make the explicit entries redundant — confirm it is intended.
origins = [
    "http://localhost",
    "http://localhost:3000",
    "https://quivr.app",
    "https://www.quivr.app",
    "http://quivr.app",
    "http://www.quivr.app",
    "*",
]
|
||||
|
||||
|
||||
def add_cors_middleware(app):
    """Register ``CORSMiddleware`` on ``app``.

    The middleware is configured with the module-level ``origins`` list,
    with credentials allowed, and with ``"*"`` for both the permitted HTTP
    methods and the permitted request headers.
    """
    cors_options = {
        "allow_origins": origins,
        "allow_credentials": True,
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    app.add_middleware(CORSMiddleware, **cors_options)
|
13
backend/models/chats.py
Normal file
13
backend/models/chats.py
Normal file
@ -0,0 +1,13 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# Request payload for a chat question. Documented with comments rather than a
# class docstring so the generated model schema stays unchanged.
class ChatMessage(BaseModel):
    # Model identifier; defaults to "gpt-3.5-turbo".
    model: str = "gpt-3.5-turbo"
    # The question being asked (required).
    question: str
    # Conversation so far: a list of (speaker, text) tuples (required).
    history: List[Tuple[str, str]]
    # Sampling temperature; defaults to 0.0.
    temperature: float = 0.0
    # Token limit; defaults to 256.
    max_tokens: int = 256
    # Summarization toggle; off by default.
    use_summarization: bool = False
|
5
backend/models/users.py
Normal file
5
backend/models/users.py
Normal file
@ -0,0 +1,5 @@
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# A user record carrying a single required field, the e-mail address.
class User(BaseModel):
    email: str
|
@ -10,7 +10,8 @@ from langchain.document_loaders import TextLoader
|
||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from utils import compute_sha1_from_content, documents_vector_store
|
||||
from utils.file import compute_sha1_from_content
|
||||
from utils.vectors import documents_vector_store
|
||||
|
||||
# # Create a function to transcribe audio using Whisper
|
||||
# def _transcribe_audio(api_key, audio_file, stats_db):
|
||||
|
@ -8,8 +8,8 @@ from typing import Optional
|
||||
from fastapi import UploadFile
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from utils import (compute_sha1_from_content, compute_sha1_from_file,
|
||||
create_summary, create_vector, documents_vector_store)
|
||||
from utils.file import compute_sha1_from_content, compute_sha1_from_file
|
||||
from utils.vectors import create_summary, create_vector, documents_vector_store
|
||||
|
||||
|
||||
async def process_file(file: UploadFile, loader_class, file_suffix, enable_summarization, user):
|
||||
@ -52,6 +52,8 @@ async def process_file(file: UploadFile, loader_class, file_suffix, enable_summa
|
||||
doc_with_metadata = Document(
|
||||
page_content=doc.page_content, metadata=metadata)
|
||||
create_vector(user.email, doc_with_metadata)
|
||||
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
|
||||
|
||||
if enable_summarization and ids and len(ids) > 0:
|
||||
create_summary(ids[0], doc.page_content, metadata)
|
||||
return
|
||||
|
37
backend/utils/file.py
Normal file
37
backend/utils/file.py
Normal file
@ -0,0 +1,37 @@
|
||||
import hashlib
|
||||
|
||||
from fastapi import UploadFile
|
||||
|
||||
|
||||
def convert_bytes(bytes, precision=2, units=('B', 'KB', 'MB')):
    """Convert a byte count into a human-friendly string.

    Args:
        bytes: Number of bytes. The name shadows the builtin but is kept so
            existing keyword callers keep working.
        precision: Number of digits printed after the decimal point.
        units: Unit labels in ascending order, each 1024 times larger than
            the previous one. The default stops at ``'MB'``, so larger
            values are still expressed in megabytes; pass a longer sequence
            such as ``('B', 'KB', 'MB', 'GB', 'TB')`` to scale further.

    Returns:
        The scaled value followed by its unit, e.g. ``'1.50 KB'``. Zero and
        negative inputs yield ``'0 B'``.

    Raises:
        ValueError: If ``units`` is empty.
    """
    if not units:
        raise ValueError("units must contain at least one label")
    if bytes <= 0:
        return '0 B'

    size = bytes
    index = 0
    last_index = len(units) - 1
    # Stop dividing once the largest available unit is reached, even if the
    # value is still >= 1024.
    while size >= 1024 and index < last_index:
        size /= 1024
        index += 1
    return f'{size:.{precision}f} {units[index]}'
|
||||
|
||||
def get_file_size(file: UploadFile):
    """Return the size in bytes of an uploaded file.

    The size is measured by seeking to the end of the upload's underlying
    stream; the stream is then rewound, so it is left at position 0.

    Args:
        file: The upload; its ``file`` attribute must be a seekable stream.

    Returns:
        The length of the stream in bytes.
    """
    from os import SEEK_END  # local import: only needed by this function

    stream = file.file
    # Use the stream's public seek/tell instead of its private ``_file``
    # attribute, so any seekable file object works and the measurement uses
    # the same object as the rewind below.
    stream.seek(0, SEEK_END)
    file_size = stream.tell()
    # Rewind so later readers see the content from the beginning.
    stream.seek(0)
    return file_size
|
||||
|
||||
|
||||
def compute_sha1_from_file(file_path):
    """Return the hexadecimal SHA-1 digest of the file at ``file_path``.

    The file is hashed in fixed-size chunks so its whole content never has
    to be held in memory at once; the digest is the same as hashing the
    full content in one call.

    Args:
        file_path: Path of the file to hash; it is opened in binary mode.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha1()
    with open(file_path, "rb") as file:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: file.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def compute_sha1_from_content(content):
    """Return the hexadecimal SHA-1 digest of ``content``.

    ``content`` is handed directly to ``hashlib.sha1``.
    """
    return hashlib.sha1(content).hexdigest()
|
55
backend/utils/processors.py
Normal file
55
backend/utils/processors.py
Normal file
@ -0,0 +1,55 @@
|
||||
import os
|
||||
|
||||
from fastapi import Depends, FastAPI, UploadFile
|
||||
from models.users import User
|
||||
from parsers.audio import process_audio
|
||||
from parsers.common import file_already_exists
|
||||
from parsers.csv import process_csv
|
||||
from parsers.docx import process_docx
|
||||
from parsers.epub import process_epub
|
||||
from parsers.html import process_html
|
||||
from parsers.markdown import process_markdown
|
||||
from parsers.notebook import process_ipnyb
|
||||
from parsers.odt import process_odt
|
||||
from parsers.pdf import process_pdf
|
||||
from parsers.powerpoint import process_powerpoint
|
||||
from parsers.txt import process_txt
|
||||
from supabase import Client
|
||||
|
||||
# Extensions that are all handled by the audio processor.
_AUDIO_EXTENSIONS = (".m4a", ".mp3", ".webm", ".mp4", ".mpga", ".wav", ".mpeg")

# Maps a lowercase file extension (including the leading dot) to the
# coroutine that processes files of that type.
file_processors = {
    ".txt": process_txt,
    ".csv": process_csv,
    ".md": process_markdown,
    ".markdown": process_markdown,
    **{extension: process_audio for extension in _AUDIO_EXTENSIONS},
    ".pdf": process_pdf,
    ".html": process_html,
    ".pptx": process_powerpoint,
    ".docx": process_docx,
    ".odt": process_odt,
    ".epub": process_epub,
    ".ipynb": process_ipnyb,
}
|
||||
|
||||
|
||||
|
||||
|
||||
async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client, user: User):
    """Validate an uploaded file and hand it to the processor for its extension.

    Checks run in this order: already uploaded, empty, unsupported
    extension. Only a file that passes all three is processed.

    Returns:
        A dict with a user-facing ``message`` and a ``type`` that is one of
        ``"warning"``, ``"error"`` or ``"success"``.
    """
    if await file_already_exists(supabase_client, file, user):
        return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}

    # NOTE(review): tell() reports the current cursor position, not the file
    # length; this presumably relies on an earlier step having left the
    # cursor at the end of the stream — confirm against the callers.
    if file.file._file.tell() < 1:
        return {"message": f"❌ {file.filename} is empty.", "type": "error"}

    # Lowercase the extension so the lookup is case-insensitive.
    extension = os.path.splitext(file.filename)[-1].lower()
    processor = file_processors.get(extension)
    if processor is None:
        return {"message": f"❌ {file.filename} is not supported.", "type": "error"}

    await processor(file, enable_summarization, user)
    return {"message": f"✅ {file.filename} has been uploaded.", "type": "success"}
|
||||
|
@ -1,4 +1,3 @@
|
||||
import hashlib
|
||||
import os
|
||||
from typing import Annotated, List, Tuple
|
||||
|
||||
@ -9,7 +8,6 @@ from langchain.vectorstores import SupabaseVectorStore
|
||||
from llm.summarization import llm_summerize
|
||||
from logger import get_logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from supabase import Client, create_client
|
||||
|
||||
logger = get_logger(__name__)
|
||||
@ -27,17 +25,8 @@ summaries_vector_store = SupabaseVectorStore(
|
||||
supabase_client, embeddings, table_name="summaries")
|
||||
|
||||
|
||||
def compute_sha1_from_file(file_path):
|
||||
with open(file_path, "rb") as file:
|
||||
bytes = file.read()
|
||||
readable_hash = compute_sha1_from_content(bytes)
|
||||
return readable_hash
|
||||
|
||||
|
||||
def compute_sha1_from_content(content):
|
||||
readable_hash = hashlib.sha1(content).hexdigest()
|
||||
return readable_hash
|
||||
|
||||
|
||||
def common_dependencies():
|
||||
return {
|
||||
@ -51,14 +40,6 @@ def common_dependencies():
|
||||
CommonsDep = Annotated[dict, Depends(common_dependencies)]
|
||||
|
||||
|
||||
class ChatMessage(BaseModel):
|
||||
model: str = "gpt-3.5-turbo"
|
||||
question: str
|
||||
# A list of tuples where each tuple is (speaker, text)
|
||||
history: List[Tuple[str, str]]
|
||||
temperature: float = 0.0
|
||||
max_tokens: int = 256
|
||||
use_summarization: bool = False
|
||||
|
||||
|
||||
def create_summary(document_id, content, metadata):
|
||||
@ -107,23 +88,5 @@ def similarity_search(query, table='match_summaries', top_k=5, threshold=0.5):
|
||||
).execute()
|
||||
return summaries.data
|
||||
|
||||
def get_file_size(file: UploadFile):
|
||||
# move the cursor to the end of the file
|
||||
file.file._file.seek(0, 2)
|
||||
file_size = file.file._file.tell() # Getting the size of the file
|
||||
# move the cursor back to the beginning of the file
|
||||
file.file.seek(0)
|
||||
|
||||
return file_size
|
||||
|
||||
def convert_bytes(bytes, precision=2):
|
||||
"""Converts bytes into a human-friendly format."""
|
||||
abbreviations = ['B', 'KB', 'MB']
|
||||
if bytes <= 0:
|
||||
return '0 B'
|
||||
size = bytes
|
||||
index = 0
|
||||
while size >= 1024 and index < len(abbreviations) - 1:
|
||||
size /= 1024
|
||||
index += 1
|
||||
return f'{size:.{precision}f} {abbreviations[index]}'
|
30
docker-compose.dev.yml
Normal file
30
docker-compose.dev.yml
Normal file
@ -0,0 +1,30 @@
|
||||
version: "3"
|
||||
|
||||
services:
|
||||
frontend:
|
||||
env_file:
|
||||
- ./frontend/.env
|
||||
build:
|
||||
context: frontend
|
||||
dockerfile: Dockerfile.dev
|
||||
container_name: web
|
||||
restart: always
|
||||
volumes:
|
||||
- ./frontend/:/app
|
||||
- /app/node_modules
|
||||
- /app/.next
|
||||
ports:
|
||||
- 3000:3000
|
||||
backend:
|
||||
env_file:
|
||||
- ./backend/.env
|
||||
build:
|
||||
context: backend
|
||||
dockerfile: Dockerfile
|
||||
container_name: backend
|
||||
restart: always
|
||||
volumes:
|
||||
- ./backend/:/code/
|
||||
- ~/.config/gcloud:/root/.config/gcloud
|
||||
ports:
|
||||
- 5050:5050
|
@ -9,10 +9,6 @@ services:
|
||||
dockerfile: Dockerfile
|
||||
container_name: web
|
||||
restart: always
|
||||
volumes:
|
||||
- ./frontend/:/app
|
||||
- /app/node_modules
|
||||
- /app/.next
|
||||
ports:
|
||||
- 3000:3000
|
||||
backend:
|
||||
|
@ -1,4 +1,5 @@
|
||||
FROM node:18-alpine
|
||||
|
||||
# Install Python and essential build tools
|
||||
RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
|
||||
RUN python3 -m ensurepip
|
||||
@ -21,8 +22,11 @@ RUN yarn install
|
||||
# Copy the rest of our Next.js folder into /app
|
||||
COPY . .
|
||||
|
||||
# Build the Next.js application
|
||||
RUN yarn build
|
||||
|
||||
# Ensure port 3000 is accessible to our system
|
||||
EXPOSE 3000
|
||||
|
||||
# Run yarn dev, as we would via the command line
|
||||
CMD ["yarn", "dev"]
|
||||
# Run yarn start, as we would via the command line
|
||||
CMD ["yarn", "start"]
|
28
frontend/Dockerfile.dev
Normal file
28
frontend/Dockerfile.dev
Normal file
@ -0,0 +1,28 @@
|
||||
FROM node:18-alpine
|
||||
# Install Python and essential build tools
|
||||
RUN apk add --update --no-cache python3 make g++ && ln -sf python3 /usr/bin/python
|
||||
RUN python3 -m ensurepip
|
||||
RUN pip3 install --no-cache --upgrade pip setuptools
|
||||
|
||||
# Create the directory on the node image
|
||||
# where our Next.js app will live
|
||||
RUN mkdir -p /app
|
||||
|
||||
# Set /app as the working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy package.json and yarn.lock
|
||||
# to the /app working directory
|
||||
COPY package*.json yarn.lock ./
|
||||
|
||||
# Install dependencies in /app
|
||||
RUN yarn install
|
||||
|
||||
# Copy the rest of our Next.js folder into /app
|
||||
COPY . .
|
||||
|
||||
# Ensure port 3000 is accessible to our system
|
||||
EXPOSE 3000
|
||||
|
||||
# Run yarn dev, as we would via the command line
|
||||
CMD ["yarn", "dev"]
|
Loading…
Reference in New Issue
Block a user