import os
import shutil
from tempfile import SpooledTemporaryFile
from typing import Annotated, List, Tuple

import pypandoc
from auth_bearer import JWTBearer
from crawl.crawler import CrawlWebsite
from fastapi import Depends, FastAPI, File, Header, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from llm.qa import get_qa_llm
from llm.summarization import llm_evaluate_summaries
from logger import get_logger
from parsers.audio import process_audio
from parsers.common import file_already_exists
from parsers.csv import process_csv
from parsers.docx import process_docx
from parsers.epub import process_epub
from parsers.html import process_html
from parsers.markdown import process_markdown
from parsers.notebook import process_ipnyb
from parsers.odt import process_odt
from parsers.pdf import process_pdf
from parsers.powerpoint import process_powerpoint
from parsers.txt import process_txt
from pydantic import BaseModel
from supabase import Client, create_client
from utils import ChatMessage, CommonsDep, similarity_search

logger = get_logger(__name__)

app = FastAPI()

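# NOTE: the "*" entry below allows requests from any origin; restrict this list outside local development.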
origins = [
    "http://localhost",
    "http://localhost:3000",
    "*",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.on_event("startup")
async def startup_event():
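    # pypandoc requires the pandoc binary at runtime; download it once when the app boots.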
    pypandoc.download_pandoc()


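# Maps a lowercased file extension to the parser that ingests files of that type.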
file_processors = {
    ".txt": process_txt,
    ".csv": process_csv,
    ".md": process_markdown,
    ".markdown": process_markdown,
    ".m4a": process_audio,
    ".mp3": process_audio,
    ".webm": process_audio,
    ".mp4": process_audio,
    ".mpga": process_audio,
    ".wav": process_audio,
    ".mpeg": process_audio,
    ".pdf": process_pdf,
    ".html": process_html,
    ".pptx": process_powerpoint,
    ".docx": process_docx,
    ".odt": process_odt,
    ".epub": process_epub,
    ".ipynb": process_ipnyb,
}


async def filter_file(file: UploadFile, enable_summarization: bool, supabase_client: Client):
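    """Check an upload for duplicates, emptiness, and supported type, then hand it to the matching parser."""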
    if await file_already_exists(supabase_client, file):
        return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}
    # The spooled file's cursor sits at the byte count after the upload is written, so < 1 means empty.
    elif file.file._file.tell() < 1:
        return {"message": f"❌ {file.filename} is empty.", "type": "error"}
    else:
        file_extension = os.path.splitext(file.filename)[-1].lower()  # Convert file extension to lowercase
        if file_extension in file_processors:
            await file_processors[file_extension](file, enable_summarization)
            return {"message": f"✅ {file.filename} has been uploaded.", "type": "success"}
        else:
            return {"message": f"❌ {file.filename} is not supported.", "type": "error"}


@app.post("/upload", dependencies=[Depends(JWTBearer())])
async def upload_file(commons: CommonsDep, file: UploadFile, enable_summarization: bool = False):
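    """Authenticated upload endpoint: runs the file through `filter_file` and returns its status message."""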
    message = await filter_file(file, enable_summarization, commons['supabase'])
    return message


@app.post("/chat/", dependencies=[Depends(JWTBearer())])
async def chat_endpoint(commons: CommonsDep, chat_message: ChatMessage):
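    """Answer a chat question with the QA chain, optionally enriching it with summary-matched documents."""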
    history = chat_message.history
    qa = get_qa_llm(chat_message)
    history.append(("user", chat_message.question))

    if chat_message.use_summarization:
        # 1. get summaries from the vector store based on question
        summaries = similarity_search(
            chat_message.question, table='match_summaries')
        # 2. evaluate summaries against the question
        evaluations = llm_evaluate_summaries(
            chat_message.question, summaries, chat_message.model)
        # 3. pull in the top documents from summaries
        logger.info('Evaluations: %s', evaluations)
        if evaluations:
            response = commons['supabase'].from_('documents').select(
                '*').in_('id', values=[e['document_id'] for e in evaluations]).execute()
            # 4. use top docs as additional context
            additional_context = '---\nAdditional Context={}'.format(
                '---\n'.join(data['content'] for data in response.data)
            ) + '\n'
            model_response = qa(
                {"question": additional_context + chat_message.question})
        else:
            # No summary matched the question; fall back to the plain question.
            model_response = qa({"question": chat_message.question})
    else:
        model_response = qa({"question": chat_message.question})

    history.append(("assistant", model_response["answer"]))

    return {"history": history}


@app.post("/crawl/", dependencies=[Depends(JWTBearer())])
async def crawl_endpoint(commons: CommonsDep, crawl_website: CrawlWebsite, enable_summarization: bool = False):
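    """Crawl a website to a local file, wrap it as an UploadFile, and reuse the upload ingestion path."""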
    file_path, file_name = crawl_website.process()

    # Create a SpooledTemporaryFile from the file_path
    spooled_file = SpooledTemporaryFile()
    with open(file_path, 'rb') as f:
        shutil.copyfileobj(f, spooled_file)

    # Pass the SpooledTemporaryFile to UploadFile
    file = UploadFile(file=spooled_file, filename=file_name)
    message = await filter_file(file, enable_summarization, commons['supabase'])
    return message


@app.get("/explore", dependencies=[Depends(JWTBearer())])
async def explore_endpoint(commons: CommonsDep):
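    """List stored documents by name and size, deduplicated and sorted by decreasing size."""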
    response = commons['supabase'].table("documents").select(
        "name:metadata->>file_name, size:metadata->>file_size", count="exact").execute()
    documents = response.data  # Access the data from the response
    # Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
    unique_data = [dict(t) for t in set(tuple(d.items()) for d in documents)]
    # Sort the list of documents by size in decreasing order
    unique_data.sort(key=lambda x: int(x['size']), reverse=True)

    return {"documents": unique_data}


@app.delete("/explore/{file_name}", dependencies=[Depends(JWTBearer())])
async def delete_endpoint(commons: CommonsDep, file_name: str):
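    """Delete a document by file name, removing its summary first to satisfy the foreign key constraint."""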
    # Cascade delete the summary from the database first, because it has a foreign key constraint
    commons['supabase'].table("summaries").delete().match(
        {"metadata->>file_name": file_name}).execute()
    commons['supabase'].table("documents").delete().match(
        {"metadata->>file_name": file_name}).execute()
    return {"message": f"{file_name} has been deleted."}


@app.get("/explore/{file_name}", dependencies=[Depends(JWTBearer())])
async def download_endpoint(commons: CommonsDep, file_name: str):
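    """Return stored metadata (name, size, extension, URL) for every document with the given file name."""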
    response = commons['supabase'].table("documents").select(
        "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url").match({"metadata->>file_name": file_name}).execute()
    documents = response.data
    # Returns all documents with the same file name
    return {"documents": documents}


@app.get("/")
async def root():
    return {"message": "Hello World"}