quivr/backend/api.py

from fastapi import FastAPI, UploadFile, File, HTTPException
import os
from pydantic import BaseModel
from typing import List, Tuple
from supabase import create_client, Client
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import SupabaseVectorStore
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI
from fastapi.openapi.utils import get_openapi
from tempfile import SpooledTemporaryFile
import shutil


from parsers.common import file_already_exists
from parsers.txt import process_txt
from parsers.csv import process_csv
from parsers.docx import process_docx
from parsers.pdf import process_pdf
from parsers.markdown import process_markdown
from parsers.powerpoint import process_powerpoint
from parsers.html import process_html
from parsers.audio import process_audio
from crawl.crawler import CrawlWebsite


from fastapi.middleware.cors import CORSMiddleware

# Application instance; the route handlers below are registered on it.
app = FastAPI()

# Browser origins allowed to call this API (all are localhost addresses).
origins = [
    "http://localhost",
    "http://localhost:3000",
    "http://localhost:8080",
]

# Enable CORS for the origins above, with credentials and any method/header.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Service configuration is read from the environment. os.environ.get returns
# None for an unset variable, so a missing value is not detected on these lines.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
openai_api_key = os.environ.get("OPENAI_API_KEY")
# Hard-coded empty string, so the "claude" branch in chat_endpoint is never taken.
anthropic_api_key = ""
# Shared clients, created once at import time and reused by every handler.
supabase: Client = create_client(supabase_url, supabase_key)
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
# Vector store backed by the Supabase "documents" table.
vector_store = SupabaseVectorStore(
    supabase, embeddings, table_name="documents")
# Single module-level memory object handed to every chain built in chat_endpoint.
# NOTE(review): this looks like it is shared by all requests — confirm intended.
memory = ConversationBufferMemory(
    memory_key="chat_history", return_messages=True)


class ChatMessage(BaseModel):
    """Request body accepted by the ``/chat/`` endpoint."""

    # Model name; chat_endpoint selects the LLM backend from its prefix.
    model: str = "gpt-3.5-turbo"
    # The new question to answer.
    question: str
    history: List[Tuple[str, str]]  # A list of tuples where each tuple is (speaker, text)
    # Sampling temperature passed to the LLM constructor.
    temperature: float = 0.0
    # Token limit passed to the LLM constructor.
    max_tokens: int = 256


# Extensions that are all routed to the audio parser.
_AUDIO_EXTENSIONS = (".m4a", ".mp3", ".webm", ".mp4", ".mpga", ".wav", ".mpeg")

# Maps a file extension (leading dot included) to the parser coroutine that
# ingests files of that type; filter_file looks extensions up here.
file_processors = {
    ".txt": process_txt,
    ".csv": process_csv,
    ".md": process_markdown,
    ".markdown": process_markdown,
    **dict.fromkeys(_AUDIO_EXTENSIONS, process_audio),
    ".pdf": process_pdf,
    ".html": process_html,
    ".pptx": process_powerpoint,
    ".docx": process_docx,
}

async def filter_file(file: UploadFile, supabase, vector_store, stats_db=None):
    """Validate an uploaded file and hand it to the parser for its extension.

    Args:
        file: The upload to ingest.
        supabase: Client passed to ``file_already_exists`` for the duplicate check.
        vector_store: Store passed to the selected parser.
        stats_db: Optional handle forwarded to the parser. Defaults to ``None``
            so callers that have no stats backend can omit it.

    Returns:
        A dict with a user-facing ``message`` and a ``type`` that is one of
        ``"warning"`` (duplicate), ``"error"`` (empty or unsupported file) or
        ``"success"``.
    """
    if await file_already_exists(supabase, file):
        return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}

    # Measure the real size through the public file object instead of reading
    # the cursor of the private ``_file`` attribute: the cursor only equals the
    # size when an earlier read happened to leave it at end-of-file. The cursor
    # is restored afterwards so the parser sees the stream exactly as before.
    stream = file.file
    position = stream.tell()
    stream.seek(0, os.SEEK_END)
    size = stream.tell()
    stream.seek(position)
    if size < 1:
        return {"message": f"❌ {file.filename} is empty.", "type": "error"}

    # Lower-case the extension so "REPORT.PDF" is routed like "report.pdf";
    # a missing filename simply yields no extension and is reported below.
    file_extension = os.path.splitext(file.filename or "")[-1].lower()
    processor = file_processors.get(file_extension)
    if processor is None:
        return {"message": f"❌ {file.filename} is not supported.", "type": "error"}

    # Forward the caller's stats_db rather than discarding it.
    await processor(vector_store, file, stats_db=stats_db)
    return {"message": f"✅ {file.filename} has been uploaded.", "type": "success"}

@app.post("/upload")
async def upload_file(file: UploadFile):
    """Ingest one uploaded file and return the status dict from ``filter_file``."""
    return await filter_file(file, supabase, vector_store, stats_db=None)

@app.post("/chat/")
async def chat_endpoint(chat_message: ChatMessage):
    """Answer a question with a retrieval chain over the vector store.

    The LLM backend is chosen from the prefix of ``chat_message.model``:
    ``gpt*`` uses OpenAI; ``claude*`` uses Anthropic when an Anthropic API key
    is configured.

    Args:
        chat_message: The question, prior history and sampling settings.

    Returns:
        ``{"history": [...]}`` — the submitted history with the new
        ``("user", question)`` and ``("assistant", answer)`` entries appended.

    Raises:
        HTTPException: 400 when no backend is available for the requested model.
    """
    model = chat_message.model
    retriever = vector_store.as_retriever()

    # NOTE(review): ``memory`` is a single module-level object, so every chain
    # built here shares the same buffer — confirm this is intended.
    if model.startswith("gpt"):
        llm = OpenAI(
            model_name=model,
            openai_api_key=openai_api_key,
            temperature=chat_message.temperature,
            max_tokens=chat_message.max_tokens,
        )
        qa = ConversationalRetrievalChain.from_llm(
            llm, retriever, memory=memory, verbose=True)
    elif anthropic_api_key and model.startswith("claude"):
        # Imported here because the module header does not import it; the
        # original referenced the name without any import.
        from langchain.chat_models import ChatAnthropic

        llm = ChatAnthropic(
            model=model,
            anthropic_api_key=anthropic_api_key,
            temperature=chat_message.temperature,
            max_tokens_to_sample=chat_message.max_tokens,
        )
        qa = ConversationalRetrievalChain.from_llm(
            llm, retriever, memory=memory, verbose=True, max_tokens_limit=102400)
    else:
        # Previously ``qa`` stayed None and calling it raised TypeError (HTTP 500).
        raise HTTPException(
            status_code=400, detail=f"Model '{model}' is not supported.")

    history = chat_message.history
    history.append(("user", chat_message.question))
    model_response = qa({"question": chat_message.question})
    history.append(("assistant", model_response["answer"]))

    return {"history": history}

@app.post("/crawl/")
async def crawl_endpoint(crawl_website: CrawlWebsite):
    """Crawl a website and ingest the resulting file like a regular upload.

    Args:
        crawl_website: Crawl request; its ``process()`` returns the path and
            name of the file it produced.

    Returns:
        ``{"message": <status dict from filter_file>}``.
    """
    file_path, file_name = crawl_website.process()

    # Copy the crawled file into a spooled temp file so it can be wrapped in an
    # UploadFile. The ``with`` block closes it once ingestion has been awaited;
    # the original never closed it.
    # NOTE(review): the copy's cursor is left at end-of-data, as before —
    # confirm the duplicate check and parsers rewind before reading.
    with SpooledTemporaryFile() as spooled_file:
        with open(file_path, 'rb') as f:
            shutil.copyfileobj(f, spooled_file)

        file = UploadFile(file=spooled_file, filename=file_name)
        # stats_db must be passed: filter_file declares it as a required
        # parameter, so omitting it raised TypeError on every crawl.
        message = await filter_file(file, supabase, vector_store, stats_db=None)

    print(message)
    return {"message": message}

@app.get("/explore")
async def explore_endpoint():
    """List the distinct (name, size) pairs stored in ``documents``, largest first.

    Returns:
        ``{"documents": [{"name": ..., "size": ...}, ...]}`` sorted by size in
        decreasing order. Rows without a size sort as size 0.
    """
    response = supabase.table("documents").select("name:metadata->>file_name, size:metadata->>file_size", count="exact").execute()
    documents = response.data or []  # tolerate a response without data

    # One row comes back per stored chunk, so the same (name, size) pair repeats.
    # dict.fromkeys removes the repeats while keeping first-seen order, which
    # (unlike a set) gives a deterministic order for entries of equal size.
    distinct_rows = dict.fromkeys(tuple(d.items()) for d in documents)
    unique_data = [dict(row) for row in distinct_rows]

    # ``size`` is None when a row has no file_size metadata; int(None) would
    # raise, so treat a missing size as 0.
    unique_data.sort(key=lambda doc: int(doc['size'] or 0), reverse=True)

    return {"documents": unique_data}

@app.delete("/explore/{file_name}")
async def delete_endpoint(file_name: str):
    """Delete the ``documents`` rows whose metadata file name equals *file_name*.

    The confirmation message is returned regardless of what the delete matched.
    """
    name_filter = {"metadata->>file_name": file_name}
    supabase.table("documents").delete().match(name_filter).execute()
    return {"message": f"{file_name} has been deleted."}

@app.get("/explore/{file_name}")
async def download_endpoint(file_name: str):
    """Return the stored file metadata for every row named *file_name*."""
    columns = "metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url"
    name_filter = {"metadata->>file_name": file_name}
    query = supabase.table("documents").select(columns).match(name_filter)
    # Several rows can share one file name; all of them are returned.
    return {"documents": query.execute().data}


@app.get("/")
async def root():
    """Return a fixed greeting payload."""
    greeting = "Hello World"
    return {"message": greeting}
New Webapp migration (#56) * feat(v2): loaders added * feature: Add scroll animations * feature: upload ui * feature: upload multiple files * fix: Same file name and size remove * feat(crawler): added * feat(parsers): v2 added more * feat(v2): audio now working * feat(v2): all loaders * feat(v2): explorer * chore: add links * feat(api): added status in return message * refactor(website): remove old code * feat(upload): return type for messages * feature: redirect to upload if ENV=local * fix(chat): fixed some issues * feature: respect response type * loading state * feature: Loading stat * feat(v2): added explore and chat pages * feature: modal settings * style: Chat UI * feature: scroll to bottom when chatting * feature: smooth scroll in chat * feature(anim): Slide chat in * feature: markdown chat * feat(explorer): list * feat(doc): added document item * feat(explore): added modal * Add clarification on Project API keys and web interface for migration scripts to Readme (#58) * fix(demo): changed link * add support to uploading zip file (#62) * Catch UnicodeEncodeError exception (#64) * feature: fixed chatbar * fix(loaders): missing argument * fix: layout * fix: One whole chatbox * fix: Scroll into view * fix(build): vercel issues * chore(streamlit): moved to own file * refactor(api): moved to backend folder * feat(docker): added docker compose * Fix a bug where langchain memories were not being cleaned (#71) * Update README.md (#70) * chore(streamlit): moved to own file * refactor(api): moved to backend folder * docs(readme): updated for new version * docs(readme): added old readme * docs(readme): update copy dot env file * docs(readme): cleanup --------- Co-authored-by: iMADi-ARCH <nandanaditya985@gmail.com> Co-authored-by: Matt LeBel <github@lebel.io> Co-authored-by: Evan Carlson <45178375+EvanCarlson@users.noreply.github.com> Co-authored-by: Mustafa Hasan Khan <65130881+mustafahasankhan@users.noreply.github.com> Co-authored-by: zhulixi 
<48713110+zlxxlz1026@users.noreply.github.com> Co-authored-by: Stanisław Tuszyński <stanislaw@tuszynski.me> 2023-05-21 02:20:55 +03:00			`from fastapi import FastAPI, UploadFile, File, HTTPException`
			`import os`
			`from pydantic import BaseModel`
			`from typing import List, Tuple`
			`from supabase import create_client, Client`
			`from langchain.embeddings.openai import OpenAIEmbeddings`
			`from langchain.memory import ConversationBufferMemory`
			`from langchain.vectorstores import SupabaseVectorStore`
			`from langchain.chains import ConversationalRetrievalChain`
			`from langchain.llms import OpenAI`
			`from fastapi.openapi.utils import get_openapi`
			`from tempfile import SpooledTemporaryFile`
			`import shutil`


			`from parsers.common import file_already_exists`
			`from parsers.txt import process_txt`
			`from parsers.csv import process_csv`
			`from parsers.docx import process_docx`
			`from parsers.pdf import process_pdf`
			`from parsers.markdown import process_markdown`
			`from parsers.powerpoint import process_powerpoint`
			`from parsers.html import process_html`
			`from parsers.audio import process_audio`
			`from crawl.crawler import CrawlWebsite`


			`from fastapi.middleware.cors import CORSMiddleware`

			`app = FastAPI()`

			`origins = [`
			`"http://localhost",`
			`"http://localhost:3000",`
			`"http://localhost:8080",`
			`]`

			`app.add_middleware(`
			`CORSMiddleware,`
			`allow_origins=origins,`
			`allow_credentials=True,`
			`allow_methods=["*"],`
			`allow_headers=["*"],`
			`)`

			`supabase_url = os.environ.get("SUPABASE_URL")`
			`supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")`
			`openai_api_key = os.environ.get("OPENAI_API_KEY")`
			`anthropic_api_key = ""`
			`supabase: Client = create_client(supabase_url, supabase_key)`
			`embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)`
			`vector_store = SupabaseVectorStore(`
			`supabase, embeddings, table_name="documents")`
			`memory = ConversationBufferMemory(`
			`memory_key="chat_history", return_messages=True)`


			`class ChatMessage(BaseModel):`
			`model: str = "gpt-3.5-turbo"`
			`question: str`
			`history: List[Tuple[str, str]] # A list of tuples where each tuple is (speaker, text)`
			`temperature: float = 0.0`
			`max_tokens: int = 256`






			`file_processors = {`
			`".txt": process_txt,`
			`".csv": process_csv,`
			`".md": process_markdown,`
			`".markdown": process_markdown,`
			`".m4a": process_audio,`
			`".mp3": process_audio,`
			`".webm": process_audio,`
			`".mp4": process_audio,`
			`".mpga": process_audio,`
			`".wav": process_audio,`
			`".mpeg": process_audio,`
			`".pdf": process_pdf,`
			`".html": process_html,`
			`".pptx": process_powerpoint,`
			`".docx": process_docx`
			`}`

			`async def filter_file(file: UploadFile, supabase, vector_store, stats_db):`
			`if await file_already_exists(supabase, file):`
			`return {"message": f"🤔 {file.filename} already exists.", "type": "warning"}`
			`elif file.file._file.tell() < 1:`
			`return {"message": f"❌ {file.filename} is empty.", "type": "error"}`
			`else:`
			`file_extension = os.path.splitext(file.filename)[-1]`
			`if file_extension in file_processors:`
			`await file_processors[file_extension](vector_store, file, stats_db=None)`
			`return {"message": f"✅ {file.filename} has been uploaded.", "type": "success"}`
			`else:`
			`return {"message": f"❌ {file.filename} is not supported.", "type": "error"}`

			`@app.post("/upload")`
			`async def upload_file(file: UploadFile):`
			`message = await filter_file(file, supabase, vector_store, stats_db=None)`
			`return message`

			`@app.post("/chat/")`
			`async def chat_endpoint(chat_message: ChatMessage):`
			`history = chat_message.history`
			`# Logic from your Streamlit app goes here. For example:`
			`qa = None`
			`if chat_message.model.startswith("gpt"):`
			`qa = ConversationalRetrievalChain.from_llm(`
			`OpenAI(`
			`model_name=chat_message.model, openai_api_key=openai_api_key, temperature=chat_message.temperature, max_tokens=chat_message.max_tokens), vector_store.as_retriever(), memory=memory, verbose=True)`
			`elif anthropic_api_key and model.startswith("claude"):`
			`qa = ConversationalRetrievalChain.from_llm(`
			`ChatAnthropic(`
			`model=model, anthropic_api_key=anthropic_api_key, temperature=temperature, max_tokens_to_sample=max_tokens), vector_store.as_retriever(), memory=memory, verbose=True, max_tokens_limit=102400)`

			`history.append(("user", chat_message.question))`
			`model_response = qa({"question": chat_message.question})`
			`history.append(("assistant", model_response["answer"]))`

			`return {"history": history}`

			`@app.post("/crawl/")`
			`async def crawl_endpoint(crawl_website: CrawlWebsite):`

			`file_path, file_name = crawl_website.process()`

			`# Create a SpooledTemporaryFile from the file_path`
			`spooled_file = SpooledTemporaryFile()`
			`with open(file_path, 'rb') as f:`
			`shutil.copyfileobj(f, spooled_file)`

			`# Pass the SpooledTemporaryFile to UploadFile`
			`file = UploadFile(file=spooled_file, filename=file_name)`
			`message = await filter_file(file, supabase, vector_store)`
			`print(message)`
			`return {"message": message}`

			`@app.get("/explore")`
			`async def explore_endpoint():`
			`response = supabase.table("documents").select("name:metadata->>file_name, size:metadata->>file_size", count="exact").execute()`
			`documents = response.data # Access the data from the response`
			`# Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary`
			`unique_data = [dict(t) for t in set(tuple(d.items()) for d in documents)]`
			`# Sort the list of documents by size in decreasing order`
			`unique_data.sort(key=lambda x: int(x['size']), reverse=True)`

			`return {"documents": unique_data}`

			`@app.delete("/explore/{file_name}")`
			`async def delete_endpoint(file_name: str):`
			`response = supabase.table("documents").delete().match({"metadata->>file_name": file_name}).execute()`
			`return {"message": f"{file_name} has been deleted."}`

			`@app.get("/explore/{file_name}")`
			`async def download_endpoint(file_name: str):`
			`response = supabase.table("documents").select("metadata->>file_name, metadata->>file_size, metadata->>file_extension, metadata->>file_url").match({"metadata->>file_name": file_name}).execute()`
			`documents = response.data`
			`### Returns all documents with the same file name`
			`return {"documents": documents}`



			`@app.get("/")`
			`async def root():`
			`return {"message": "Hello World"}`