import os
import shutil
from tempfile import SpooledTemporaryFile

from auth.auth_bearer import JWTBearer
from crawl.crawler import CrawlWebsite
from fastapi import APIRouter, Depends, Request, UploadFile
from middlewares.cors import add_cors_middleware
from models.users import User
from parsers.github import process_github
from utils.file import convert_bytes
from utils.processors import filter_file
from utils.vectors import CommonsDep

crawl_router = APIRouter()

@crawl_router.post("/crawl/", dependencies=[Depends(JWTBearer())])
async def crawl_endpoint(
    request: Request,
    commons: CommonsDep,
    crawl_website: CrawlWebsite,
    enable_summarization: bool = False,
    credentials: dict = Depends(JWTBearer()),
):
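    """Crawl a website or GitHub repository and add its contents to the user's brain.

    The crawl is refused when the user's stored documents would exceed the
    configured maximum brain size.
    """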
    max_brain_size = os.getenv("MAX_BRAIN_SIZE")
    # Users who bring their own OpenAI key get a larger quota (200 MiB by default).
    if request.headers.get('Openai-Api-Key'):
        max_brain_size = os.getenv("MAX_BRAIN_SIZE_WITH_KEY", 209715200)

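    # Measure current usage: fetch the name and size of every vector the user has stored.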
    user = User(email=credentials.get('email', 'none'))
    user_vectors_response = (
        commons['supabase'].table("vectors")
        .select("name:metadata->>file_name, size:metadata->>file_size", count="exact")
        .filter("user_id", "eq", user.email)
        .execute()
    )
    documents = user_vectors_response.data  # Access the data from the response
    # Deduplicate: convert each dict to a tuple of items, collect the tuples in a
    # set to drop duplicates, then convert back to dicts.
    user_unique_vectors = [dict(t) for t in set(tuple(d.items()) for d in documents)]

    current_brain_size = sum(float(doc['size']) for doc in user_unique_vectors)

    # The size of the crawled content is unknown up front, so reserve a flat 1 MB.
    file_size = 1000000

    remaining_free_space = float(max_brain_size) - current_brain_size

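    # Refuse the crawl when the reserved file size does not fit in the remaining quota.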
    if remaining_free_space - file_size < 0:
        message = {
            "message": f"❌ User's brain will exceed maximum capacity with this upload. Maximum file allowed is: {convert_bytes(remaining_free_space)}",
            "type": "error",
        }
    else:
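        # GitHub URLs get a dedicated parser; everything else goes through the
        # generic crawl-and-filter pipeline.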
        if not crawl_website.checkGithub():
            file_path, file_name = crawl_website.process()

            # Create a SpooledTemporaryFile from the file at file_path
            spooled_file = SpooledTemporaryFile()
            with open(file_path, 'rb') as f:
                shutil.copyfileobj(f, spooled_file)
            spooled_file.seek(0)  # rewind so downstream readers see the copied content

            # Pass the SpooledTemporaryFile to UploadFile
            file = UploadFile(file=spooled_file, filename=file_name)
            message = await filter_file(file, enable_summarization, commons['supabase'], user=user)
            return message
        else:
            message = await process_github(crawl_website.url, "false", user=user, supabase=commons['supabase'])

    return message