import os
import shutil
from tempfile import SpooledTemporaryFile
from auth.auth_bearer import AuthBearer, get_current_user
from crawl.crawler import CrawlWebsite
from fastapi import APIRouter, Depends, Request, UploadFile
from models.settings import CommonsDep, common_dependencies
from models.users import User
from parsers.github import process_github
from utils.file import convert_bytes
from utils.processors import filter_file

crawl_router = APIRouter()

def get_unique_user_data(commons, user):
"""
Retrieve unique user data vectors .
"""
    user_vectors_response = commons['supabase'].table("vectors").select(
        "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
        .filter("user_id", "eq", user.email) \
        .execute()
    documents = user_vectors_response.data  # Access the data from the response
    # Convert each dict to a tuple of items, use a set to drop duplicates, then convert back to dicts
    user_unique_vectors = [dict(t) for t in set(tuple(d.items()) for d in documents)]
    return user_unique_vectors
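
# Note (illustrative, not part of the route logic): the deduplication above works by
# round-tripping dict -> tuple(items) -> set, e.g. with hypothetical rows:
#     rows = [{"name": "a.pdf", "size": "10"}, {"name": "a.pdf", "size": "10"}]
#     [dict(t) for t in set(tuple(d.items()) for d in rows)]  # -> a single entry remains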

@crawl_router.post("/crawl/", dependencies=[Depends(AuthBearer())], tags=["Crawl"])
async def crawl_endpoint(request: Request, crawl_website: CrawlWebsite, enable_summarization: bool = False, current_user: User = Depends(get_current_user)):
"""
Crawl a website and process the crawled data .
"""
    commons = common_dependencies()
    max_brain_size = os.getenv("MAX_BRAIN_SIZE")
    if request.headers.get('Openai-Api-Key'):
        # 209715200 bytes == 200 MB: larger default limit when the user supplies an OpenAI key
        max_brain_size = os.getenv("MAX_BRAIN_SIZE_WITH_KEY", 209715200)

    user_unique_vectors = get_unique_user_data(commons, current_user)
    current_brain_size = sum(float(doc['size']) for doc in user_unique_vectors)

    # Rough per-crawl size estimate (in bytes) used for the capacity check below
    file_size = 1000000

    remaining_free_space = float(max_brain_size) - current_brain_size

    if remaining_free_space - file_size < 0:
        message = {"message": f"❌ User's brain will exceed maximum capacity with this upload. Maximum file allowed is: {convert_bytes(remaining_free_space)}", "type": "error"}
    else:
        if not crawl_website.checkGithub():
            file_path, file_name = crawl_website.process()

            # Create a SpooledTemporaryFile from the file_path
            spooled_file = SpooledTemporaryFile()
            with open(file_path, 'rb') as f:
                shutil.copyfileobj(f, spooled_file)
            spooled_file.seek(0)  # rewind so downstream readers start at the beginning

            # Pass the SpooledTemporaryFile to UploadFile
            file = UploadFile(file=spooled_file, filename=file_name)
            message = await filter_file(commons, file, enable_summarization, user=current_user, openai_api_key=request.headers.get('Openai-Api-Key', None))
            return message
        else:
            message = await process_github(commons, crawl_website.url, "false", user=current_user, supabase=commons['supabase'], user_openai_api_key=request.headers.get('Openai-Api-Key', None))

    # Return the result for the capacity-error and GitHub branches as well
    return message
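
# Hedged usage sketch (assumed wiring, not shown in this file): the router above is
# expected to be mounted on the FastAPI application elsewhere, e.g.:
#
#     from fastapi import FastAPI
#     app = FastAPI()
#     app.include_router(crawl_router)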