Fix/file upload explore (#412)

This commit is contained in:
Zineb El Bachiri 2023-06-29 18:26:03 +02:00 committed by GitHub
parent ed61880a38
commit 4d9bd512ec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 132 additions and 81 deletions

View File

@ -40,7 +40,6 @@ class Brain(BaseModel):
def remaining_brain_size(self):
    """Return the capacity still available in this brain (max size minus current usage)."""
    capacity = float(self.max_brain_size)
    used = self.brain_size
    return capacity - used
@classmethod
def create(cls, *args, **kwargs):
commons = common_dependencies()
@ -79,18 +78,17 @@ class Brain(BaseModel):
self.id = response.data[0]['brain_id']
return response.data
def create_brain_user(self, user_id : UUID, rights, default_brain):
def create_brain_user(self, user_id: UUID, rights, default_brain):
commons = common_dependencies()
response = commons["supabase"].table("brains_users").insert({"brain_id": str(self.id), "user_id":str( user_id), "rights": rights, "default_brain": default_brain}).execute()
response = commons["supabase"].table("brains_users").insert({"brain_id": str(self.id), "user_id": str(user_id), "rights": rights, "default_brain": default_brain}).execute()
return response.data
def create_brain_vector(self, vector_id):
def create_brain_vector(self, vector_id, file_sha1):
response = (
self.commons["supabase"]
.table("brains_vectors")
.insert({"brain_id": str(self.id), "vector_id": str(vector_id)})
.insert({"brain_id": str(self.id), "vector_id": str(vector_id), "file_sha1": file_sha1})
.execute()
)
return response.data
@ -115,7 +113,7 @@ class Brain(BaseModel):
# not used
vector_ids = self.get_vector_ids_from_file_sha1(file_sha1)
for vector_id in vector_ids:
self.create_brain_vector(vector_id)
self.create_brain_vector(vector_id, file_sha1)
def get_unique_brain_files(self):
"""
@ -142,15 +140,24 @@ class Brain(BaseModel):
return self.files
def get_unique_files_from_vector_ids(self, vectors_ids : List[int]):
def get_unique_files_from_vector_ids(self, vectors_ids: List[int]):
# Move into Vectors class
"""
Retrieve unique user data vectors.
"""
vectors_response = self.commons['supabase'].table("vectors").select(
"name:metadata->>file_name, size:metadata->>file_size", count="exact") \
print('vectors_ids', vectors_ids)
print('tuple(vectors_ids)', tuple(vectors_ids))
if len(vectors_ids) == 1:
vectors_response = self.commons['supabase'].table("vectors").select(
"name:metadata->>file_name, size:metadata->>file_size", count="exact") \
.filter("id", "eq", vectors_ids[0])\
.execute()
else:
vectors_response = self.commons['supabase'].table("vectors").select(
"name:metadata->>file_name, size:metadata->>file_size", count="exact") \
.filter("id", "in", tuple(vectors_ids))\
.execute()
documents = vectors_response.data # Access the data from the response
# Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
unique_files = [dict(t) for t in set(tuple(d.items()) for d in documents)]
@ -187,6 +194,7 @@ def get_default_user_brain(user: User):
.execute()
)
print("Default brain response:", response.data)
default_brain_id = response.data[0]["brain_id"] if response.data else None
print(f"Default brain id: {default_brain_id}")

View File

@ -6,24 +6,26 @@ from uuid import UUID
from fastapi import UploadFile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from logger import get_logger
from models.brains import Brain
from models.settings import CommonsDep, common_dependencies
from pydantic import BaseModel
from utils.file import compute_sha1_from_file
logger = get_logger(__name__)
class File(BaseModel):
id: Optional[UUID] = None
file: Optional[UploadFile]
file_name: Optional[str] = ""
file_size: Optional[int] = ""
file_sha1: Optional[str] = ""
vectors_ids: Optional[int]=[]
vectors_ids: Optional[int] = []
file_extension: Optional[str] = ""
content: Optional[Any]= None
content: Optional[Any] = None
chunk_size: int = 500
chunk_overlap: int= 0
documents: Optional[Any]= None
chunk_overlap: int = 0
documents: Optional[Any] = None
_commons: Optional[CommonsDep] = None
def __init__(self, **kwargs):
@ -56,7 +58,6 @@ class File(BaseModel):
print("documents", documents)
os.remove(tmp_file.name)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
@ -68,6 +69,11 @@ class File(BaseModel):
print(self.documents)
def set_file_vectors_ids(self):
"""
Set the vectors_ids property with the ids of the vectors
that are associated with the file in the vectors table
"""
commons = common_dependencies()
response = (
commons["supabase"].table("vectors")
@ -78,32 +84,45 @@ class File(BaseModel):
self.vectors_ids = response.data
return
def file_already_exists(self, brain_id):
commons = common_dependencies()
def file_already_exists(self):
"""
Check if file already exists in vectors table
"""
self.set_file_vectors_ids()
print("file_sha1", self.file_sha1)
print("vectors_ids", self.vectors_ids)
print("len(vectors_ids)", len(self.vectors_ids))
# if the file does not exist in vectors then no need to go check in brains_vectors
if len(self.vectors_ids) == 0:
return False
for vector in self.vectors_ids:
response = (
commons["supabase"].table("brains_vectors")
.select("brain_id, vector_id")
.filter("brain_id", "eq", brain_id)
.filter("vector_id", "eq", vector['id'])
.execute()
)
print("response.data", response.data)
if len(response.data) == 0:
return False
return True
def file_already_exists_in_brain(self, brain_id):
    """Return True when brains_vectors already links this file's sha1 to the given brain.

    Queries the brains_vectors table for a row matching both brain_id and
    this file's sha1; an empty result means the file is not in that brain.
    """
    commons = common_dependencies()
    # Populates self.vectors_ids as a side effect (mirrors file_already_exists).
    self.set_file_vectors_ids()
    matches = (
        commons["supabase"].table("brains_vectors")
        .select("brain_id, vector_id")
        .filter("brain_id", "eq", brain_id)
        .filter("file_sha1", "eq", self.file_sha1)
        .execute()
    )
    print("response.data", matches.data)
    # Non-empty result set == the file is already linked to this brain.
    return len(matches.data) != 0
def file_is_empty(self):
    """Return True when the uploaded file holds no bytes.

    NOTE(review): relies on the private `_file` attribute of the upload's
    SpooledTemporaryFile and on its position reflecting content size —
    confirm against the FastAPI UploadFile internals.
    """
    position = self.file.file._file.tell()
    return position < 1
def link_file_to_brain(self, brain: Brain):
    """Link every vector of this (already-ingested) file to the given brain.

    Creates one brains_vectors row per vector id, tagged with this file's
    sha1, so the brain can find the file without re-embedding it.
    """
    self.set_file_vectors_ids()
    for vector in self.vectors_ids:
        brain.create_brain_vector(vector['id'], self.file_sha1)
    print(f"Successfully linked file {self.file_sha1} to brain {brain.id}")

View File

@ -5,24 +5,23 @@ from models.settings import common_dependencies
from pydantic import BaseModel
logger = get_logger(__name__)
class User(BaseModel):
id: UUID
email: str
user_openai_api_key: str = None
requests_count: int = 0
user_openai_api_key: str = None
# [TODO] Rename the user table and its references to 'user_usage'
def create_user( self,date):
def create_user(self, date):
commons = common_dependencies()
logger.info(f"New user entry in db document for user {self.email}")
return(commons['supabase'].table("users").insert(
return (commons['supabase'].table("users").insert(
{"user_id": self.id, "email": self.email, "date": date, "requests_count": 1}).execute())
def get_user_request_stats(self):
commons = common_dependencies()
requests_stats = commons['supabase'].from_('users').select(
@ -43,12 +42,11 @@ class User(BaseModel):
return userItem["requests_count"]
def increment_user_request_count(self, date):
commons = common_dependencies()
requests_count = self.fetch_user_requests_count(date) + 1
logger.info(f"User {self.email} request count updated to {requests_count}")
commons['supabase'].table("users").update(
{ "requests_count": requests_count}).match({"user_id": self.id, "date": date}).execute()
{"requests_count": requests_count}).match({"user_id": self.id, "date": date}).execute()
self.requests_count = requests_count

View File

@ -31,6 +31,7 @@ async def process_file(
}
doc_with_metadata = Document(
page_content=doc.page_content, metadata=metadata)
neurons = Neurons(commons=commons)
created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
@ -38,7 +39,7 @@ async def process_file(
created_vector_id = created_vector[0]
brain = Brain(id=brain_id)
brain.create_brain_vector(created_vector_id)
brain.create_brain_vector(created_vector_id, file.file_sha1)
return

View File

@ -15,8 +15,8 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, brain_
random_dir_name = os.urandom(16).hex()
dateshort = time.strftime("%Y%m%d")
loader = GitLoader(
clone_url=repo,
repo_path="/tmp/" + random_dir_name,
clone_url=repo,
repo_path="/tmp/" + random_dir_name,
)
documents = loader.load()
os.system("rm -rf /tmp/" + random_dir_name)
@ -44,21 +44,21 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, brain_
doc_with_metadata = Document(
page_content=doc.page_content, metadata=metadata)
file = File(file_sha1 = compute_sha1_from_content(doc.page_content.encode("utf-8")))
file = File(file_sha1=compute_sha1_from_content(doc.page_content.encode("utf-8")))
exist = file.file_already_exists(brain_id)
if not exist:
file_exists = file.file_already_exists()
if not file_exists:
print(f"Creating entry for file {file.file_sha1} in vectors...")
neurons = Neurons(commons=commons)
created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
created_vector_id = created_vector[0]
brain = Brain(id=brain_id)
brain.create_brain_vector(created_vector_id)
print("Created vector sids ", created_vector)
print("Created vector for ", doc.metadata["file_name"])
# add created_vector x brains in db
file_exists_in_brain = file.file_already_exists_in_brain(brain_id)
if not file_exists_in_brain:
file.add_file_to_brain(brain_id)
brain = Brain(id=brain_id)
file.link_file_to_brain(brain)
return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}

View File

@ -131,10 +131,13 @@ async def brain_endpoint(
brain.create_brain()
default_brain = get_default_user_brain(current_user)
if default_brain:
# create a brain X user entry
logger.info(f"Default brain already exists for user {current_user.id}")
brain.create_brain_user(user_id = current_user.id, rights="Owner", default_brain=False)
else:
logger.info(f"Default brain does not exist for user {current_user.id}. It will be created.")
brain.create_brain_user(user_id = current_user.id, rights="Owner", default_brain=True)
return {"id": brain.id, "name": brain.name}
# update existing brain

View File

@ -9,8 +9,8 @@ from models.users import User
explore_router = APIRouter()
@explore_router.get("/explore", dependencies=[Depends(AuthBearer())], tags=["Explore"])
async def explore_endpoint(brain_id: UUID = Query(..., description="The ID of the brain"),current_user: User = Depends(get_current_user)):
@explore_router.get("/explore/", dependencies=[Depends(AuthBearer())], tags=["Explore"])
async def explore_endpoint(brain_id: UUID = Query(..., description="The ID of the brain"), current_user: User = Depends(get_current_user)):
"""
Retrieve and explore unique user data vectors.
"""

View File

@ -1,4 +1,5 @@
from models.brains import Brain
from models.files import File
from models.settings import CommonsDep
from parsers.audio import process_audio
@ -35,20 +36,32 @@ file_processors = {
}
def create_response(message, type):
    """Package a user-facing message and its category into the standard response dict.

    NOTE: the `type` parameter shadows the builtin; the name is kept for
    caller compatibility.
    """
    return dict(message=message, type=type)
async def filter_file(commons: CommonsDep, file: File, enable_summarization: bool, brain_id, openai_api_key):
await file.compute_file_sha1()
print("file sha1", file.file_sha1)
if file.file_already_exists( brain_id):
return {"message": f"🤔 {file.file.filename} already exists in brain {brain_id}.", "type": "warning"}
elif file.file_is_empty():
return {"message": f"{file.file.filename} is empty.", "type": "error"}
else:
if file.file_extension in file_processors:
await file_processors[file.file_extension](commons,file, enable_summarization, brain_id ,openai_api_key )
return {"message": f"{file.file.filename} has been uploaded to brain {brain_id}.", "type": "success"}
else:
return {"message": f"{file.file.filename} is not supported.", "type": "error"}
file_exists = file.file_already_exists()
file_exists_in_brain = file.file_already_exists_in_brain(brain_id)
if file_exists_in_brain:
return create_response(f"🤔 {file.file.filename} already exists in brain {brain_id}.", "warning")
elif file.file_is_empty():
return create_response(f"{file.file.filename} is empty.", "error")
elif file_exists:
file.link_file_to_brain(brain=Brain(id=brain_id))
return create_response(f"{file.file.filename} has been uploaded to brain {brain_id}.", "success")
if file.file_extension in file_processors:
try:
await file_processors[file.file_extension](commons, file, enable_summarization, brain_id, openai_api_key)
return create_response(f"{file.file.filename} has been uploaded to brain {brain_id}.", "success")
except Exception as e:
# Add more specific exceptions as needed.
print(f"Error processing file: {e}")
return create_response(f"⚠️ An error occurred while processing {file.file.filename}.", "error")
return create_response(f"{file.file.filename} is not supported.", "error")

View File

@ -7,7 +7,6 @@ import { useBrainContext } from "@/lib/context/BrainProvider/hooks/useBrainConte
import { useSupabase } from "@/lib/context/SupabaseProvider";
import { useAxios, useToast } from "@/lib/hooks";
import { useEventTracking } from "@/services/analytics/useEventTracking";
import { useFeature } from "@growthbook/growthbook-react";
import { UUID } from "crypto";
export const useFileUploader = () => {
@ -17,11 +16,9 @@ export const useFileUploader = () => {
const [files, setFiles] = useState<File[]>([]);
const { session } = useSupabase();
const { currentBrain, createBrain } = useBrainContext();
const { currentBrain } = useBrainContext();
const { axiosInstance } = useAxios();
const shouldUseMultipleBrains = useFeature("multiple-brains").on;
if (session === null) {
redirect("/login");
}
@ -90,17 +87,12 @@ export const useFileUploader = () => {
if (currentBrain?.id !== undefined) {
setFiles([]);
await Promise.all(files.map((file) => upload(file, currentBrain?.id)));
} else {
publish({
text: "Please, select or create a brain to upload a file",
variant: "warning",
});
}
console.log("Please select or create a brain to upload a file");
if (currentBrain?.id === undefined && shouldUseMultipleBrains !== true) {
const createdBrainId = await createBrain("Default");
createdBrainId
? await Promise.all(files.map((file) => upload(file, createdBrainId)))
: null;
setFiles([]);
}
setIsPending(false);
};

View File

@ -0,0 +1,17 @@
-- Migration: backfill brains_vectors.file_sha1 from vectors.metadata.
-- Runs in a single transaction so the schema change and the data
-- backfill either both apply or neither does.
BEGIN;
-- Add the file_sha1 column if it doesn't exist
-- (IF EXISTS / IF NOT EXISTS make the statement idempotent on re-run).
ALTER TABLE IF EXISTS brains_vectors
ADD COLUMN IF NOT EXISTS file_sha1 TEXT;
-- Update the file_sha1 column with values from vectors.metadata
-- (the JSON ->> operator extracts 'file_sha1' as text; rows whose
-- metadata lacks the key yield NULL).
UPDATE brains_vectors
SET file_sha1 = subquery.file_sha1
FROM (
    SELECT vectors.id, vectors.metadata->>'file_sha1' AS file_sha1
    FROM vectors
) AS subquery
WHERE brains_vectors.vector_id = subquery.id
-- Only touch rows not yet backfilled, so re-running is safe.
AND (brains_vectors.file_sha1 IS NULL OR brains_vectors.file_sha1 = '');
COMMIT;