mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-15 09:32:22 +03:00
Fix/file upload explore (#412)
This commit is contained in:
parent
ed61880a38
commit
4d9bd512ec
@ -40,7 +40,6 @@ class Brain(BaseModel):
|
||||
def remaining_brain_size(self):
    """Return ``max_brain_size`` (converted to float) minus ``brain_size``."""
    capacity = float(self.max_brain_size)
    return capacity - self.brain_size
|
||||
|
||||
|
||||
@classmethod
|
||||
def create(cls, *args, **kwargs):
|
||||
commons = common_dependencies()
|
||||
@ -79,18 +78,17 @@ class Brain(BaseModel):
|
||||
self.id = response.data[0]['brain_id']
|
||||
return response.data
|
||||
|
||||
def create_brain_user(self, user_id: UUID, rights, default_brain):
    """Insert one row into ``brains_users`` linking this brain to a user.

    Args:
        user_id: Identifier of the user; stored as its string form.
        rights: Value written to the ``rights`` column.
        default_brain: Value written to the ``default_brain`` column.

    Returns:
        The ``data`` attribute of the executed insert response.
    """
    commons = common_dependencies()

    # Both ids are stringified before being sent to the table.
    row = {
        "brain_id": str(self.id),
        "user_id": str(user_id),
        "rights": rights,
        "default_brain": default_brain,
    }
    response = commons["supabase"].table("brains_users").insert(row).execute()

    return response.data
|
||||
|
||||
def create_brain_vector(self, vector_id, file_sha1):
    """Insert one row into ``brains_vectors`` linking this brain to a vector.

    Args:
        vector_id: Identifier of the vector; stored as its string form.
        file_sha1: Value written to the ``file_sha1`` column.

    Returns:
        The ``data`` attribute of the executed insert response.
    """
    row = {
        "brain_id": str(self.id),
        "vector_id": str(vector_id),
        "file_sha1": file_sha1,
    }
    response = (
        self.commons["supabase"]
        .table("brains_vectors")
        .insert(row)
        .execute()
    )
    return response.data
|
||||
@ -115,7 +113,7 @@ class Brain(BaseModel):
|
||||
# not used
|
||||
vector_ids = self.get_vector_ids_from_file_sha1(file_sha1)
|
||||
for vector_id in vector_ids:
|
||||
self.create_brain_vector(vector_id)
|
||||
self.create_brain_vector(vector_id, file_sha1)
|
||||
|
||||
def get_unique_brain_files(self):
|
||||
"""
|
||||
@ -142,15 +140,24 @@ class Brain(BaseModel):
|
||||
|
||||
return self.files
|
||||
|
||||
def get_unique_files_from_vector_ids(self, vectors_ids : List[int]):
|
||||
def get_unique_files_from_vector_ids(self, vectors_ids: List[int]):
|
||||
# Move into Vectors class
|
||||
"""
|
||||
Retrieve unique user data vectors.
|
||||
"""
|
||||
print('vectors_ids', vectors_ids)
|
||||
print('tuple(vectors_ids)', tuple(vectors_ids))
|
||||
if len(vectors_ids) == 1:
|
||||
vectors_response = self.commons['supabase'].table("vectors").select(
|
||||
"name:metadata->>file_name, size:metadata->>file_size", count="exact") \
|
||||
.filter("id", "eq", vectors_ids[0])\
|
||||
.execute()
|
||||
else:
|
||||
vectors_response = self.commons['supabase'].table("vectors").select(
|
||||
"name:metadata->>file_name, size:metadata->>file_size", count="exact") \
|
||||
.filter("id", "in", tuple(vectors_ids))\
|
||||
.execute()
|
||||
|
||||
documents = vectors_response.data # Access the data from the response
|
||||
# Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
|
||||
unique_files = [dict(t) for t in set(tuple(d.items()) for d in documents)]
|
||||
@ -187,6 +194,7 @@ def get_default_user_brain(user: User):
|
||||
.execute()
|
||||
)
|
||||
|
||||
print("Default brain response:", response.data)
|
||||
default_brain_id = response.data[0]["brain_id"] if response.data else None
|
||||
|
||||
print(f"Default brain id: {default_brain_id}")
|
||||
|
@ -6,24 +6,26 @@ from uuid import UUID
|
||||
from fastapi import UploadFile
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from logger import get_logger
|
||||
from models.brains import Brain
|
||||
from models.settings import CommonsDep, common_dependencies
|
||||
from pydantic import BaseModel
|
||||
from utils.file import compute_sha1_from_file
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class File(BaseModel):
|
||||
id: Optional[UUID] = None
|
||||
file: Optional[UploadFile]
|
||||
file_name: Optional[str] = ""
|
||||
file_size: Optional[int] = ""
|
||||
file_sha1: Optional[str] = ""
|
||||
vectors_ids: Optional[int]=[]
|
||||
vectors_ids: Optional[int] = []
|
||||
file_extension: Optional[str] = ""
|
||||
content: Optional[Any]= None
|
||||
content: Optional[Any] = None
|
||||
chunk_size: int = 500
|
||||
chunk_overlap: int= 0
|
||||
documents: Optional[Any]= None
|
||||
chunk_overlap: int = 0
|
||||
documents: Optional[Any] = None
|
||||
_commons: Optional[CommonsDep] = None
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
@ -56,7 +58,6 @@ class File(BaseModel):
|
||||
|
||||
print("documents", documents)
|
||||
|
||||
|
||||
os.remove(tmp_file.name)
|
||||
|
||||
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
||||
@ -68,6 +69,11 @@ class File(BaseModel):
|
||||
print(self.documents)
|
||||
|
||||
def set_file_vectors_ids(self):
|
||||
"""
|
||||
Set the vectors_ids property with the ids of the vectors
|
||||
that are associated with the file in the vectors table
|
||||
"""
|
||||
|
||||
commons = common_dependencies()
|
||||
response = (
|
||||
commons["supabase"].table("vectors")
|
||||
@ -78,24 +84,31 @@ class File(BaseModel):
|
||||
self.vectors_ids = response.data
|
||||
return
|
||||
|
||||
def file_already_exists(self):
    """
    Check if file already exists in vectors table

    Refreshes ``vectors_ids`` through ``set_file_vectors_ids`` and reports
    whether any entry was found.

    Returns:
        True when ``vectors_ids`` holds at least one entry, False when it is
        empty.
    """
    self.set_file_vectors_ids()

    print("file_sha1", self.file_sha1)
    print("vectors_ids", self.vectors_ids)
    print("len(vectors_ids)", len(self.vectors_ids))

    # If the file does not exist in vectors there is no need to go check in
    # brains_vectors; any entry at all means it exists.
    return len(self.vectors_ids) > 0
|
||||
|
||||
def file_already_exists_in_brain(self, brain_id):
|
||||
commons = common_dependencies()
|
||||
self.set_file_vectors_ids()
|
||||
# Check if file exists in that brain
|
||||
response = (
|
||||
commons["supabase"].table("brains_vectors")
|
||||
.select("brain_id, vector_id")
|
||||
.filter("brain_id", "eq", brain_id)
|
||||
.filter("vector_id", "eq", vector['id'])
|
||||
.filter("file_sha1", "eq", self.file_sha1)
|
||||
.execute()
|
||||
)
|
||||
print("response.data", response.data)
|
||||
@ -107,3 +120,9 @@ class File(BaseModel):
|
||||
def file_is_empty(self):
    """Return True when the underlying file object's current position is below 1."""
    # NOTE(review): reaches into the private ``_file`` attribute of the wrapped
    # file object; presumably a spooled temporary file -- confirm.
    position = self.file.file._file.tell()
    return position < 1
|
||||
|
||||
def link_file_to_brain(self, brain: Brain):
    """Register every vector of this file with ``brain``.

    Refreshes ``vectors_ids`` first, then calls ``brain.create_brain_vector``
    once per entry with the entry's ``'id'`` and this file's ``file_sha1``.
    """
    self.set_file_vectors_ids()

    for entry in self.vectors_ids:
        # Each entry is indexed by 'id' to obtain the vector identifier.
        brain.create_brain_vector(entry['id'], self.file_sha1)

    print(f"Successfully linked file {self.file_sha1} to brain {brain.id}")
|
||||
|
@ -5,24 +5,23 @@ from models.settings import common_dependencies
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class User(BaseModel):
|
||||
id: UUID
|
||||
email: str
|
||||
user_openai_api_key: str = None
|
||||
requests_count: int = 0
|
||||
user_openai_api_key: str = None
|
||||
|
||||
|
||||
# [TODO] Rename the user table and its references to 'user_usage'
|
||||
def create_user(self, date):
    """Insert a row for this user into the ``users`` table.

    Args:
        date: Value written to the ``date`` column.

    Returns:
        The executed insert response.
    """
    commons = common_dependencies()
    logger.info(f"New user entry in db document for user {self.email}")

    # A freshly created entry starts with a request count of 1.
    row = {"user_id": self.id, "email": self.email, "date": date, "requests_count": 1}
    return commons['supabase'].table("users").insert(row).execute()
|
||||
|
||||
|
||||
def get_user_request_stats(self):
|
||||
commons = common_dependencies()
|
||||
requests_stats = commons['supabase'].from_('users').select(
|
||||
@ -43,12 +42,11 @@ class User(BaseModel):
|
||||
|
||||
return userItem["requests_count"]
|
||||
|
||||
|
||||
def increment_user_request_count(self, date):
    """Increase this user's stored request count for ``date`` by one.

    Reads the current count through ``fetch_user_requests_count``, writes the
    incremented value to the ``users`` row matching this user id and ``date``,
    and mirrors it on ``self.requests_count``.

    Args:
        date: Date value used both to fetch the current count and to match
            the row being updated.
    """
    commons = common_dependencies()
    requests_count = self.fetch_user_requests_count(date) + 1
    logger.info(f"User {self.email} request count updated to {requests_count}")

    commons['supabase'].table("users").update(
        {"requests_count": requests_count}
    ).match({"user_id": self.id, "date": date}).execute()

    self.requests_count = requests_count
|
||||
|
@ -31,6 +31,7 @@ async def process_file(
|
||||
}
|
||||
doc_with_metadata = Document(
|
||||
page_content=doc.page_content, metadata=metadata)
|
||||
|
||||
neurons = Neurons(commons=commons)
|
||||
created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
|
||||
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
|
||||
@ -38,7 +39,7 @@ async def process_file(
|
||||
created_vector_id = created_vector[0]
|
||||
|
||||
brain = Brain(id=brain_id)
|
||||
brain.create_brain_vector(created_vector_id)
|
||||
brain.create_brain_vector(created_vector_id, file.file_sha1)
|
||||
|
||||
return
|
||||
|
||||
|
@ -44,21 +44,21 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, brain_
|
||||
doc_with_metadata = Document(
|
||||
page_content=doc.page_content, metadata=metadata)
|
||||
|
||||
file = File(file_sha1 = compute_sha1_from_content(doc.page_content.encode("utf-8")))
|
||||
file = File(file_sha1=compute_sha1_from_content(doc.page_content.encode("utf-8")))
|
||||
|
||||
exist = file.file_already_exists(brain_id)
|
||||
if not exist:
|
||||
file_exists = file.file_already_exists()
|
||||
|
||||
if not file_exists:
|
||||
print(f"Creating entry for file {file.file_sha1} in vectors...")
|
||||
neurons = Neurons(commons=commons)
|
||||
created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
|
||||
|
||||
created_vector_id = created_vector[0]
|
||||
|
||||
brain = Brain(id=brain_id)
|
||||
brain.create_brain_vector(created_vector_id)
|
||||
|
||||
print("Created vector sids ", created_vector)
|
||||
print("Created vector for ", doc.metadata["file_name"])
|
||||
# add created_vector x brains in db
|
||||
|
||||
file_exists_in_brain = file.file_already_exists_in_brain(brain_id)
|
||||
|
||||
if not file_exists_in_brain:
|
||||
file.add_file_to_brain(brain_id)
|
||||
brain = Brain(id=brain_id)
|
||||
file.link_file_to_brain(brain)
|
||||
return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}
|
||||
|
||||
|
@ -131,10 +131,13 @@ async def brain_endpoint(
|
||||
brain.create_brain()
|
||||
default_brain = get_default_user_brain(current_user)
|
||||
if default_brain:
|
||||
# create a brain X user entry
|
||||
logger.info(f"Default brain already exists for user {current_user.id}")
|
||||
brain.create_brain_user(user_id = current_user.id, rights="Owner", default_brain=False)
|
||||
else:
|
||||
logger.info(f"Default brain does not exist for user {current_user.id}. It will be created.")
|
||||
brain.create_brain_user(user_id = current_user.id, rights="Owner", default_brain=True)
|
||||
|
||||
|
||||
return {"id": brain.id, "name": brain.name}
|
||||
|
||||
# update existing brain
|
||||
|
@ -9,8 +9,8 @@ from models.users import User
|
||||
explore_router = APIRouter()
|
||||
|
||||
|
||||
@explore_router.get("/explore", dependencies=[Depends(AuthBearer())], tags=["Explore"])
|
||||
async def explore_endpoint(brain_id: UUID = Query(..., description="The ID of the brain"),current_user: User = Depends(get_current_user)):
|
||||
@explore_router.get("/explore/", dependencies=[Depends(AuthBearer())], tags=["Explore"])
|
||||
async def explore_endpoint(brain_id: UUID = Query(..., description="The ID of the brain"), current_user: User = Depends(get_current_user)):
|
||||
"""
|
||||
Retrieve and explore unique user data vectors.
|
||||
"""
|
||||
|
@ -1,4 +1,5 @@
|
||||
|
||||
from models.brains import Brain
|
||||
from models.files import File
|
||||
from models.settings import CommonsDep
|
||||
from parsers.audio import process_audio
|
||||
@ -35,20 +36,32 @@ file_processors = {
|
||||
}
|
||||
|
||||
|
||||
def create_response(message, type):
    """Build the response payload: a dict with ``message`` and ``type`` keys."""
    return dict(message=message, type=type)
|
||||
|
||||
|
||||
async def filter_file(commons: CommonsDep, file: File, enable_summarization: bool, brain_id, openai_api_key):
    """Decide how an uploaded file is handled for a brain and report the outcome.

    The checks run in this order:
      1. the file is already linked to the brain -> warning;
      2. the file is empty -> error;
      3. the file already exists in vectors -> link it to the brain, success;
      4. the extension has a registered processor -> process it, success
         (or error if the processor raises);
      5. otherwise -> unsupported, error.

    Args:
        commons: Shared dependencies forwarded to the file processor.
        file: The file being uploaded.
        enable_summarization: Forwarded to the file processor.
        brain_id: Identifier of the target brain.
        openai_api_key: Forwarded to the file processor.

    Returns:
        The dict produced by ``create_response`` (``message`` and ``type``).
    """
    await file.compute_file_sha1()

    print("file sha1", file.file_sha1)

    file_exists = file.file_already_exists()
    file_exists_in_brain = file.file_already_exists_in_brain(brain_id)

    if file_exists_in_brain:
        return create_response(f"🤔 {file.file.filename} already exists in brain {brain_id}.", "warning")
    elif file.file_is_empty():
        return create_response(f"❌ {file.file.filename} is empty.", "error")
    elif file_exists:
        # Vectors for this content already exist: only link them to the brain.
        file.link_file_to_brain(brain=Brain(id=brain_id))
        return create_response(f"✅ {file.file.filename} has been uploaded to brain {brain_id}.", "success")

    if file.file_extension in file_processors:
        try:
            await file_processors[file.file_extension](commons, file, enable_summarization, brain_id, openai_api_key)
            return create_response(f"✅ {file.file.filename} has been uploaded to brain {brain_id}.", "success")
        except Exception as e:
            # Add more specific exceptions as needed.
            print(f"Error processing file: {e}")
            return create_response(f"⚠️ An error occurred while processing {file.file.filename}.", "error")

    return create_response(f"❌ {file.file.filename} is not supported.", "error")
|
||||
|
@ -7,7 +7,6 @@ import { useBrainContext } from "@/lib/context/BrainProvider/hooks/useBrainConte
|
||||
import { useSupabase } from "@/lib/context/SupabaseProvider";
|
||||
import { useAxios, useToast } from "@/lib/hooks";
|
||||
import { useEventTracking } from "@/services/analytics/useEventTracking";
|
||||
import { useFeature } from "@growthbook/growthbook-react";
|
||||
import { UUID } from "crypto";
|
||||
|
||||
export const useFileUploader = () => {
|
||||
@ -17,11 +16,9 @@ export const useFileUploader = () => {
|
||||
const [files, setFiles] = useState<File[]>([]);
|
||||
const { session } = useSupabase();
|
||||
|
||||
const { currentBrain, createBrain } = useBrainContext();
|
||||
const { currentBrain } = useBrainContext();
|
||||
const { axiosInstance } = useAxios();
|
||||
|
||||
const shouldUseMultipleBrains = useFeature("multiple-brains").on;
|
||||
|
||||
if (session === null) {
|
||||
redirect("/login");
|
||||
}
|
||||
@ -90,17 +87,12 @@ export const useFileUploader = () => {
|
||||
if (currentBrain?.id !== undefined) {
|
||||
setFiles([]);
|
||||
await Promise.all(files.map((file) => upload(file, currentBrain?.id)));
|
||||
} else {
|
||||
publish({
|
||||
text: "Please, select or create a brain to upload a file",
|
||||
variant: "warning",
|
||||
});
|
||||
}
|
||||
console.log("Please select or create a brain to upload a file");
|
||||
|
||||
if (currentBrain?.id === undefined && shouldUseMultipleBrains !== true) {
|
||||
const createdBrainId = await createBrain("Default");
|
||||
createdBrainId
|
||||
? await Promise.all(files.map((file) => upload(file, createdBrainId)))
|
||||
: null;
|
||||
setFiles([]);
|
||||
}
|
||||
|
||||
setIsPending(false);
|
||||
};
|
||||
|
||||
|
17
scripts/20230629143400_add_file_sha1_brains_vectors.sql
Normal file
17
scripts/20230629143400_add_file_sha1_brains_vectors.sql
Normal file
@ -0,0 +1,17 @@
|
||||
BEGIN;

-- Add the file_sha1 column to brains_vectors unless it is already there.
ALTER TABLE IF EXISTS brains_vectors
    ADD COLUMN IF NOT EXISTS file_sha1 TEXT;

-- Backfill file_sha1 from the matching vectors row's metadata, touching only
-- rows whose file_sha1 is still NULL or empty.
UPDATE brains_vectors AS bv
SET file_sha1 = v.metadata->>'file_sha1'
FROM vectors AS v
WHERE bv.vector_id = v.id
  AND (bv.file_sha1 IS NULL OR bv.file_sha1 = '');

COMMIT;
|
Loading…
Reference in New Issue
Block a user