mirror of
https://github.com/StanGirard/quivr.git
synced 2024-10-26 22:10:26 +03:00
feat(perf): increased perf embedding and search for files (#1182)
* feat(upload): changed to task * feat(sha1): added column for better speed
This commit is contained in:
parent
cdf587cfde
commit
4d41901106
@ -1,5 +1,7 @@
|
|||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
from models.brains import Brain
|
from models.brains import Brain
|
||||||
|
from models.settings import get_supabase_db
|
||||||
|
|
||||||
from repository.files.upload_file import DocumentSerializable
|
from repository.files.upload_file import DocumentSerializable
|
||||||
from utils.vectors import Neurons
|
from utils.vectors import Neurons
|
||||||
|
|
||||||
@ -12,6 +14,9 @@ def create_embedding_for_document(
|
|||||||
doc = DocumentSerializable.from_json(doc_with_metadata)
|
doc = DocumentSerializable.from_json(doc_with_metadata)
|
||||||
created_vector = neurons.create_vector(doc, user_openai_api_key)
|
created_vector = neurons.create_vector(doc, user_openai_api_key)
|
||||||
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
|
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
|
||||||
|
database = get_supabase_db()
|
||||||
|
database.set_file_sha_from_metadata(file_sha1)
|
||||||
|
|
||||||
|
|
||||||
created_vector_id = created_vector[0] # pyright: ignore reportPrivateUsage=none
|
created_vector_id = created_vector[0] # pyright: ignore reportPrivateUsage=none
|
||||||
|
|
||||||
|
@ -172,7 +172,7 @@ class Brain(Repository):
|
|||||||
vectorsResponse = (
|
vectorsResponse = (
|
||||||
self.db.table("vectors")
|
self.db.table("vectors")
|
||||||
.select("id")
|
.select("id")
|
||||||
.filter("metadata->>file_sha1", "eq", file_sha1)
|
.filter("file_sha1", "eq", file_sha1)
|
||||||
.execute()
|
.execute()
|
||||||
)
|
)
|
||||||
return vectorsResponse.data
|
return vectorsResponse.data
|
||||||
|
@ -9,7 +9,7 @@ class File(Repository):
|
|||||||
response = (
|
response = (
|
||||||
self.db.table("vectors")
|
self.db.table("vectors")
|
||||||
.select("id")
|
.select("id")
|
||||||
.filter("metadata->>file_sha1", "eq", file_sha1)
|
.filter("file_sha1", "eq", file_sha1)
|
||||||
.execute()
|
.execute()
|
||||||
)
|
)
|
||||||
return response.data
|
return response.data
|
||||||
|
@ -23,7 +23,18 @@ class Vector(Repository):
|
|||||||
response = (
|
response = (
|
||||||
self.db.table("vectors")
|
self.db.table("vectors")
|
||||||
.select("id")
|
.select("id")
|
||||||
.filter("metadata->>file_sha1", "eq", file_sha1)
|
.filter("file_sha1", "eq", file_sha1)
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
def set_file_sha_from_metadata(self, file_sha1):
|
||||||
|
# Set the file_sha1 column on every row of "vectors" whose metadata->>file_sha1 equals the given file_sha1
|
||||||
|
response = (
|
||||||
|
self.db.table("vectors")
|
||||||
|
.update({"file_sha1": file_sha1})
|
||||||
|
.match({"metadata->>file_sha1": file_sha1})
|
||||||
.execute()
|
.execute()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
21
scripts/202309157004032_add_sha1_column.sql
Normal file
21
scripts/202309157004032_add_sha1_column.sql
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
-- Migration 202309157004032_add_sha1_column
-- Adds a dedicated file_sha1 TEXT column to public.vectors, backfills it from
-- the metadata JSONB column, then records the migration by name.
-- Both steps are guarded, so re-running the script is a no-op.

-- Open the transaction that the final COMMIT closes, so the schema change and
-- the migrations bookkeeping row are applied together (a bare COMMIT with no
-- open transaction only produces a warning and leaves the steps autocommitted
-- independently).
BEGIN;

DO $$
BEGIN
    -- Check if file_sha1 column does not exist
    IF NOT EXISTS (
        SELECT 1
        FROM information_schema.columns
        WHERE table_schema = 'public'
          AND table_name = 'vectors'
          AND column_name = 'file_sha1'
    ) THEN
        -- Add the file_sha1 column
        ALTER TABLE public.vectors ADD COLUMN file_sha1 TEXT;

        -- Populate file_sha1 using metadata JSONB column
        UPDATE public.vectors SET file_sha1 = metadata->>'file_sha1';
    END IF;
END $$;

-- Update migrations table
INSERT INTO migrations (name)
SELECT '202309157004032_add_sha1_column'
WHERE NOT EXISTS (
    SELECT 1 FROM migrations WHERE name = '202309157004032_add_sha1_column'
);

COMMIT;
|
@ -24,6 +24,7 @@ CREATE EXTENSION IF NOT EXISTS vector;
|
|||||||
CREATE TABLE IF NOT EXISTS vectors (
|
CREATE TABLE IF NOT EXISTS vectors (
|
||||||
id UUID DEFAULT uuid_generate_v4() PRIMARY KEY,
|
id UUID DEFAULT uuid_generate_v4() PRIMARY KEY,
|
||||||
content TEXT,
|
content TEXT,
|
||||||
|
file_sha1 TEXT,
|
||||||
metadata JSONB,
|
metadata JSONB,
|
||||||
embedding VECTOR(1536)
|
embedding VECTOR(1536)
|
||||||
);
|
);
|
||||||
@ -248,9 +249,9 @@ CREATE POLICY "Access Quivr Storage 1jccrwz_2" ON storage.objects FOR UPDATE TO
|
|||||||
CREATE POLICY "Access Quivr Storage 1jccrwz_3" ON storage.objects FOR DELETE TO anon USING (bucket_id = 'quivr');
|
CREATE POLICY "Access Quivr Storage 1jccrwz_3" ON storage.objects FOR DELETE TO anon USING (bucket_id = 'quivr');
|
||||||
|
|
||||||
INSERT INTO migrations (name)
|
INSERT INTO migrations (name)
|
||||||
SELECT '20230913110420_add_storage_bucket'
|
SELECT '202309157004032_add_sha1_column'
|
||||||
WHERE NOT EXISTS (
|
WHERE NOT EXISTS (
|
||||||
SELECT 1 FROM migrations WHERE name = '20230913110420_add_storage_bucket'
|
SELECT 1 FROM migrations WHERE name = '202309157004032_add_sha1_column'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user