feat(perf): improve performance of file embedding and search (#1182)

* feat(upload): changed to task

* feat(sha1): added column for better speed
This commit is contained in:
Stan Girard 2023-09-15 23:39:29 +02:00 committed by GitHub
parent cdf587cfde
commit 4d41901106
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 43 additions and 5 deletions

View File

@ -1,5 +1,7 @@
from celery import shared_task
from models.brains import Brain
from models.settings import get_supabase_db
from repository.files.upload_file import DocumentSerializable
from utils.vectors import Neurons
@ -12,6 +14,9 @@ def create_embedding_for_document(
doc = DocumentSerializable.from_json(doc_with_metadata)
created_vector = neurons.create_vector(doc, user_openai_api_key)
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
database = get_supabase_db()
database.set_file_sha_from_metadata(file_sha1)
created_vector_id = created_vector[0] # pyright: ignore reportPrivateUsage=none

View File

@ -172,7 +172,7 @@ class Brain(Repository):
vectorsResponse = (
self.db.table("vectors")
.select("id")
.filter("metadata->>file_sha1", "eq", file_sha1)
.filter("file_sha1", "eq", file_sha1)
.execute()
)
return vectorsResponse.data

View File

@ -9,7 +9,7 @@ class File(Repository):
response = (
self.db.table("vectors")
.select("id")
.filter("metadata->>file_sha1", "eq", file_sha1)
.filter("file_sha1", "eq", file_sha1)
.execute()
)
return response.data

View File

@ -23,7 +23,18 @@ class Vector(Repository):
response = (
self.db.table("vectors")
.select("id")
.filter("metadata->>file_sha1", "eq", file_sha1)
.filter("file_sha1", "eq", file_sha1)
.execute()
)
return response
def set_file_sha_from_metadata(self, file_sha1):
    """Copy a file's SHA-1 from the JSON metadata into the ``file_sha1`` column.

    Issues an update on the ``vectors`` table that sets the ``file_sha1``
    column to *file_sha1* for every row whose ``metadata->>file_sha1``
    equals *file_sha1*.

    NOTE(review): the request does not filter on the column being empty, so
    rows that already carry the value are rewritten with the same value.

    Args:
        file_sha1: SHA-1 value to look up in the metadata and to store in
            the column.

    Returns:
        The response of the executed update request (previously the
        response was bound to an unused local and discarded).
    """
    return (
        self.db.table("vectors")
        .update({"file_sha1": file_sha1})
        .match({"metadata->>file_sha1": file_sha1})
        .execute()
    )

View File

@ -0,0 +1,21 @@
-- Migration 202309157004032_add_sha1_column
-- Adds a dedicated file_sha1 column to public.vectors, backfills it from the
-- JSONB metadata, indexes it, and records the migration.
-- Every step is idempotent, so the script can be re-run safely.

-- Open the transaction that the trailing COMMIT closes, so the schema change,
-- the backfill and the bookkeeping row are applied atomically.
BEGIN;

DO $$
BEGIN
    -- Add and backfill the column only when it does not exist yet.
    IF NOT EXISTS (
        SELECT 1
        FROM information_schema.columns
        WHERE table_schema = 'public'
          AND table_name = 'vectors'
          AND column_name = 'file_sha1'
    ) THEN
        -- Add the file_sha1 column
        ALTER TABLE public.vectors ADD COLUMN file_sha1 TEXT;
        -- Populate file_sha1 using the metadata JSONB column
        UPDATE public.vectors SET file_sha1 = metadata->>'file_sha1';
    END IF;
END $$;

-- Equality lookups on file_sha1 need an index to avoid scanning the table.
CREATE INDEX IF NOT EXISTS vectors_file_sha1_idx ON public.vectors (file_sha1);

-- Update migrations table
INSERT INTO migrations (name)
SELECT '202309157004032_add_sha1_column'
WHERE NOT EXISTS (
    SELECT 1 FROM migrations WHERE name = '202309157004032_add_sha1_column'
);

COMMIT;

View File

@ -24,6 +24,7 @@ CREATE EXTENSION IF NOT EXISTS vector;
-- Table holding one row per embedded content chunk.
CREATE TABLE IF NOT EXISTS vectors (
    -- Primary key, generated with uuid_generate_v4() when not supplied.
    id        UUID DEFAULT uuid_generate_v4() PRIMARY KEY,
    -- Text content of the chunk.
    content   TEXT,
    -- File SHA-1 kept as a plain column so it can be filtered on directly
    -- instead of through metadata->>'file_sha1'.
    file_sha1 TEXT,
    -- Free-form JSON metadata attached to the chunk.
    metadata  JSONB,
    -- Embedding stored as a 1536-dimension VECTOR value.
    embedding VECTOR(1536)
);
@ -248,9 +249,9 @@ CREATE POLICY "Access Quivr Storage 1jccrwz_2" ON storage.objects FOR UPDATE TO
CREATE POLICY "Access Quivr Storage 1jccrwz_3" ON storage.objects FOR DELETE TO anon USING (bucket_id = 'quivr');
INSERT INTO migrations (name)
SELECT '20230913110420_add_storage_bucket'
SELECT '202309157004032_add_sha1_column'
WHERE NOT EXISTS (
SELECT 1 FROM migrations WHERE name = '20230913110420_add_storage_bucket'
SELECT 1 FROM migrations WHERE name = '202309157004032_add_sha1_column'
);