mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-12-14 17:03:29 +03:00
Fix: change vector id to UUID (#609)
* fix: document upload * feat: explore fix to use uuid id * chore: remove prints * fix: tables.sql
This commit is contained in:
parent
f9a04ffbe2
commit
cef45ea712
@ -18,7 +18,7 @@ class Brain(BaseModel):
|
||||
model: Optional[str] = "gpt-3.5-turbo-0613"
|
||||
temperature: Optional[float] = 0.0
|
||||
max_tokens: Optional[int] = 256
|
||||
max_brain_size: Optional[int] = int(os.getenv("MAX_BRAIN_SIZE",52428800))
|
||||
max_brain_size: Optional[int] = int(os.getenv("MAX_BRAIN_SIZE", 52428800))
|
||||
files: List[Any] = []
|
||||
|
||||
class Config:
|
||||
|
@ -5,7 +5,6 @@ from fastapi import APIRouter, Depends, Query
|
||||
from models.brains import Brain
|
||||
from models.settings import common_dependencies
|
||||
from models.users import User
|
||||
|
||||
from routes.authorizations.brain_authorization import (
|
||||
has_brain_authorization,
|
||||
validate_brain_authorization,
|
||||
|
@ -67,38 +67,39 @@ def error_callback(exception):
|
||||
print("An exception occurred:", exception)
|
||||
|
||||
|
||||
def process_batch(batch_ids: List[str]):
    """
    Fetch file name and size metadata for a batch of vector ids.

    Queries the "vectors" table through the Supabase client obtained from
    common_dependencies(), selecting metadata->>file_name as "name" and
    metadata->>file_size as "size".

    Args:
        batch_ids: ids of the "vectors" rows to look up.

    Returns:
        The ``.data`` payload of the executed query, or None when the
        query raised (the error is logged instead of being propagated).
    """
    commons = common_dependencies()
    supabase = commons["supabase"]
    try:
        # Build the shared part of the query once; only the id filter differs.
        query = supabase.table("vectors").select(
            "name:metadata->>file_name, size:metadata->>file_size",
            count="exact",
        )
        if len(batch_ids) == 1:
            query = query.eq("id", batch_ids[0])  # single id: equality filter
        else:
            query = query.in_("id", batch_ids)  # several ids: IN filter
        return query.execute().data
    except Exception as e:
        # The exception must be bound to a %s placeholder: passing it as a
        # bare extra argument makes the logging module fail to format the
        # record, so the error would never be reported properly.
        logger.error("Error retrieving batched vectors: %s", e)
        return None
|
||||
|
||||
|
||||
def get_unique_files_from_vector_ids(vectors_ids: List[int]):
|
||||
def get_unique_files_from_vector_ids(vectors_ids: List[str]):
|
||||
# Move into Vectors class
|
||||
"""
|
||||
Retrieve unique user data vectors.
|
||||
"""
|
||||
print("vectors_ids", vectors_ids)
|
||||
|
||||
# constants
|
||||
BATCH_SIZE = 5
|
||||
|
97
scripts/202307111517031_change_vectors_id_type.sql
Normal file
97
scripts/202307111517031_change_vectors_id_type.sql
Normal file
@ -0,0 +1,97 @@
|
||||
-- Change vector ID type from BIGINT to UUID for langchain compatibility: https://github.com/hwchase17/langchain/commit/f773c217236ef07bea2203bc20d166569a0a0596
-- The whole migration runs in one transaction: either every table, column,
-- constraint and function below is switched to UUID ids, or nothing changes.
BEGIN;

-- Create a temporary mapping table (old BIGINT id -> freshly generated UUID)
CREATE TEMP TABLE tmp_id_mapping (
  old_id BIGINT,
  new_id UUID
);

-- Generate new UUIDs for each row in vectors, store old and new IDs in mapping table
INSERT INTO tmp_id_mapping (old_id, new_id)
SELECT id, uuid_generate_v4() FROM vectors;

-- Create a new vectors table with the desired structure.
-- The id default matches the definition used for fresh installs in tables.sql,
-- so migrated and newly created databases end up with the same schema.
CREATE TABLE vectors_new (
  id UUID DEFAULT uuid_generate_v4() PRIMARY KEY,
  content TEXT,
  metadata JSONB,
  embedding VECTOR(1536)
);

-- Copy data from the old vectors table to the new one, replacing old IDs with new UUIDs
INSERT INTO vectors_new (id, content, metadata, embedding)
SELECT tmp_id_mapping.new_id, vectors.content, vectors.metadata, vectors.embedding
FROM vectors
JOIN tmp_id_mapping ON vectors.id = tmp_id_mapping.old_id;

-- Rename the old vectors table and the new one
-- (the old data stays available as vectors_old)
ALTER TABLE vectors RENAME TO vectors_old;
ALTER TABLE vectors_new RENAME TO vectors;

-- Add new UUID columns in brains_vectors and summaries
ALTER TABLE brains_vectors ADD COLUMN new_vector_id UUID;
ALTER TABLE summaries ADD COLUMN new_document_id UUID;

-- Update the new columns in brains_vectors and summaries to match the new UUIDs
UPDATE brains_vectors
SET new_vector_id = tmp_id_mapping.new_id
FROM tmp_id_mapping
WHERE brains_vectors.vector_id = tmp_id_mapping.old_id;

UPDATE summaries
SET new_document_id = tmp_id_mapping.new_id
FROM tmp_id_mapping
WHERE summaries.document_id = tmp_id_mapping.old_id;

-- Drop old columns and rename new columns in brains_vectors and summaries.
-- Dropping a column also drops every constraint that involves it
-- (the foreign keys to vectors and the brains_vectors primary key).
ALTER TABLE brains_vectors DROP COLUMN vector_id;
ALTER TABLE brains_vectors RENAME COLUMN new_vector_id TO vector_id;

ALTER TABLE summaries DROP COLUMN document_id;
ALTER TABLE summaries RENAME COLUMN new_document_id TO document_id;

-- Restore the composite primary key of brains_vectors that was dropped
-- together with the old vector_id column
ALTER TABLE brains_vectors ADD PRIMARY KEY (brain_id, vector_id);

-- Add foreign key constraints back to brains_vectors and summaries
ALTER TABLE brains_vectors ADD CONSTRAINT brains_vectors_vector_id_fkey FOREIGN KEY (vector_id) REFERENCES vectors (id);
ALTER TABLE summaries ADD CONSTRAINT summaries_document_id_fkey FOREIGN KEY (document_id) REFERENCES vectors (id);

-- Update the match_vectors function so it returns the UUID id
DROP FUNCTION IF EXISTS match_vectors(VECTOR, INT, UUID);
CREATE FUNCTION match_vectors(query_embedding VECTOR(1536), match_count INT, p_brain_id UUID)
RETURNS TABLE(
  id UUID,
  brain_id UUID,
  content TEXT,
  metadata JSONB,
  embedding VECTOR(1536),
  similarity FLOAT
) LANGUAGE plpgsql AS $$
#variable_conflict use_column
BEGIN
  RETURN QUERY
  SELECT
    vectors.id,
    brains_vectors.brain_id,
    vectors.content,
    vectors.metadata,
    vectors.embedding,
    1 - (vectors.embedding <=> query_embedding) AS similarity
  FROM
    vectors
  INNER JOIN
    brains_vectors ON vectors.id = brains_vectors.vector_id
  WHERE brains_vectors.brain_id = p_brain_id
  ORDER BY
    vectors.embedding <=> query_embedding
  LIMIT match_count;
END;
$$;

-- NOTE(review): tables.sql now declares match_summaries as returning
-- document_id UUID, but this migration does not re-create that function;
-- confirm whether existing databases need it updated as well.

-- Update migrations table (insert only once so the script can be re-run safely)
INSERT INTO migrations (name)
SELECT '202307111517031_change_vectors_id_type'
WHERE NOT EXISTS (
  SELECT 1 FROM migrations WHERE name = '202307111517031_change_vectors_id_type'
);

COMMIT;
|
@ -31,7 +31,7 @@ CREATE EXTENSION IF NOT EXISTS vector;
|
||||
|
||||
-- Create vectors table
|
||||
CREATE TABLE IF NOT EXISTS vectors (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
id UUID DEFAULT uuid_generate_v4() PRIMARY KEY,
|
||||
content TEXT,
|
||||
metadata JSONB,
|
||||
embedding VECTOR(1536)
|
||||
@ -40,7 +40,7 @@ CREATE TABLE IF NOT EXISTS vectors (
|
||||
-- Create function to match vectors
|
||||
CREATE OR REPLACE FUNCTION match_vectors(query_embedding VECTOR(1536), match_count INT, p_brain_id UUID)
|
||||
RETURNS TABLE(
|
||||
id BIGINT,
|
||||
id UUID,
|
||||
brain_id UUID,
|
||||
content TEXT,
|
||||
metadata JSONB,
|
||||
@ -68,7 +68,6 @@ BEGIN
|
||||
END;
|
||||
$$;
|
||||
|
||||
|
||||
-- Create stats table
|
||||
CREATE TABLE IF NOT EXISTS stats (
|
||||
time TIMESTAMP,
|
||||
@ -82,7 +81,7 @@ CREATE TABLE IF NOT EXISTS stats (
|
||||
-- Create summaries table
|
||||
CREATE TABLE IF NOT EXISTS summaries (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
document_id BIGINT REFERENCES vectors(id),
|
||||
document_id UUID REFERENCES vectors(id),
|
||||
content TEXT,
|
||||
metadata JSONB,
|
||||
embedding VECTOR(1536)
|
||||
@ -92,7 +91,7 @@ CREATE TABLE IF NOT EXISTS summaries (
|
||||
CREATE OR REPLACE FUNCTION match_summaries(query_embedding VECTOR(1536), match_count INT, match_threshold FLOAT)
|
||||
RETURNS TABLE(
|
||||
id BIGINT,
|
||||
document_id BIGINT,
|
||||
document_id UUID,
|
||||
content TEXT,
|
||||
metadata JSONB,
|
||||
embedding VECTOR(1536),
|
||||
@ -145,13 +144,13 @@ CREATE TABLE IF NOT EXISTS brains_users (
|
||||
default_brain BOOLEAN DEFAULT false,
|
||||
PRIMARY KEY (brain_id, user_id),
|
||||
FOREIGN KEY (user_id) REFERENCES auth.users (id),
|
||||
FOREIGN KEY (brain_id) REFERENCES Brains (brain_id)
|
||||
FOREIGN KEY (brain_id) REFERENCES brains (brain_id)
|
||||
);
|
||||
|
||||
-- Create brains X vectors table
|
||||
CREATE TABLE IF NOT EXISTS brains_vectors (
|
||||
brain_id UUID,
|
||||
vector_id BIGINT,
|
||||
vector_id UUID,
|
||||
file_sha1 TEXT,
|
||||
PRIMARY KEY (brain_id, vector_id),
|
||||
FOREIGN KEY (vector_id) REFERENCES vectors (id),
|
||||
@ -164,17 +163,16 @@ CREATE TABLE IF NOT EXISTS brain_subscription_invitations (
|
||||
email VARCHAR(255),
|
||||
rights VARCHAR(255),
|
||||
PRIMARY KEY (brain_id, email),
|
||||
FOREIGN KEY (brain_id) REFERENCES Brains (brain_id)
|
||||
FOREIGN KEY (brain_id) REFERENCES brains (brain_id)
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE IF NOT EXISTS migrations (
|
||||
name VARCHAR(255) PRIMARY KEY,
|
||||
executed_at TIMESTAMPTZ DEFAULT current_timestamp
|
||||
);
|
||||
|
||||
INSERT INTO migrations (name)
|
||||
SELECT '202307111517030_add_subscription_invitations_table'
|
||||
SELECT '202307111517031_change_vectors_id_type'
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM migrations WHERE name = '202307111517030_add_subscription_invitations_table'
|
||||
);
|
||||
SELECT 1 FROM migrations WHERE name = '202307111517031_change_vectors_id_type'
|
||||
);
|
||||
|
Loading…
Reference in New Issue
Block a user