Fix: change vector id to UUID (#609)

* fix: document upload

* feat: explore fix to use uuid id

* chore: remove prints

* fix: tables.sql
This commit is contained in:
Matt 2023-07-12 11:44:34 +01:00 committed by GitHub
parent f9a04ffbe2
commit cef45ea712
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 134 additions and 39 deletions

View File

@ -18,7 +18,7 @@ class Brain(BaseModel):
model: Optional[str] = "gpt-3.5-turbo-0613"
temperature: Optional[float] = 0.0
max_tokens: Optional[int] = 256
max_brain_size: Optional[int] = int(os.getenv("MAX_BRAIN_SIZE",52428800))
max_brain_size: Optional[int] = int(os.getenv("MAX_BRAIN_SIZE", 52428800))
files: List[Any] = []
class Config:

View File

@ -5,7 +5,6 @@ from fastapi import APIRouter, Depends, Query
from models.brains import Brain
from models.settings import common_dependencies
from models.users import User
from routes.authorizations.brain_authorization import (
has_brain_authorization,
validate_brain_authorization,

View File

def error_callback(exception):
    # Invoked when background vector processing fails; reports the error
    # on stdout (str(exception) is interpolated after the fixed prefix).
    print(f"An exception occurred: {exception}")
def process_batch(batch_ids: List[str]):
    """Fetch file metadata for the vectors with the given ids.

    Args:
        batch_ids: vector ids (UUID strings after the #609 migration) to
            look up in the ``vectors`` table.

    Returns:
        A list of rows shaped ``{"name": <file_name>, "size": <file_size>}``
        pulled from each vector's JSONB metadata, or an empty list when the
        lookup fails.
    """
    commons = common_dependencies()
    supabase = commons["supabase"]
    # Build the shared projection once instead of duplicating it per branch.
    query = supabase.table("vectors").select(
        "name:metadata->>file_name, size:metadata->>file_size",
        count="exact",
    )
    try:
        if len(batch_ids) == 1:
            # eq() lets the client bind the single id safely.
            query = query.eq("id", batch_ids[0])
        else:
            # in_() binds the whole id list safely.
            query = query.in_("id", batch_ids)
        return query.execute().data
    except Exception as e:
        # Use lazy %-formatting: the previous call passed `e` as a bare
        # second positional argument, which logging treats as a format
        # parameter for a message with no placeholders and silently drops.
        logger.error("Error retrieving batched vectors: %s", e)
        # Return an empty list (still falsy) so callers that iterate the
        # result don't crash on an implicit None.
        return []
def get_unique_files_from_vector_ids(vectors_ids: List[int]):
def get_unique_files_from_vector_ids(vectors_ids: List[str]):
# Move into Vectors class
"""
Retrieve unique user data vectors.
"""
print("vectors_ids", vectors_ids)
# constants
BATCH_SIZE = 5

View File

@ -0,0 +1,97 @@
-- Change vector ID type from BIGINT to UUID for langchain compatibility: https://github.com/hwchase17/langchain/commit/f773c217236ef07bea2203bc20d166569a0a0596
BEGIN;
-- uuid_generate_v4() lives in the uuid-ossp extension; ensure it exists so
-- the migration does not fail on a fresh database where it was never enabled.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- Create a temporary mapping table from old BIGINT ids to new UUIDs
CREATE TEMP TABLE tmp_id_mapping (
    old_id BIGINT,
    new_id UUID
);
-- Generate new UUIDs for each row in vectors, store old and new IDs in mapping table
INSERT INTO tmp_id_mapping (old_id, new_id)
SELECT id, uuid_generate_v4() FROM vectors;
-- Create a new vectors table with the desired structure
CREATE TABLE vectors_new (
    id UUID PRIMARY KEY,
    content TEXT,
    metadata JSONB,
    embedding VECTOR(1536)
);
-- Copy data from the old vectors table to the new one, replacing old IDs with new UUIDs
INSERT INTO vectors_new (id, content, metadata, embedding)
SELECT tmp_id_mapping.new_id, vectors.content, vectors.metadata, vectors.embedding
FROM vectors
JOIN tmp_id_mapping ON vectors.id = tmp_id_mapping.old_id;
-- Rename the old vectors table and the new one.
-- NOTE(review): vectors_old is kept around, presumably as a rollback safety
-- net — confirm and drop it manually once the migration is verified.
ALTER TABLE vectors RENAME TO vectors_old;
ALTER TABLE vectors_new RENAME TO vectors;
-- Add new UUID columns in brains_vectors and summaries
ALTER TABLE brains_vectors ADD COLUMN new_vector_id UUID;
ALTER TABLE summaries ADD COLUMN new_document_id UUID;
-- Update the new columns in brains_vectors and summaries to match the new UUIDs
UPDATE brains_vectors
SET new_vector_id = tmp_id_mapping.new_id
FROM tmp_id_mapping
WHERE brains_vectors.vector_id = tmp_id_mapping.old_id;
UPDATE summaries
SET new_document_id = tmp_id_mapping.new_id
FROM tmp_id_mapping
WHERE summaries.document_id = tmp_id_mapping.old_id;
-- Drop old columns and rename new columns in brains_vectors and summaries
ALTER TABLE brains_vectors DROP COLUMN vector_id;
ALTER TABLE brains_vectors RENAME COLUMN new_vector_id TO vector_id;
ALTER TABLE summaries DROP COLUMN document_id;
ALTER TABLE summaries RENAME COLUMN new_document_id TO document_id;
-- Add foreign key constraints back to brains_vectors and summaries
ALTER TABLE brains_vectors ADD CONSTRAINT brains_vectors_vector_id_fkey FOREIGN KEY (vector_id) REFERENCES vectors (id);
ALTER TABLE summaries ADD CONSTRAINT summaries_document_id_fkey FOREIGN KEY (document_id) REFERENCES vectors (id);
-- Update the match_vectors function: drop the old signature first because
-- the return type (id column) changes from BIGINT to UUID.
DROP FUNCTION IF EXISTS match_vectors(VECTOR, INT, UUID);
CREATE FUNCTION match_vectors(query_embedding VECTOR(1536), match_count INT, p_brain_id UUID)
RETURNS TABLE(
    id UUID,
    brain_id UUID,
    content TEXT,
    metadata JSONB,
    embedding VECTOR(1536),
    similarity FLOAT
) LANGUAGE plpgsql AS $$
#variable_conflict use_column
BEGIN
    RETURN QUERY
    SELECT
        vectors.id,
        brains_vectors.brain_id,
        vectors.content,
        vectors.metadata,
        vectors.embedding,
        1 - (vectors.embedding <=> query_embedding) AS similarity
    FROM
        vectors
    INNER JOIN
        brains_vectors ON vectors.id = brains_vectors.vector_id
    WHERE brains_vectors.brain_id = p_brain_id
    ORDER BY
        vectors.embedding <=> query_embedding
    LIMIT match_count;
END;
$$;
-- Record the migration idempotently so re-running this file is a no-op
INSERT INTO migrations (name)
SELECT '202307111517031_change_vectors_id_type'
WHERE NOT EXISTS (
    SELECT 1 FROM migrations WHERE name = '202307111517031_change_vectors_id_type'
);
COMMIT;

View File

@ -31,7 +31,7 @@ CREATE EXTENSION IF NOT EXISTS vector;
-- Create vectors table
CREATE TABLE IF NOT EXISTS vectors (
id BIGSERIAL PRIMARY KEY,
id UUID DEFAULT uuid_generate_v4() PRIMARY KEY,
content TEXT,
metadata JSONB,
embedding VECTOR(1536)
@ -40,7 +40,7 @@ CREATE TABLE IF NOT EXISTS vectors (
-- Create function to match vectors
CREATE OR REPLACE FUNCTION match_vectors(query_embedding VECTOR(1536), match_count INT, p_brain_id UUID)
RETURNS TABLE(
id BIGINT,
id UUID,
brain_id UUID,
content TEXT,
metadata JSONB,
@ -68,7 +68,6 @@ BEGIN
END;
$$;
-- Create stats table
CREATE TABLE IF NOT EXISTS stats (
time TIMESTAMP,
@ -82,7 +81,7 @@ CREATE TABLE IF NOT EXISTS stats (
-- Create summaries table
CREATE TABLE IF NOT EXISTS summaries (
id BIGSERIAL PRIMARY KEY,
document_id BIGINT REFERENCES vectors(id),
document_id UUID REFERENCES vectors(id),
content TEXT,
metadata JSONB,
embedding VECTOR(1536)
@ -92,7 +91,7 @@ CREATE TABLE IF NOT EXISTS summaries (
CREATE OR REPLACE FUNCTION match_summaries(query_embedding VECTOR(1536), match_count INT, match_threshold FLOAT)
RETURNS TABLE(
id BIGINT,
document_id BIGINT,
document_id UUID,
content TEXT,
metadata JSONB,
embedding VECTOR(1536),
@ -145,13 +144,13 @@ CREATE TABLE IF NOT EXISTS brains_users (
default_brain BOOLEAN DEFAULT false,
PRIMARY KEY (brain_id, user_id),
FOREIGN KEY (user_id) REFERENCES auth.users (id),
FOREIGN KEY (brain_id) REFERENCES Brains (brain_id)
FOREIGN KEY (brain_id) REFERENCES brains (brain_id)
);
-- Create brains X vectors table
CREATE TABLE IF NOT EXISTS brains_vectors (
brain_id UUID,
vector_id BIGINT,
vector_id UUID,
file_sha1 TEXT,
PRIMARY KEY (brain_id, vector_id),
FOREIGN KEY (vector_id) REFERENCES vectors (id),
@ -164,17 +163,16 @@ CREATE TABLE IF NOT EXISTS brain_subscription_invitations (
email VARCHAR(255),
rights VARCHAR(255),
PRIMARY KEY (brain_id, email),
FOREIGN KEY (brain_id) REFERENCES Brains (brain_id)
FOREIGN KEY (brain_id) REFERENCES brains (brain_id)
);
CREATE TABLE IF NOT EXISTS migrations (
name VARCHAR(255) PRIMARY KEY,
executed_at TIMESTAMPTZ DEFAULT current_timestamp
);
INSERT INTO migrations (name)
SELECT '202307111517030_add_subscription_invitations_table'
SELECT '202307111517031_change_vectors_id_type'
WHERE NOT EXISTS (
SELECT 1 FROM migrations WHERE name = '202307111517030_add_subscription_invitations_table'
);
SELECT 1 FROM migrations WHERE name = '202307111517031_change_vectors_id_type'
);