quivr/supabase/migrations/20240320215813_fix_match_vector_function.sql
Damien Mourot af59816c3f
fix(retriever): Update match_vectors sql function to rank chunks in correct order (#2367)
# Description
Fix the match_vectors function, which ranked chunks in the wrong order.

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [x] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
2024-03-20 22:59:57 -07:00

50 lines
1.8 KiB
PL/PgSQL

-- Skip validation of routine bodies while this migration (re)creates functions.
SET check_function_bodies TO off;
-- match_vectors: return the chunks of one brain that are most similar to a
-- query embedding, best match first, limited by a cumulative chunk-size budget.
--
-- Parameters:
--   query_embedding  embedding to compare every stored vector against
--   p_brain_id       only vectors linked to this brain (via brains_vectors) are considered
--   max_chunk_sum    upper bound on the running sum of metadata->>'chunk_size'
--                    over the rows taken in descending-similarity order
--
-- Returns one row per selected vector:
--   (id, brain_id, content, metadata, embedding, similarity)
--   where similarity = 1 - (embedding <=> query_embedding), ordered by
--   similarity descending (ties broken by vector id so the result is stable).
--
-- NOTE(review): rows whose metadata has no 'chunk_size' contribute NULL to the
-- running sum (sum() skips NULLs); if the best-ranked rows all lack it their
-- running_total is NULL and they are filtered out. Confirm that every stored
-- chunk carries 'chunk_size'.
CREATE OR REPLACE FUNCTION public.match_vectors(query_embedding vector, p_brain_id uuid, max_chunk_sum integer)
RETURNS TABLE(id uuid, brain_id uuid, content text, metadata jsonb, embedding vector, similarity double precision)
LANGUAGE plpgsql
AS $function$
BEGIN
    RETURN QUERY
    -- Score every vector of the requested brain. Columns are table-qualified
    -- and aliased so they cannot clash with the RETURNS TABLE output names,
    -- which are visible as variables inside the function body.
    WITH scored_vectors AS (
        SELECT
            v.id AS vector_id,
            bv.brain_id AS vector_brain_id,
            v.content AS vector_content,
            v.metadata AS vector_metadata,
            v.embedding AS vector_embedding,
            1 - (v.embedding <=> query_embedding) AS calculated_similarity,
            (v.metadata->>'chunk_size')::integer AS chunk_size
        FROM
            vectors v
        INNER JOIN
            brains_vectors bv ON v.id = bv.vector_id
        WHERE
            bv.brain_id = p_brain_id
        -- No ORDER BY here: ordering inside a CTE is not a guarantee for the
        -- outer query; ranking is done by the window and the final ORDER BY.
    ),
    -- Accumulate chunk sizes from the most similar vector downwards.
    budgeted_vectors AS (
        SELECT
            sv.vector_id,
            sv.vector_brain_id,
            sv.vector_content,
            sv.vector_metadata,
            sv.vector_embedding,
            sv.calculated_similarity,
            sv.chunk_size,
            -- ROWS frame + unique tie-breaker: each row gets its own running
            -- total. The default RANGE frame would give every row of a
            -- similarity tie the sum of the whole tie group.
            sum(sv.chunk_size) OVER (
                ORDER BY sv.calculated_similarity DESC, sv.vector_id
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
            ) AS running_total
        FROM scored_vectors sv
    )
    SELECT
        bud.vector_id AS id,
        bud.vector_brain_id AS brain_id,
        bud.vector_content AS content,
        bud.vector_metadata AS metadata,
        bud.vector_embedding AS embedding,
        bud.calculated_similarity AS similarity
    FROM budgeted_vectors bud
    WHERE bud.running_total <= max_chunk_sum
    -- Explicit final ordering: without it the row order is unspecified.
    ORDER BY bud.calculated_similarity DESC, bud.vector_id;
END;
$function$
;