feat(chunks): now chunk size is saved in database dynamically and not just 500 (#2164)

# Description

Chunks were previously assumed to be a fixed 500 tokens. With this change, each chunk's actual token count is computed with tiktoken (`cl100k_base`) during file processing and stored as `chunk_size` in the chunk's metadata. The `match_vectors` Postgres function is rewritten to match: instead of returning a fixed `match_count` of rows, it ranks chunks by similarity and keeps adding them while the running sum of their `chunk_size` values stays within the model's input budget (`max_chunk_sum`, wired to `max_input`). The PR also fixes double counting in the daily request counter, removes an outdated `delete_prompt_by_id` test, and disables the `./backend` volume mounts for the worker, beat, and flower services in docker-compose.
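
As a rough sketch of the new retrieval rule (illustrative Python, not code from this PR; `select_chunks` and the chunk dict shape are invented for the example):

```python
# Illustrative sketch of the running-total filter that match_vectors now
# applies in Postgres (see the migration at the bottom of this diff).

def select_chunks(chunks: list[dict], max_chunk_sum: int) -> list[dict]:
    # Rank by similarity, most similar first.
    ranked = sorted(chunks, key=lambda c: c["similarity"], reverse=True)
    selected, running_total = [], 0
    for chunk in ranked:
        running_total += chunk["chunk_size"]  # token count stored at ingestion
        if running_total > max_chunk_sum:  # stop once the budget is exceeded
            break
        selected.append(chunk)
    return selected

# With a 1000-token budget, only the more similar of these two chunks fits:
print(select_chunks(
    [{"similarity": 0.9, "chunk_size": 600}, {"similarity": 0.8, "chunk_size": 500}],
    max_chunk_sum=1000,
))  # -> [{'similarity': 0.9, 'chunk_size': 600}]
```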

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
Stan Girard committed 2024-02-06 23:23:37 -08:00 (via GitHub)
parent 358e50b98d
commit 03c49693b7
11 changed files with 102 additions and 42 deletions

View File

@@ -1,9 +1,7 @@
 from ast import List
 from datetime import datetime, timedelta
 from uuid import UUID

 from logger import get_logger
 from models.databases.entity import LLMModels
 from models.databases.repository import Repository

 logger = get_logger(__name__)
@@ -269,11 +267,8 @@ class UserUsage(Repository):
         """
         Increment the user's requests count for a specific day
         """
-        current = self.get_user_requests_count_for_day(user_id, date)
-        self.update_user_request_count(
-            user_id, daily_requests_count=current + number, date=date
-        )
+        self.update_user_request_count(user_id, daily_requests_count=number, date=date)

     def update_user_request_count(self, user_id, daily_requests_count, date):
         response = (

View File

@@ -5,6 +5,7 @@ from uuid import UUID
 from fastapi import UploadFile
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
 from logger import get_logger
 from models.databases.supabase.supabase import SupabaseDB
 from models.settings import get_supabase_db
@@ -26,7 +27,7 @@ class File(BaseModel):
     content: Optional[Any] = None
     chunk_size: int = 500
     chunk_overlap: int = 0
-    documents: Optional[Any] = None
+    documents: Optional[Document] = None

     @property
     def supabase_db(self) -> SupabaseDB:

View File

@@ -71,25 +71,32 @@ class UserUsage(UserIdentity):
         current_requests_count = self.supabase_db.get_user_requests_count_for_month(
             self.id, date
         )
+        daily_requests_count = self.supabase_db.get_user_requests_count_for_day(
+            self.id, date
+        )
+        logger.info("Are you here?")
+        logger.info(current_requests_count)

-        if current_requests_count == 0:
+        if daily_requests_count == 0:
+            logger.info("Request count is 0, creating new record")
             if self.email is None:
                 raise ValueError("User Email should be defined for daily usage table")
             self.supabase_db.create_user_daily_usage(
                 user_id=self.id, date=date, user_email=self.email, number=number
             )
-            self.daily_requests_count = current_requests_count + number
+            self.daily_requests_count = number
             return

+        logger.info("Request count is not 0, updating the record")
         self.supabase_db.increment_user_request_count(
             user_id=self.id,
             date=date,
-            number=number,
+            number=current_requests_count + number,
         )
-        self.daily_requests_count = current_requests_count
+        self.daily_requests_count = current_requests_count + number
         logger.info(
-            f"User {self.email} request count updated to {current_requests_count}"
+            f"User {self.email} request count updated to {self.daily_requests_count}"
         )

View File

@@ -41,16 +41,23 @@ def find_model_and_generate_metadata(
         follow_up_questions = chat_service.get_follow_up_question(chat_id)
         metadata["follow_up_questions"] = follow_up_questions

+    # Default model is gpt-3.5-turbo-0125
+    default_model = "gpt-3.5-turbo-0125"
     model_to_use = LLMModels(  # TODO Implement default models in database
-        name="gpt-3.5-turbo-0125", price=1, max_input=12000, max_output=1000
+        name=default_model, price=1, max_input=4000, max_output=1000
     )
+    logger.info("Brain model: %s", brain.model)
+
+    # If brain.model is None, set it to the default_model
+    if brain.model is None:
+        brain.model = default_model

     is_brain_model_available = any(
         brain.model == model_dict.get("name") for model_dict in models_settings
     )
     is_user_allowed_model = brain.model in user_settings.get(
-        "models", ["gpt-3.5-turbo-0125"]
+        "models", [default_model]
     )  # Checks if the model is available in the list of models

     logger.info(f"Brain model: {brain.model}")
@@ -63,7 +70,7 @@ def find_model_and_generate_metadata(
     model_to_use.name = brain.model
     for model_dict in models_settings:
         if model_dict.get("name") == model_to_use.name:
+            logger.info(f"Using model {model_to_use.name}")
             model_to_use.price = model_dict.get("price")
             model_to_use.max_input = model_dict.get("max_input")
             model_to_use.max_output = model_dict.get("max_output")
             break
@@ -72,6 +79,9 @@ def find_model_and_generate_metadata(
     metadata["max_tokens"] = model_to_use.max_output
     metadata["max_input"] = model_to_use.max_input

+    logger.info(f"Model to use: {model_to_use}")
+    logger.info(f"Metadata: {metadata}")
+
     return model_to_use, metadata

View File

@@ -3,7 +3,6 @@ import uuid
 import pytest
 from fastapi import HTTPException
 from modules.prompt.repository.prompts import Prompts
-from modules.prompt.service.prompt_service import DeletePromptResponse

 def test_get_public_prompts(client, api_key):
@@ -15,20 +14,6 @@ def test_get_public_prompts(client, api_key):
     assert len(response.json()) == 0

-
-def test_delete_prompt_by_id():
-    # Arrange
-    prompts = Prompts()
-    prompt_id = uuid.uuid4()  # Generate a valid UUID
-
-    # Act
-    result = prompts.delete_prompt_by_id(prompt_id)
-
-    # Assert
-    assert isinstance(result, DeletePromptResponse)
-    assert result.status == "deleted"
-    assert result.prompt_id == prompt_id

 def test_delete_prompt_by_id_not_found():
     # Arrange
     prompts = Prompts()

View File

@@ -13,8 +13,6 @@ logger = get_logger(__name__)

 class Neurons(BaseModel):
     def create_vector(self, docs):
         documents_vector_store = get_documents_vector_store()
-        logger.info("Creating vector for document")
-        logger.info(f"Document: {docs}")
         try:
             sids = documents_vector_store.add_documents(docs)

View File

@@ -1,5 +1,6 @@
 import time

+import tiktoken
 from logger import get_logger
 from models import File
 from modules.brain.service.brain_vector_service import BrainVectorService
@@ -35,10 +36,16 @@ async def process_file(
     }
     docs = []
+    enc = tiktoken.get_encoding("cl100k_base")

     if file.documents is not None:
         for doc in file.documents:  # pyright: ignore reportPrivateUsage=none
+            new_metadata = metadata.copy()
+            len_chunk = len(enc.encode(doc.page_content))
+            logger.info(f"Chunk size: {len_chunk}")
+            new_metadata["chunk_size"] = len_chunk
             doc_with_metadata = DocumentSerializable(
-                page_content=doc.page_content, metadata=metadata
+                page_content=doc.page_content, metadata=new_metadata
             )
             docs.append(doc_with_metadata)
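
For context, a minimal standalone sketch of the token counting added above (assumes `tiktoken` is installed; `chunk_token_count` is our name, not the PR's):

```python
import tiktoken

# cl100k_base is the encoding process_file uses above.
enc = tiktoken.get_encoding("cl100k_base")

def chunk_token_count(text: str) -> int:
    # This is the value stored as `chunk_size` in each chunk's metadata.
    return len(enc.encode(text))

print(chunk_token_count("Hello world"))  # -> 2
```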

View File

@@ -79,7 +79,7 @@ class CustomSupabaseVectorStore(SupabaseVectorStore):
             table,
             {
                 "query_embedding": query_embedding,
-                "match_count": self.number_docs,
+                "max_chunk_sum": self.max_input,
                 "p_brain_id": str(self.brain_id),
             },
         ).execute()

View File

@@ -46,8 +46,8 @@ services:
       context: backend
       dockerfile: Dockerfile
     container_name: worker
-    volumes:
-      - ./backend/:/code/
+    # volumes:
+    #   - ./backend/:/code/
     command: >
       /bin/sh -c "
       watchmedo auto-restart -d . -p '*.py' --recursive -- celery -A celery_worker worker -l info
@@ -64,8 +64,7 @@ services:
     build:
       context: backend
       dockerfile: Dockerfile
-    volumes:
-      - ./backend/:/code/
     container_name: beat
     command: >
       /bin/sh -c "
@@ -83,8 +82,7 @@ services:
     build:
       context: backend
       dockerfile: Dockerfile
-    volumes:
-      - ./backend/:/code/
     container_name: flower
     command: celery -A celery_worker flower -l info --port=5555
     restart: always

View File

@@ -1,5 +1,3 @@
-alter extension "wrappers" update to '0.2.0';
-
 alter table "public"."brains" drop constraint "brains_prompt_id_fkey";

 alter table "public"."chat_history" drop constraint "chat_history_prompt_id_fkey";

View File

@@ -0,0 +1,61 @@
+alter table "public"."brains_vectors" drop constraint "brains_vectors_vector_id_fkey";
+
+drop function if exists "public"."match_vectors"(query_embedding vector, match_count integer, p_brain_id uuid);
+
+CREATE INDEX vectors_metadata_idx ON public.vectors USING gin (metadata);
+
+alter table "public"."brains_vectors" add constraint "brains_vectors_vector_id_fkey" FOREIGN KEY (vector_id) REFERENCES vectors(id) ON UPDATE CASCADE ON DELETE CASCADE not valid;
+
+alter table "public"."brains_vectors" validate constraint "brains_vectors_vector_id_fkey";
+
+set check_function_bodies = off;
+
+CREATE OR REPLACE FUNCTION public.match_vectors(query_embedding vector, p_brain_id uuid, max_chunk_sum integer)
+ RETURNS TABLE(id uuid, brain_id uuid, content text, metadata jsonb, embedding vector, similarity double precision)
+ LANGUAGE plpgsql
+AS $function$
+BEGIN
+    RETURN QUERY
+    WITH ranked_vectors AS (
+        SELECT
+            v.id AS vector_id, -- Explicitly qualified
+            bv.brain_id AS vector_brain_id, -- Explicitly qualified and aliased
+            v.content AS vector_content, -- Explicitly qualified and aliased
+            v.metadata AS vector_metadata, -- Explicitly qualified and aliased
+            v.embedding AS vector_embedding, -- Explicitly qualified and aliased
+            1 - (v.embedding <=> query_embedding) AS calculated_similarity, -- Calculated and aliased
+            (v.metadata->>'chunk_size')::integer AS chunk_size -- Explicitly qualified
+        FROM
+            vectors v
+        INNER JOIN
+            brains_vectors bv ON v.id = bv.vector_id
+        WHERE
+            bv.brain_id = p_brain_id
+        ORDER BY
+            calculated_similarity -- Aliased similarity
+    ), filtered_vectors AS (
+        SELECT
+            vector_id,
+            vector_brain_id,
+            vector_content,
+            vector_metadata,
+            vector_embedding,
+            calculated_similarity,
+            chunk_size,
+            sum(chunk_size) OVER (ORDER BY calculated_similarity) AS running_total
+        FROM ranked_vectors
+    )
+    SELECT
+        vector_id AS id,
+        vector_brain_id AS brain_id,
+        vector_content AS content,
+        vector_metadata AS metadata,
+        vector_embedding AS embedding,
+        calculated_similarity AS similarity
+    FROM filtered_vectors
+    WHERE running_total <= max_chunk_sum;
+END;
+$function$
+;
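
For reference, a hedged sketch of calling the new function from Python with supabase-py; the RPC name and parameters come from the diff above, while the client setup and placeholder values are assumptions:

```python
import uuid

from supabase import create_client

# Placeholders: substitute a real project URL, key, embedding, and brain id.
supabase = create_client("https://<project>.supabase.co", "<service-role-key>")
query_embedding = [0.0] * 1536  # stand-in for the query's embedding vector
brain_id = uuid.uuid4()

response = supabase.rpc(
    "match_vectors",
    {
        "query_embedding": query_embedding,
        "p_brain_id": str(brain_id),
        "max_chunk_sum": 4000,  # the model's max_input token budget
    },
).execute()

rows = response.data  # chunks whose cumulative chunk_size fits the budget
```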