Feat/static analysis (#582)

* feat: add static analysis

* chore: update Makefile add static analysis script

* chore: add vscode extensions recommendations
This commit is contained in:
Mamadou DICKO 2023-07-10 14:27:49 +02:00 committed by GitHub
parent f2a06dc6de
commit 9e9f531c99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
34 changed files with 400 additions and 186 deletions

8
.vscode/extensions.json vendored Normal file
View File

@ -0,0 +1,8 @@
{
"recommendations": [
"ms-pyright.pyright",
"dbaeumer.vscode-eslint",
"ms-python.vscode-pylance",
"ms-pyright.pyright"
]
}

View File

@ -12,7 +12,8 @@
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": true
"source.organizeImports": true,
"source.fixAll": true
}
},
"[typescriptreact]": {
@ -41,4 +42,7 @@
"**/.docusaurus/": true,
"**/node_modules/": true,
},
"python.linting.pycodestyleCategorySeverity.W": "Error",
"python.defaultInterpreterPath": "python3",
"python.linting.flake8CategorySeverity.W": "Error",
}

View File

@ -4,4 +4,11 @@ dev:
docker compose -f docker-compose.dev.yml up --build
prod:
docker compose -f docker-compose.yml up --build
docker compose -f docker-compose.yml up --build
# Run the pyright static type checker, preferring `python3` and falling back
# to `python` when `python3` is not on PATH.
# The probe uses the POSIX redirection `>/dev/null 2>&1` rather than the
# bash-only `&>`: unless SHELL is overridden, make runs recipes with /bin/sh,
# where `cmd &>/dev/null` is parsed as `cmd &` (run in background) followed by
# a bare redirect, so the condition would always succeed and the fallback
# branch would never be taken.
test-type:
	@if command -v python3 >/dev/null 2>&1; then \
		python3 -m pyright; \
	else \
		python -m pyright; \
	fi

View File

@ -1,12 +1,13 @@
import os
from typing import Optional
from auth.api_key_handler import get_user_from_api_key, verify_api_key
from auth.jwt_token_handler import decode_access_token, verify_token
from fastapi import Depends, HTTPException, Request
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from models.users import User
from auth.api_key_handler import get_user_from_api_key, verify_api_key
from auth.jwt_token_handler import decode_access_token, verify_token
class AuthBearer(HTTPBearer):
def __init__(self, auto_error: bool = True):
@ -20,7 +21,7 @@ class AuthBearer(HTTPBearer):
request
)
self.check_scheme(credentials)
token = credentials.credentials
token = credentials.credentials # pyright: ignore reportPrivateUsage=none
return await self.authenticate(
token,
)
@ -52,7 +53,7 @@ class AuthBearer(HTTPBearer):
def get_test_user(self) -> User:
return User(
email="test@example.com", id="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"
email="test@example.com", id="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" # type: ignore
) # replace with test user information

View File

@ -9,6 +9,9 @@ from models.users import User
SECRET_KEY = os.environ.get("JWT_SECRET_KEY")
ALGORITHM = "HS256"
if not SECRET_KEY:
raise ValueError("JWT_SECRET_KEY environment variable not set")
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
to_encode = data.copy()
@ -27,9 +30,12 @@ def decode_access_token(token: str) -> User:
token, SECRET_KEY, algorithms=[ALGORITHM], options={"verify_aud": False}
)
except JWTError:
return None
return None # pyright: ignore reportPrivateUsage=none
return User(email=payload.get("email"), id=payload.get("sub"))
return User(
email=payload.get("email"),
id=payload.get("sub"), # pyright: ignore reportPrivateUsage=none
)
def verify_token(token: str):

View File

@ -4,7 +4,6 @@ import tempfile
import unicodedata
import requests
from langchain.document_loaders import GitLoader
from pydantic import BaseModel
@ -29,7 +28,7 @@ class CrawlWebsite(BaseModel):
file_name = slugify(self.url) + ".html"
temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
with open(temp_file_path, "w") as temp_file:
temp_file.write(content)
temp_file.write(content) # pyright: ignore reportPrivateUsage=none
# Process the file
if content:

View File

@ -20,19 +20,21 @@ class BaseBrainPicking(BaseModel):
"""
# Instantiate settings
brain_settings = BrainSettings()
brain_settings = BrainSettings() # type: ignore other parameters are optional
# Default class attributes
model: str = None
model: str = None # pyright: ignore reportPrivateUsage=none
temperature: float = 0.0
chat_id: str = None
brain_id: str = None
chat_id: str = None # pyright: ignore reportPrivateUsage=none
brain_id: str = None # pyright: ignore reportPrivateUsage=none
max_tokens: int = 256
user_openai_api_key: str = None
user_openai_api_key: str = None # pyright: ignore reportPrivateUsage=none
streaming: bool = False
openai_api_key: str = None
callbacks: List[AsyncCallbackHandler] = None
openai_api_key: str = None # pyright: ignore reportPrivateUsage=none
callbacks: List[
AsyncCallbackHandler
] = None # pyright: ignore reportPrivateUsage=none
def _determine_api_key(self, openai_api_key, user_openai_api_key):
"""If user provided an API key, use it."""
@ -55,10 +57,12 @@ class BaseBrainPicking(BaseModel):
def _determine_callback_array(
self, streaming
) -> List[AsyncIteratorCallbackHandler]:
) -> List[AsyncIteratorCallbackHandler]: # pyright: ignore reportPrivateUsage=none
"""If streaming is set, set the AsyncIteratorCallbackHandler as the only callback."""
if streaming:
return [AsyncIteratorCallbackHandler]
return [
AsyncIteratorCallbackHandler # pyright: ignore reportPrivateUsage=none
]
def __init__(self, **data):
super().__init__(**data)
@ -66,8 +70,12 @@ class BaseBrainPicking(BaseModel):
self.openai_api_key = self._determine_api_key(
self.brain_settings.openai_api_key, self.user_openai_api_key
)
self.streaming = self._determine_streaming(self.model, self.streaming)
self.callbacks = self._determine_callback_array(self.streaming)
self.streaming = self._determine_streaming(
self.model, self.streaming
) # pyright: ignore reportPrivateUsage=none
self.callbacks = self._determine_callback_array(
self.streaming
) # pyright: ignore reportPrivateUsage=none
class Config:
"""Configuration of the Pydantic Object"""

View File

@ -1,4 +1,5 @@
from typing import Optional
from .FunctionCall import FunctionCall
@ -6,7 +7,7 @@ class OpenAiAnswer:
def __init__(
self,
content: Optional[str] = None,
function_call: FunctionCall = None,
function_call: FunctionCall = None, # pyright: ignore reportPrivateUsage=none
):
self.content = content
self.function_call = function_call

View File

@ -13,11 +13,12 @@ from repository.chat.format_chat_history import format_chat_history
from repository.chat.get_chat_history import get_chat_history
from repository.chat.update_chat_history import update_chat_history
from repository.chat.update_message_by_id import update_message_by_id
from supabase import Client, create_client
from supabase.client import Client, create_client
from vectorstore.supabase import (
CustomSupabaseVectorStore,
) # Custom class for handling vector storage with Supabase
)
# Custom class for handling vector storage with Supabase
from .base import BaseBrainPicking
from .prompts.CONDENSE_PROMPT import CONDENSE_QUESTION_PROMPT
@ -42,7 +43,7 @@ class OpenAIBrainPicking(BaseBrainPicking):
max_tokens: int,
user_openai_api_key: str,
streaming: bool = False,
) -> "OpenAIBrainPicking":
) -> "OpenAIBrainPicking": # pyright: ignore reportPrivateUsage=none
"""
Initialize the BrainPicking class by setting embeddings, supabase client, vector store, language model and chains.
:return: OpenAIBrainPicking instance
@ -59,7 +60,9 @@ class OpenAIBrainPicking(BaseBrainPicking):
@property
def embeddings(self) -> OpenAIEmbeddings:
return OpenAIEmbeddings(openai_api_key=self.openai_api_key)
return OpenAIEmbeddings(
openai_api_key=self.openai_api_key
) # pyright: ignore reportPrivateUsage=none
@property
def supabase_client(self) -> Client:
@ -92,14 +95,16 @@ class OpenAIBrainPicking(BaseBrainPicking):
@property
def doc_chain(self) -> LLMChain:
return load_qa_chain(llm=self.doc_llm, chain_type="stuff")
return load_qa_chain(
llm=self.doc_llm, chain_type="stuff"
) # pyright: ignore reportPrivateUsage=none
@property
def qa(self) -> ConversationalRetrievalChain:
return ConversationalRetrievalChain(
retriever=self.vector_store.as_retriever(),
question_generator=self.question_generator,
combine_docs_chain=self.doc_chain,
combine_docs_chain=self.doc_chain, # pyright: ignore reportPrivateUsage=none
verbose=True,
)
@ -116,7 +121,7 @@ class OpenAIBrainPicking(BaseBrainPicking):
model=model,
streaming=streaming,
callbacks=callbacks,
)
) # pyright: ignore reportPrivateUsage=none
def _call_chain(self, chain, question, history):
"""
@ -205,8 +210,10 @@ class OpenAIBrainPicking(BaseBrainPicking):
task = asyncio.create_task(
wrap_done(
self.qa._acall_chain(self.qa, question, transformed_history),
callback.done,
self.qa._acall_chain( # pyright: ignore reportPrivateUsage=none
self.qa, question, transformed_history
),
callback.done, # pyright: ignore reportPrivateUsage=none
)
)
@ -217,7 +224,7 @@ class OpenAIBrainPicking(BaseBrainPicking):
)
# Use the aiter method of the callback to stream the response with server-sent-events
async for token in callback.aiter():
async for token in callback.aiter(): # pyright: ignore reportPrivateUsage=none
logger.info("Token: %s", token)
# Add the token to the response_tokens list

View File

@ -2,15 +2,16 @@ from typing import Any, Dict, List, Optional
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from llm.models.FunctionCall import FunctionCall
from llm.models.OpenAiAnswer import OpenAiAnswer
from logger import get_logger
from models.chat import ChatHistory
from repository.chat.get_chat_history import get_chat_history
from repository.chat.update_chat_history import update_chat_history
from supabase import Client, create_client
from supabase.client import Client, create_client
from vectorstore.supabase import CustomSupabaseVectorStore
from llm.models.FunctionCall import FunctionCall
from llm.models.OpenAiAnswer import OpenAiAnswer
from .base import BaseBrainPicking
logger = get_logger(__name__)
@ -27,7 +28,10 @@ def format_answer(model_response: Dict[str, Any]) -> OpenAiAnswer:
answer["function_call"]["arguments"],
)
return OpenAiAnswer(content=content, function_call=function_call)
return OpenAiAnswer(
content=content,
function_call=function_call, # pyright: ignore reportPrivateUsage=none
)
class OpenAIFunctionsBrainPicking(BaseBrainPicking):
@ -48,7 +52,7 @@ class OpenAIFunctionsBrainPicking(BaseBrainPicking):
brain_id: str,
user_openai_api_key: str,
# TODO: add streaming
) -> "OpenAIFunctionsBrainPicking":
) -> "OpenAIFunctionsBrainPicking": # pyright: ignore reportPrivateUsage=none
super().__init__(
model=model,
chat_id=chat_id,
@ -61,11 +65,15 @@ class OpenAIFunctionsBrainPicking(BaseBrainPicking):
@property
def openai_client(self) -> ChatOpenAI:
return ChatOpenAI(openai_api_key=self.openai_api_key)
return ChatOpenAI(
openai_api_key=self.openai_api_key
) # pyright: ignore reportPrivateUsage=none
@property
def embeddings(self) -> OpenAIEmbeddings:
return OpenAIEmbeddings(openai_api_key=self.openai_api_key)
return OpenAIEmbeddings(
openai_api_key=self.openai_api_key
) # pyright: ignore reportPrivateUsage=none
@property
def supabase_client(self) -> Client:
@ -125,7 +133,9 @@ class OpenAIFunctionsBrainPicking(BaseBrainPicking):
"""
logger.info("Getting context")
return self.vector_store.similarity_search(query=question)
return self.vector_store.similarity_search(
query=question
) # pyright: ignore reportPrivateUsage=none
def _construct_prompt(
self, question: str, useContext: bool = False, useHistory: bool = False

View File

@ -21,7 +21,7 @@ class PrivateGPT4AllBrainPicking(BaseBrainPicking):
chat_id: str,
brain_id: str,
streaming: bool,
) -> "PrivateGPT4AllBrainPicking":
) -> "PrivateGPT4AllBrainPicking": # pyright: ignore reportPrivateUsage=none
"""
Initialize the PrivateBrainPicking class by calling the parent class's initializer.
:param brain_id: The brain_id in the DB.
@ -57,4 +57,4 @@ class PrivateGPT4AllBrainPicking(BaseBrainPicking):
n_batch=model_n_batch,
backend="gptj",
verbose=True,
)
) # pyright: ignore reportPrivateUsage=none

View File

@ -31,7 +31,7 @@ Summarize the following text:
{{/assistant~}}
""",
llm=summary_llm,
)
) # pyright: ignore reportPrivateUsage=none
summary = summary(document=document)
logger.info("Summarization: %s", summary)
@ -78,10 +78,12 @@ Summary
{{/assistant~}}
""",
llm=evaluation_llm,
)
) # pyright: ignore reportPrivateUsage=none
result = evaluation(question=question, summaries=summaries)
evaluations = {}
for evaluation in result["evaluation"].split("\n"):
for evaluation in result["evaluation"].split(
"\n"
): # pyright: ignore reportPrivateUsage=none
if evaluation == "" or not evaluation[0].isdigit():
continue
logger.info("Evaluation Row: %s", evaluation)

View File

@ -2,11 +2,12 @@ import os
from typing import Any, List, Optional
from uuid import UUID
from models.settings import CommonsDep, common_dependencies
from models.users import User
from pydantic import BaseModel
from utils.vectors import get_unique_files_from_vector_ids
from models.settings import CommonsDep, common_dependencies
from models.users import User
class Brain(BaseModel):
id: Optional[UUID] = None
@ -15,7 +16,7 @@ class Brain(BaseModel):
model: Optional[str] = "gpt-3.5-turbo-0613"
temperature: Optional[float] = 0.0
max_tokens: Optional[int] = 256
brain_size: Optional[float] = 0.0
brain_size: Optional[float] = 0.0 # pyright: ignore reportPrivateUsage=none
max_brain_size: Optional[int] = int(os.getenv("MAX_BRAIN_SIZE", 0))
files: List[Any] = []
_commons: Optional[CommonsDep] = None
@ -27,7 +28,7 @@ class Brain(BaseModel):
def commons(self) -> CommonsDep:
if not self._commons:
self.__class__._commons = common_dependencies()
return self._commons
return self._commons # pyright: ignore reportPrivateUsage=none
@property
def brain_size(self):
@ -39,12 +40,17 @@ class Brain(BaseModel):
@property
def remaining_brain_size(self):
return float(self.max_brain_size) - self.brain_size
return (
float(self.max_brain_size) # pyright: ignore reportPrivateUsage=none
- self.brain_size # pyright: ignore reportPrivateUsage=none
)
@classmethod
def create(cls, *args, **kwargs):
commons = common_dependencies()
return cls(commons=commons, *args, **kwargs)
return cls(
commons=commons, *args, **kwargs # pyright: ignore reportPrivateUsage=none
) # pyright: ignore reportPrivateUsage=none
def get_user_brains(self, user_id):
response = (

View File

@ -9,10 +9,18 @@ class Chat:
chat_name: str
def __init__(self, chat_dict: dict):
self.chat_id = chat_dict.get("chat_id")
self.user_id = chat_dict.get("user_id")
self.creation_time = chat_dict.get("creation_time")
self.chat_name = chat_dict.get("chat_name")
self.chat_id = chat_dict.get(
"chat_id"
) # pyright: ignore reportPrivateUsage=none
self.user_id = chat_dict.get(
"user_id"
) # pyright: ignore reportPrivateUsage=none
self.creation_time = chat_dict.get(
"creation_time"
) # pyright: ignore reportPrivateUsage=none
self.chat_name = chat_dict.get(
"chat_name"
) # pyright: ignore reportPrivateUsage=none
@dataclass
@ -24,11 +32,21 @@ class ChatHistory:
message_time: str
def __init__(self, chat_dict: dict):
self.chat_id = chat_dict.get("chat_id")
self.message_id = chat_dict.get("message_id")
self.user_message = chat_dict.get("user_message")
self.assistant = chat_dict.get("assistant")
self.message_time = chat_dict.get("message_time")
self.chat_id = chat_dict.get(
"chat_id"
) # pyright: ignore reportPrivateUsage=none
self.message_id = chat_dict.get(
"message_id"
) # pyright: ignore reportPrivateUsage=none
self.user_message = chat_dict.get(
"user_message"
) # pyright: ignore reportPrivateUsage=none
self.assistant = chat_dict.get(
"assistant"
) # pyright: ignore reportPrivateUsage=none
self.message_time = chat_dict.get(
"message_time"
) # pyright: ignore reportPrivateUsage=none
def to_dict(self):
return asdict(self)

View File

@ -6,11 +6,12 @@ from uuid import UUID
from fastapi import UploadFile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from logger import get_logger
from models.brains import Brain
from models.settings import CommonsDep, common_dependencies
from pydantic import BaseModel
from utils.file import compute_sha1_from_file
from models.brains import Brain
from models.settings import CommonsDep, common_dependencies
logger = get_logger(__name__)
@ -18,9 +19,9 @@ class File(BaseModel):
id: Optional[UUID] = None
file: Optional[UploadFile]
file_name: Optional[str] = ""
file_size: Optional[int] = ""
file_size: Optional[int] = "" # pyright: ignore reportPrivateUsage=none
file_sha1: Optional[str] = ""
vectors_ids: Optional[int] = []
vectors_ids: Optional[int] = [] # pyright: ignore reportPrivateUsage=none
file_extension: Optional[str] = ""
content: Optional[Any] = None
chunk_size: int = 500
@ -30,16 +31,25 @@ class File(BaseModel):
def __init__(self, **kwargs):
super().__init__(**kwargs)
if self.file:
self.file_name = self.file.filename
self.file_size = self.file.file._file.tell()
self.file_extension = os.path.splitext(self.file.filename)[-1].lower()
self.file_size = (
self.file.file._file.tell() # pyright: ignore reportPrivateUsage=none
)
self.file_extension = os.path.splitext(
self.file.filename # pyright: ignore reportPrivateUsage=none
)[-1].lower()
async def compute_file_sha1(self):
with tempfile.NamedTemporaryFile(delete=False, suffix=self.file.filename) as tmp_file:
await self.file.seek(0)
self.content = await self.file.read()
with tempfile.NamedTemporaryFile(
delete=False,
suffix=self.file.filename, # pyright: ignore reportPrivateUsage=none
) as tmp_file:
await self.file.seek(0) # pyright: ignore reportPrivateUsage=none
self.content = (
await self.file.read() # pyright: ignore reportPrivateUsage=none
)
tmp_file.write(self.content)
tmp_file.flush()
self.file_sha1 = compute_sha1_from_file(tmp_file.name)
@ -48,18 +58,21 @@ class File(BaseModel):
def compute_documents(self, loader_class):
logger.info(f"Computing documents from file {self.file_name}")
documents = []
with tempfile.NamedTemporaryFile(delete=False, suffix=self.file.filename) as tmp_file:
tmp_file.write(self.content)
with tempfile.NamedTemporaryFile(
delete=False,
suffix=self.file.filename, # pyright: ignore reportPrivateUsage=none
) as tmp_file:
tmp_file.write(self.content) # pyright: ignore reportPrivateUsage=none
tmp_file.flush()
loader = loader_class(tmp_file.name)
documents = loader.load()
print("documents", documents)
os.remove(tmp_file.name)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
)
@ -70,20 +83,21 @@ class File(BaseModel):
def set_file_vectors_ids(self):
"""
Set the vectors_ids property with the ids of the vectors
Set the vectors_ids property with the ids of the vectors
that are associated with the file in the vectors table
"""
commons = common_dependencies()
commons = common_dependencies()
response = (
commons["supabase"].table("vectors")
commons["supabase"]
.table("vectors")
.select("id")
.filter("metadata->>file_sha1", "eq", self.file_sha1)
.execute()
)
self.vectors_ids = response.data
return
def file_already_exists(self):
"""
Check if file already exists in vectors table
@ -92,20 +106,24 @@ class File(BaseModel):
print("file_sha1", self.file_sha1)
print("vectors_ids", self.vectors_ids)
print("len(vectors_ids)", len(self.vectors_ids))
print(
"len(vectors_ids)",
len(self.vectors_ids), # pyright: ignore reportPrivateUsage=none
)
# if the file does not exist in vectors then no need to go check in brains_vectors
if len(self.vectors_ids) == 0:
if len(self.vectors_ids) == 0: # pyright: ignore reportPrivateUsage=none
return False
return True
def file_already_exists_in_brain(self, brain_id):
commons = common_dependencies()
commons = common_dependencies()
self.set_file_vectors_ids()
# Check if file exists in that brain
response = (
commons["supabase"].table("brains_vectors")
commons["supabase"]
.table("brains_vectors")
.select("brain_id, vector_id")
.filter("brain_id", "eq", brain_id)
.filter("file_sha1", "eq", self.file_sha1)
@ -114,15 +132,17 @@ class File(BaseModel):
print("response.data", response.data)
if len(response.data) == 0:
return False
return True
def file_is_empty(self):
return self.file.file._file.tell() < 1
return (
self.file.file._file.tell() < 1 # pyright: ignore reportPrivateUsage=none
)
def link_file_to_brain(self, brain: Brain):
self.set_file_vectors_ids()
for vector_id in self.vectors_ids:
brain.create_brain_vector(vector_id['id'], self.file_sha1)
for vector_id in self.vectors_ids: # pyright: ignore reportPrivateUsage=none
brain.create_brain_vector(vector_id["id"], self.file_sha1)
print(f"Successfully linked file {self.file_sha1} to brain {brain.id}")

View File

@ -3,7 +3,7 @@ from typing import Annotated
from fastapi import Depends
from langchain.embeddings.openai import OpenAIEmbeddings
from pydantic import BaseSettings
from supabase import Client, create_client
from supabase.client import Client, create_client
from vectorstore.supabase import SupabaseVectorStore
@ -22,8 +22,10 @@ class LLMSettings(BaseSettings):
def common_dependencies() -> dict:
settings = BrainSettings()
embeddings = OpenAIEmbeddings(openai_api_key=settings.openai_api_key)
settings = BrainSettings() # pyright: ignore reportPrivateUsage=none
embeddings = OpenAIEmbeddings(
openai_api_key=settings.openai_api_key
) # pyright: ignore reportPrivateUsage=none
supabase_client: Client = create_client(
settings.supabase_url, settings.supabase_service_key
)

View File

@ -10,23 +10,33 @@ from models.settings import CommonsDep
from utils.file import compute_sha1_from_content
async def process_audio(commons: CommonsDep, file: File, enable_summarization: bool, user, user_openai_api_key):
async def process_audio(
commons: CommonsDep, # pyright: ignore reportPrivateUsage=none
file: File,
enable_summarization: bool,
user,
user_openai_api_key,
):
temp_filename = None
file_sha = ""
dateshort = time.strftime("%Y%m%d-%H%M%S")
file_meta_name = f"audiotranscript_{dateshort}.txt"
# use this for whisper
openai_api_key = os.environ.get("OPENAI_API_KEY")
os.environ.get("OPENAI_API_KEY")
if user_openai_api_key:
openai_api_key = user_openai_api_key
pass
try:
upload_file = file.file
with tempfile.NamedTemporaryFile(delete=False, suffix=upload_file.filename) as tmp_file:
await upload_file.seek(0)
content = await upload_file.read()
with tempfile.NamedTemporaryFile(
delete=False,
suffix=upload_file.filename, # pyright: ignore reportPrivateUsage=none
) as tmp_file:
await upload_file.seek(0) # pyright: ignore reportPrivateUsage=none
content = (
await upload_file.read() # pyright: ignore reportPrivateUsage=none
)
tmp_file.write(content)
tmp_file.flush()
tmp_file.close()
@ -36,21 +46,42 @@ async def process_audio(commons: CommonsDep, file: File, enable_summarization: b
with open(tmp_file.name, "rb") as audio_file:
transcript = openai.Audio.transcribe("whisper-1", audio_file)
file_sha = compute_sha1_from_content(transcript.text.encode("utf-8"))
file_size = len(transcript.text.encode("utf-8"))
file_sha = compute_sha1_from_content(
transcript.text.encode("utf-8") # pyright: ignore reportPrivateUsage=none
)
file_size = len(
transcript.text.encode("utf-8") # pyright: ignore reportPrivateUsage=none
)
chunk_size = 500
chunk_overlap = 0
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=chunk_size, chunk_overlap=chunk_overlap)
texts = text_splitter.split_text(transcript.text.encode("utf-8"))
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
texts = text_splitter.split_text(
transcript.text.encode("utf-8") # pyright: ignore reportPrivateUsage=none
)
docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha, "file_size": file_size, "file_name": file_meta_name,
"chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for text in texts]
docs_with_metadata = [
Document(
page_content=text,
metadata={
"file_sha1": file_sha,
"file_size": file_size,
"file_name": file_meta_name,
"chunk_size": chunk_size,
"chunk_overlap": chunk_overlap,
"date": dateshort,
},
)
for text in texts
]
commons.documents_vector_store.add_documents(docs_with_metadata)
commons.documents_vector_store.add_documents( # pyright: ignore reportPrivateUsage=none
docs_with_metadata
)
finally:
if temp_filename and os.path.exists(temp_filename):
os.remove(temp_filename)
os.remove(temp_filename)

View File

@ -19,7 +19,7 @@ async def process_file(
file.compute_documents(loader_class)
for doc in file.documents:
for doc in file.documents: # pyright: ignore reportPrivateUsage=none
metadata = {
"file_sha1": file.file_sha1,
"file_size": file.file_size,
@ -29,17 +29,15 @@ async def process_file(
"date": dateshort,
"summarization": "true" if enable_summarization else "false",
}
doc_with_metadata = Document(
page_content=doc.page_content, metadata=metadata)
doc_with_metadata = Document(page_content=doc.page_content, metadata=metadata)
neurons = Neurons(commons=commons)
created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
created_vector_id = created_vector[0]
created_vector_id = created_vector[0] # pyright: ignore reportPrivateUsage=none
brain = Brain(id=brain_id)
brain.create_brain_vector(created_vector_id, file.file_sha1)
return

View File

@ -11,7 +11,13 @@ from utils.file import compute_sha1_from_content
from utils.vectors import Neurons
async def process_github(commons: CommonsDep, repo, enable_summarization, brain_id, user_openai_api_key):
async def process_github(
commons: CommonsDep, # pyright: ignore reportPrivateUsage=none
repo,
enable_summarization,
brain_id,
user_openai_api_key,
):
random_dir_name = os.urandom(16).hex()
dateshort = time.strftime("%Y%m%d")
loader = GitLoader(
@ -24,41 +30,60 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, brain_
chunk_size = 500
chunk_overlap = 0
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
documents = text_splitter.split_documents(documents)
print(documents[:1])
for doc in documents:
if doc.metadata["file_type"] in [".pyc",".png",".svg", ".env", ".lock", ".gitignore", ".gitmodules", ".gitattributes", ".gitkeep", ".git", ".json"]:
if doc.metadata["file_type"] in [
".pyc",
".png",
".svg",
".env",
".lock",
".gitignore",
".gitmodules",
".gitattributes",
".gitkeep",
".git",
".json",
]:
continue
metadata = {
"file_sha1": compute_sha1_from_content(doc.page_content.encode("utf-8")),
"file_size": len(doc.page_content)*8,
"file_size": len(doc.page_content) * 8,
"file_name": doc.metadata["file_name"],
"chunk_size": chunk_size,
"chunk_overlap": chunk_overlap,
"date": dateshort,
"summarization": "true" if enable_summarization else "false"
"summarization": "true" if enable_summarization else "false",
}
doc_with_metadata = Document(
page_content=doc.page_content, metadata=metadata)
file = File(file_sha1=compute_sha1_from_content(doc.page_content.encode("utf-8")))
doc_with_metadata = Document(page_content=doc.page_content, metadata=metadata)
file = File(
file_sha1=compute_sha1_from_content(doc.page_content.encode("utf-8"))
)
file_exists = file.file_already_exists()
if not file_exists:
print(f"Creating entry for file {file.file_sha1} in vectors...")
neurons = Neurons(commons=commons)
created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
neurons = Neurons(commons=commons)
created_vector = neurons.create_vector(
doc_with_metadata, user_openai_api_key
)
print("Created vector sids ", created_vector)
print("Created vector for ", doc.metadata["file_name"])
file_exists_in_brain = file.file_already_exists_in_brain(brain_id)
if not file_exists_in_brain:
file.add_file_to_brain(brain_id)
file.add_file_to_brain(brain_id) # pyright: ignore reportPrivateUsage=none
brain = Brain(id=brain_id)
file.link_file_to_brain(brain)
return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}
return {
"message": f"✅ Github with {len(documents)} files has been uploaded.",
"type": "success",
}

View File

@ -0,0 +1,5 @@
{
"exclude": [
"supabase"
]
}

View File

@ -1,6 +1,7 @@
from typing import List # For type hinting
from models.chat import ChatHistory
from models.settings import common_dependencies
from typing import List # For type hinting
def get_chat_history(chat_id: str) -> List[ChatHistory]:
@ -16,4 +17,7 @@ def get_chat_history(chat_id: str) -> List[ChatHistory]:
if history is None:
return []
else:
return [ChatHistory(message) for message in history]
return [
ChatHistory(message) # pyright: ignore reportPrivateUsage=none
for message in history
]

View File

@ -21,7 +21,7 @@ def update_chat(chat_id, chat_data: ChatUpdatableProperties) -> Chat:
if not chat_id:
logger.error("No chat_id provided")
return
return # pyright: ignore reportPrivateUsage=none
updates = {}
@ -41,4 +41,4 @@ def update_chat(chat_id, chat_data: ChatUpdatableProperties) -> Chat:
logger.info(f"Chat {chat_id} updated")
else:
logger.info(f"No updates to apply for chat {chat_id}")
return updated_chat
return updated_chat # pyright: ignore reportPrivateUsage=none

View File

@ -23,4 +23,4 @@ def update_chat_history(chat_id: str, user_message: str, assistant: str) -> Chat
raise HTTPException(
status_code=500, detail="An exception occurred while updating chat history."
)
return ChatHistory(response[0])
return ChatHistory(response[0]) # pyright: ignore reportPrivateUsage=none

View File

@ -6,13 +6,15 @@ logger = get_logger(__name__)
def update_message_by_id(
message_id: str, user_message: str = None, assistant: str = None
message_id: str,
user_message: str = None, # pyright: ignore reportPrivateUsage=none
assistant: str = None, # pyright: ignore reportPrivateUsage=none
) -> ChatHistory:
commons = common_dependencies()
if not message_id:
logger.error("No message_id provided")
return
return # pyright: ignore reportPrivateUsage=none
updates = {}
@ -35,4 +37,4 @@ def update_message_by_id(
logger.info(f"Message {message_id} updated")
else:
logger.info(f"No updates to apply for message {message_id}")
return ChatHistory(updated_message)
return ChatHistory(updated_message) # pyright: ignore reportPrivateUsage=none

View File

@ -22,4 +22,5 @@ asyncpg==0.27.0
flake8==6.0.0
flake8-black==0.3.6
sentence_transformers>=2.0.0
sentry-sdk==1.26.0
sentry-sdk==1.26.0
pyright==1.1.316

View File

@ -142,24 +142,27 @@ async def create_brain_endpoint(
In the brains table & in the brains_users table and put the creator user as 'Owner'
"""
brain = Brain(name=brain.name)
brain = Brain(name=brain.name) # pyright: ignore reportPrivateUsage=none
brain.create_brain()
brain.create_brain() # pyright: ignore reportPrivateUsage=none
default_brain = get_default_user_brain(current_user)
if default_brain:
logger.info(f"Default brain already exists for user {current_user.id}")
brain.create_brain_user(
brain.create_brain_user( # pyright: ignore reportPrivateUsage=none
user_id=current_user.id, rights="Owner", default_brain=False
)
else:
logger.info(
f"Default brain does not exist for user {current_user.id}. It will be created."
)
brain.create_brain_user(
brain.create_brain_user( # pyright: ignore reportPrivateUsage=none
user_id=current_user.id, rights="Owner", default_brain=True
)
return {"id": brain.id, "name": brain.name}
return {
"id": brain.id, # pyright: ignore reportPrivateUsage=none
"name": brain.name,
}
# update existing brain
@ -182,10 +185,12 @@ async def update_brain_endpoint(
brain = Brain(id=brain_id)
# Add new file to brain, if file_sha1 already exists in brains_vectors -> out (not now)
if brain.file_sha1:
if brain.file_sha1: # pyright: ignore reportPrivateUsage=none
# add all the vector Ids to the brains_vectors with the given brain.brain_id
brain.update_brain_with_file(file_sha1=input_brain.file_sha1)
brain.update_brain_with_file(
file_sha1=input_brain.file_sha1 # pyright: ignore reportPrivateUsage=none
)
print("brain:", brain)
brain.update_brain_fields(commons, brain)
brain.update_brain_fields(commons, brain) # pyright: ignore reportPrivateUsage=none
return {"message": f"Brain {brain_id} has been updated."}

View File

@ -78,8 +78,8 @@ def check_user_limit(
user.increment_user_request_count(date)
if int(user.requests_count) >= int(max_requests_number):
raise HTTPException(
status_code=429,
detail="You have reached the maximum number of requests for today.",
status_code=429, # pyright: ignore reportPrivateUsage=none
detail="You have reached the maximum number of requests for today.", # pyright: ignore reportPrivateUsage=none
)
else:
pass
@ -97,7 +97,7 @@ async def get_chats(current_user: User = Depends(get_current_user)):
This endpoint retrieves all the chats associated with the current authenticated user. It returns a list of chat objects
containing the chat ID and chat name for each chat.
"""
chats = get_user_chats(current_user.id)
chats = get_user_chats(current_user.id) # pyright: ignore reportPrivateUsage=none
return {"chats": chats}
@ -127,10 +127,11 @@ async def update_chat_metadata_handler(
Update chat attributes
"""
chat = get_chat_by_id(chat_id)
chat = get_chat_by_id(chat_id) # pyright: ignore reportPrivateUsage=none
if current_user.id != chat.user_id:
raise HTTPException(
status_code=403, detail="You should be the owner of the chat to update it."
status_code=403, # pyright: ignore reportPrivateUsage=none
detail="You should be the owner of the chat to update it.", # pyright: ignore reportPrivateUsage=none
)
return update_chat(chat_id=chat_id, chat_data=chat_data)
@ -181,7 +182,7 @@ async def create_question_handler(
temperature=chat_question.temperature,
max_tokens=chat_question.max_tokens,
brain_id=str(brain_id),
user_openai_api_key=current_user.user_openai_api_key,
user_openai_api_key=current_user.user_openai_api_key, # pyright: ignore reportPrivateUsage=none
)
else:
@ -191,10 +192,12 @@ async def create_question_handler(
max_tokens=chat_question.max_tokens,
temperature=chat_question.temperature,
brain_id=str(brain_id),
user_openai_api_key=current_user.user_openai_api_key,
user_openai_api_key=current_user.user_openai_api_key, # pyright: ignore reportPrivateUsage=none
)
chat_answer = gpt_answer_generator.generate_answer(chat_question.question)
chat_answer = gpt_answer_generator.generate_answer( # pyright: ignore reportPrivateUsage=none
chat_question.question
)
return chat_answer
except HTTPException as e:
@ -217,7 +220,10 @@ async def create_stream_question_handler(
if chat_question.model not in streaming_compatible_models:
# Forward the request to the non-streaming endpoint
return await create_question_handler(
request, chat_question, chat_id, current_user
request,
chat_question,
chat_id,
current_user, # pyright: ignore reportPrivateUsage=none
)
try:
@ -238,12 +244,14 @@ async def create_stream_question_handler(
max_tokens=chat_question.max_tokens,
temperature=chat_question.temperature,
brain_id=str(brain_id),
user_openai_api_key=user_openai_api_key,
user_openai_api_key=user_openai_api_key, # pyright: ignore reportPrivateUsage=none
streaming=True,
)
return StreamingResponse(
gpt_answer_generator.generate_stream(chat_question.question),
gpt_answer_generator.generate_stream( # pyright: ignore reportPrivateUsage=none
chat_question.question
),
media_type="text/event-stream",
)
@ -259,4 +267,4 @@ async def get_chat_history_handler(
chat_id: UUID,
) -> List[ChatHistory]:
# TODO: RBAC with current_user
return get_chat_history(chat_id)
return get_chat_history(chat_id) # pyright: ignore reportPrivateUsage=none

View File

@ -35,7 +35,9 @@ async def crawl_endpoint(
commons = common_dependencies()
if request.headers.get("Openai-Api-Key"):
brain.max_brain_size = os.getenv("MAX_BRAIN_SIZE_WITH_KEY", 209715200)
brain.max_brain_size = os.getenv(
"MAX_BRAIN_SIZE_WITH_KEY", 209715200
) # pyright: ignore reportPrivateUsage=none
file_size = 1000000
remaining_free_space = brain.remaining_brain_size
@ -47,14 +49,20 @@ async def crawl_endpoint(
}
else:
if not crawl_website.checkGithub():
file_path, file_name = crawl_website.process()
(
file_path,
file_name,
) = crawl_website.process() # pyright: ignore reportPrivateUsage=none
# Create a SpooledTemporaryFile from the file_path
spooled_file = SpooledTemporaryFile()
with open(file_path, "rb") as f:
shutil.copyfileobj(f, spooled_file)
# Pass the SpooledTemporaryFile to UploadFile
uploadFile = UploadFile(file=spooled_file, filename=file_name)
uploadFile = UploadFile(
file=spooled_file, # pyright: ignore reportPrivateUsage=none
filename=file_name,
)
file = File(file=uploadFile)
# check remaining free space here !!
message = await filter_file(

View File

@ -42,7 +42,9 @@ async def upload_file(
commons = common_dependencies()
if request.headers.get("Openai-Api-Key"):
brain.max_brain_size = os.getenv("MAX_BRAIN_SIZE_WITH_KEY", 209715200)
brain.max_brain_size = os.getenv(
"MAX_BRAIN_SIZE_WITH_KEY", 209715200
) # pyright: ignore reportPrivateUsage=none
remaining_free_space = brain.remaining_brain_size
file_size = get_file_size(uploadFile)

View File

@ -10,6 +10,9 @@ client = TestClient(app)
API_KEY = os.getenv("CI_TEST_API_KEY")
if not API_KEY:
raise ValueError("CI_TEST_API_KEY environment variable not set. Cannot run tests.")
def test_read_main():
response = client.get("/")
@ -54,7 +57,8 @@ def test_create_and_delete_api_key():
def test_retrieve_default_brain():
# Making a GET request to the /brains/default/ endpoint
response = client.get(
"/brains/default/", headers={"Authorization": "Bearer " + API_KEY}
"/brains/default/",
headers={"Authorization": "Bearer " + API_KEY},
)
# Assert that the response status code is 200 (HTTP OK)
@ -88,7 +92,9 @@ def test_create_brain():
# Making a POST request to the /brains/ endpoint
response = client.post(
"/brains/", json=payload, headers={"Authorization": "Bearer " + API_KEY}
"/brains/",
json=payload,
headers={"Authorization": "Bearer " + API_KEY},
)
# Assert that the response status code is 200 (HTTP OK)
@ -106,7 +112,10 @@ def test_create_brain():
def test_retrieve_all_brains():
# Making a GET request to the /brains/ endpoint to retrieve all brains for the current user
response = client.get("/brains/", headers={"Authorization": "Bearer " + API_KEY})
response = client.get(
"/brains/",
headers={"Authorization": "Bearer " + API_KEY},
)
# Assert that the response status code is 200 (HTTP OK)
assert response.status_code == 200
@ -120,7 +129,10 @@ def test_retrieve_all_brains():
def test_delete_all_brains():
# First, retrieve all brains for the current user
response = client.get("/brains/", headers={"Authorization": "Bearer " + API_KEY})
response = client.get(
"/brains/",
headers={"Authorization": "Bearer " + API_KEY},
)
# Assert that the response status code is 200 (HTTP OK)
assert response.status_code == 200
@ -133,7 +145,8 @@ def test_delete_all_brains():
# Send a DELETE request to delete the specific brain
delete_response = client.delete(
f"/brains/{brain_id}/", headers={"Authorization": "Bearer " + API_KEY}
f"/brains/{brain_id}/",
headers={"Authorization": "Bearer " + API_KEY},
)
# Assert that the DELETE response status code is 200 (HTTP OK)
@ -142,7 +155,10 @@ def test_delete_all_brains():
def test_get_all_chats():
# Making a GET request to the /chat endpoint to retrieve all chats
response = client.get("/chat", headers={"Authorization": "Bearer " + API_KEY})
response = client.get(
"/chat",
headers={"Authorization": "Bearer " + API_KEY},
)
# Assert that the response status code is 200 (HTTP OK)
assert response.status_code == 200

View File

@ -18,8 +18,10 @@ def convert_bytes(bytes, precision=2):
def get_file_size(file: UploadFile):
# move the cursor to the end of the file
file.file._file.seek(0, 2)
file_size = file.file._file.tell() # Getting the size of the file
file.file._file.seek(0, 2) # pyright: ignore reportPrivateUsage=none
file_size = (
file.file._file.tell() # pyright: ignore reportPrivateUsage=none
) # Getting the size of the file
# move the cursor back to the beginning of the file
file.file.seek(0)

View File

@ -54,14 +54,19 @@ async def filter_file(
if file_exists_in_brain:
return create_response(
f"🤔 {file.file.filename} already exists in brain {brain_id}.", "warning"
f"🤔 {file.file.filename} already exists in brain {brain_id}.", # pyright: ignore reportPrivateUsage=none
"warning",
)
elif file.file_is_empty():
return create_response(f"{file.file.filename} is empty.", "error")
return create_response(
f"{file.file.filename} is empty.", # pyright: ignore reportPrivateUsage=none
"error", # pyright: ignore reportPrivateUsage=none
)
elif file_exists:
file.link_file_to_brain(brain=Brain(id=brain_id))
return create_response(
f"{file.file.filename} has been uploaded to brain {brain_id}.", "success"
f"{file.file.filename} has been uploaded to brain {brain_id}.", # pyright: ignore reportPrivateUsage=none
"success",
)
if file.file_extension in file_processors:
@ -70,14 +75,18 @@ async def filter_file(
commons, file, enable_summarization, brain_id, openai_api_key
)
return create_response(
f"{file.file.filename} has been uploaded to brain {brain_id}.",
f"{file.file.filename} has been uploaded to brain {brain_id}.", # pyright: ignore reportPrivateUsage=none
"success",
)
except Exception as e:
# Add more specific exceptions as needed.
print(f"Error processing file: {e}")
return create_response(
f"⚠️ An error occurred while processing {file.file.filename}.", "error"
f"⚠️ An error occurred while processing {file.file.filename}.", # pyright: ignore reportPrivateUsage=none
"error",
)
return create_response(f"{file.file.filename} is not supported.", "error")
return create_response(
f"{file.file.filename} is not supported.", # pyright: ignore reportPrivateUsage=none
"error",
)

View File

@ -13,7 +13,7 @@ logger = get_logger(__name__)
class Neurons(BaseModel):
commons: CommonsDep
settings = BrainSettings()
settings = BrainSettings() # pyright: ignore reportPrivateUsage=none
def create_vector(self, doc, user_openai_api_key=None):
logger.info("Creating vector for document")
@ -21,7 +21,7 @@ class Neurons(BaseModel):
if user_openai_api_key:
self.commons["documents_vector_store"]._embedding = OpenAIEmbeddings(
openai_api_key=user_openai_api_key
)
) # pyright: ignore reportPrivateUsage=none
try:
sids = self.commons["documents_vector_store"].add_documents([doc])
if sids and len(sids) > 0:
@ -64,7 +64,7 @@ def create_summary(commons: CommonsDep, document_id, content, metadata):
def error_callback(exception):
print('An exception occurred:', exception)
print("An exception occurred:", exception)
def process_batch(batch_ids):
@ -106,14 +106,14 @@ def get_unique_files_from_vector_ids(vectors_ids: List[int]):
with ThreadPoolExecutor() as executor:
futures = []
for i in range(0, len(vectors_ids), BATCH_SIZE):
batch_ids = vectors_ids[i:i + BATCH_SIZE]
batch_ids = vectors_ids[i : i + BATCH_SIZE]
future = executor.submit(process_batch, batch_ids)
futures.append(future)
# Retrieve the results
vectors_responses = [future.result() for future in futures]
documents = [item for sublist in vectors_responses for item in sublist]
print('document', documents)
print("document", documents)
unique_files = [dict(t) for t in set(tuple(d.items()) for d in documents)]
return unique_files

View File

@ -3,8 +3,7 @@ from typing import Any, List
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import SupabaseVectorStore
from supabase import Client
from supabase.client import Client
class CustomSupabaseVectorStore(SupabaseVectorStore):