From 8fbb4b2d914b01df1b5bc455538a24ff82ad34bc Mon Sep 17 00:00:00 2001 From: Matt <77928207+mattzcarey@users.noreply.github.com> Date: Tue, 11 Jul 2023 19:15:56 +0100 Subject: [PATCH] fix: gpt4all (#595) * fix: gpt4all * fix: pyright * Update backend/llm/openai.py * fix: remove backend tag * fix: typing * feat: qa_base class * fix: pyright * fix: model_path not found --- .backend_env.example | 4 +- backend/llm/__init__.py | 2 + backend/llm/openai.py | 197 +---------------------- backend/llm/openai_functions.py | 9 +- backend/llm/private_gpt4all.py | 57 ++++--- backend/llm/qa_base.py | 228 +++++++++++++++++++++++++++ backend/models/settings.py | 4 +- backend/requirements.txt | 2 +- backend/routes/chat_routes.py | 9 +- docs/docs/backend/llm/private-llm.md | 8 +- 10 files changed, 290 insertions(+), 230 deletions(-) create mode 100644 backend/llm/qa_base.py diff --git a/.backend_env.example b/.backend_env.example index 5b8d37ced..2cccdcfb3 100644 --- a/.backend_env.example +++ b/.backend_env.example @@ -12,8 +12,6 @@ MAX_REQUESTS_NUMBER=200 #Private LLM Variables PRIVATE=False MODEL_PATH=./local_models/ggml-gpt4all-j-v1.3-groovy.bin -MODEL_N_CTX=1000 -MODEL_N_BATCH=8 #RESEND -RESEND_API_KEY= \ No newline at end of file +RESEND_API_KEY= diff --git a/backend/llm/__init__.py b/backend/llm/__init__.py index e73676c15..67c47cf15 100644 --- a/backend/llm/__init__.py +++ b/backend/llm/__init__.py @@ -1,10 +1,12 @@ from .base import BaseBrainPicking +from .qa_base import QABaseBrainPicking from .openai import OpenAIBrainPicking from .openai_functions import OpenAIFunctionsBrainPicking from .private_gpt4all import PrivateGPT4AllBrainPicking __all__ = [ "BaseBrainPicking", + "QABaseBrainPicking", "OpenAIBrainPicking", "OpenAIFunctionsBrainPicking", "PrivateGPT4AllBrainPicking", diff --git a/backend/llm/openai.py b/backend/llm/openai.py index 214dce4fd..31bf50301 100644 --- a/backend/llm/openai.py +++ b/backend/llm/openai.py @@ -1,31 +1,13 @@ -import asyncio -import json -from typing import AsyncIterable, Awaitable - -from langchain.chains import ConversationalRetrievalChain, LLMChain -from langchain.chains.question_answering import load_qa_chain from langchain.chat_models import ChatOpenAI from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.llms.base import LLM +from langchain.llms.base import BaseLLM +from llm.qa_base import QABaseBrainPicking from logger import get_logger -from models.chat import ChatHistory -from repository.chat.format_chat_history import format_chat_history -from repository.chat.get_chat_history import get_chat_history -from repository.chat.update_chat_history import update_chat_history -from repository.chat.update_message_by_id import update_message_by_id -from supabase.client import Client, create_client -from vectorstore.supabase import ( - CustomSupabaseVectorStore, -) - -# Custom class for handling vector storage with Supabase -from .base import BaseBrainPicking -from .prompts.CONDENSE_PROMPT import CONDENSE_QUESTION_PROMPT logger = get_logger(__name__) -class OpenAIBrainPicking(BaseBrainPicking): +class OpenAIBrainPicking(QABaseBrainPicking): """ Main class for the OpenAI Brain Picking functionality. It allows to initialize a Chat model, generate questions and retrieve answers using ConversationalRetrievalChain. 
@@ -64,182 +46,17 @@ class OpenAIBrainPicking(BaseBrainPicking): openai_api_key=self.openai_api_key ) # pyright: ignore reportPrivateUsage=none - @property - def supabase_client(self) -> Client: - return create_client( - self.brain_settings.supabase_url, self.brain_settings.supabase_service_key - ) - - @property - def vector_store(self) -> CustomSupabaseVectorStore: - return CustomSupabaseVectorStore( - self.supabase_client, - self.embeddings, - table_name="vectors", - brain_id=self.brain_id, - ) - - @property - def question_llm(self) -> LLM: - return self._create_llm(model=self.model, streaming=False) - - @property - def doc_llm(self) -> LLM: - return self._create_llm( - model=self.model, streaming=self.streaming, callbacks=self.callbacks - ) - - @property - def question_generator(self) -> LLMChain: - return LLMChain(llm=self.question_llm, prompt=CONDENSE_QUESTION_PROMPT) - - @property - def doc_chain(self) -> LLMChain: - return load_qa_chain( - llm=self.doc_llm, chain_type="stuff" - ) # pyright: ignore reportPrivateUsage=none - - @property - def qa(self) -> ConversationalRetrievalChain: - return ConversationalRetrievalChain( - retriever=self.vector_store.as_retriever(), - question_generator=self.question_generator, - combine_docs_chain=self.doc_chain, # pyright: ignore reportPrivateUsage=none - verbose=True, - ) - - def _create_llm(self, model, streaming=False, callbacks=None) -> LLM: + def _create_llm(self, model, streaming=False, callbacks=None) -> BaseLLM: """ Determine the language model to be used. :param model: Language model name to be used. - :param private_model_args: Dictionary containing model_path, n_ctx and n_batch. - :param private: Boolean value to determine if private model is to be used. + :param streaming: Whether to enable streaming of the model + :param callbacks: Callbacks to be used for streaming :return: Language model instance """ return ChatOpenAI( - temperature=0, + temperature=self.temperature, model=model, streaming=streaming, callbacks=callbacks, ) # pyright: ignore reportPrivateUsage=none - - def _call_chain(self, chain, question, history): - """ - Call a chain with a given question and history. - :param chain: The chain eg QA (ConversationalRetrievalChain) - :param question: The user prompt - :param history: The chat history from DB - :return: The answer. - """ - return chain( - { - "question": question, - "chat_history": history, - } - ) - - def generate_answer(self, question: str) -> ChatHistory: - """ - Generate an answer to a given question by interacting with the language model. - :param question: The question - :return: The generated answer. - """ - transformed_history = [] - - # Get the history from the database - history = get_chat_history(self.chat_id) - - # Format the chat history into a list of tuples (human, ai) - transformed_history = format_chat_history(history) - - # Generate the model response using the QA chain - model_response = self._call_chain(self.qa, question, transformed_history) - - answer = model_response["answer"] - - # Update chat history - chat_answer = update_chat_history( - chat_id=self.chat_id, - user_message=question, - assistant=answer, - ) - - return chat_answer - - async def _acall_chain(self, chain, question, history): - """ - Call a chain with a given question and history. - :param chain: The chain eg QA (ConversationalRetrievalChain) - :param question: The user prompt - :param history: The chat history from DB - :return: The answer. 
- """ - return chain.acall( - { - "question": question, - "chat_history": history, - } - ) - - async def generate_stream(self, question: str) -> AsyncIterable: - """ - Generate a streaming answer to a given question by interacting with the language model. - :param question: The question - :return: An async iterable which generates the answer. - """ - - history = get_chat_history(self.chat_id) - callback = self.callbacks[0] - - transformed_history = [] - - # Format the chat history into a list of tuples (human, ai) - transformed_history = format_chat_history(history) - - # Initialize a list to hold the tokens - response_tokens = [] - - # Wrap an awaitable with a event to signal when it's done or an exception is raised. - async def wrap_done(fn: Awaitable, event: asyncio.Event): - try: - await fn - except Exception as e: - logger.error(f"Caught exception: {e}") - finally: - event.set() - - task = asyncio.create_task( - wrap_done( - self.qa._acall_chain( # pyright: ignore reportPrivateUsage=none - self.qa, question, transformed_history - ), - callback.done, # pyright: ignore reportPrivateUsage=none - ) - ) - - streamed_chat_history = update_chat_history( - chat_id=self.chat_id, - user_message=question, - assistant="", - ) - - # Use the aiter method of the callback to stream the response with server-sent-events - async for token in callback.aiter(): # pyright: ignore reportPrivateUsage=none - logger.info("Token: %s", token) - - # Add the token to the response_tokens list - response_tokens.append(token) - streamed_chat_history.assistant = token - - yield f"data: {json.dumps(streamed_chat_history.to_dict())}" - - await task - - # Join the tokens to create the assistant's response - assistant = "".join(response_tokens) - - update_message_by_id( - message_id=streamed_chat_history.message_id, - user_message=question, - assistant=assistant, - ) diff --git a/backend/llm/openai_functions.py b/backend/llm/openai_functions.py index 08001ed57..88187ffcf 100644 --- a/backend/llm/openai_functions.py +++ b/backend/llm/openai_functions.py @@ -2,6 +2,8 @@ from typing import Any, Dict, List, Optional from langchain.chat_models import ChatOpenAI from langchain.embeddings.openai import OpenAIEmbeddings +from llm.models.FunctionCall import FunctionCall +from llm.models.OpenAiAnswer import OpenAiAnswer from logger import get_logger from models.chat import ChatHistory from repository.chat.get_chat_history import get_chat_history @@ -9,9 +11,6 @@ from repository.chat.update_chat_history import update_chat_history from supabase.client import Client, create_client from vectorstore.supabase import CustomSupabaseVectorStore -from llm.models.FunctionCall import FunctionCall -from llm.models.OpenAiAnswer import OpenAiAnswer - from .base import BaseBrainPicking logger = get_logger(__name__) @@ -148,8 +147,8 @@ class OpenAIFunctionsBrainPicking(BaseBrainPicking): { "role": "system", "content": """Your name is Quivr. You are an assistant that has access to a person's documents and that can answer questions about them. - A person will ask you a question and you will provide a helpful answer. - Write the answer in the same language as the question. + A person will ask you a question and you will provide a helpful answer. + Write the answer in the same language as the question. You have access to functions to help you answer the question. 
If you don't know the answer, just say that you don't know but be helpful and explain why you can't answer""", } diff --git a/backend/llm/private_gpt4all.py b/backend/llm/private_gpt4all.py index 655960612..9d1fb15d7 100644 --- a/backend/llm/private_gpt4all.py +++ b/backend/llm/private_gpt4all.py @@ -1,60 +1,73 @@ -from langchain.llms.base import LLM +from typing import Optional +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.llms.base import BaseLLM from langchain.llms.gpt4all import GPT4All +from llm.qa_base import QABaseBrainPicking from logger import get_logger -from models.settings import LLMSettings - -from .base import BaseBrainPicking logger = get_logger(__name__) -class PrivateGPT4AllBrainPicking(BaseBrainPicking): +class PrivateGPT4AllBrainPicking(QABaseBrainPicking): """ This subclass of BrainPicking is used to specifically work with the private language model GPT4All. """ - # Initialize class settings - llm_settings = LLMSettings() + # Define the default model path + model_path: str = "./local_models/ggml-gpt4all-j-v1.3-groovy.bin" def __init__( self, chat_id: str, brain_id: str, + user_openai_api_key: Optional[str], streaming: bool, - ) -> "PrivateGPT4AllBrainPicking": # pyright: ignore reportPrivateUsage=none + model_path: str, + ) -> None: """ Initialize the PrivateBrainPicking class by calling the parent class's initializer. :param brain_id: The brain_id in the DB. :param chat_id: The id of the chat in the DB. :param streaming: Whether to enable streaming of the model - :return: PrivateBrainPicking instance + :param model_path: The path to the model. If not provided, a default path is used. """ - # set defaults - model = "gpt4all-j-1.3" - super().__init__( - model=model, + model="gpt4all-j-1.3", brain_id=brain_id, chat_id=chat_id, + user_openai_api_key=user_openai_api_key, streaming=streaming, ) - def _create_llm(self) -> LLM: + # Set the model path + self.model_path = model_path + + # TODO: Use private embeddings model. This involves some restructuring of how we store the embeddings. + @property + def embeddings(self) -> OpenAIEmbeddings: + return OpenAIEmbeddings( + openai_api_key=self.openai_api_key + ) # pyright: ignore reportPrivateUsage=none + + def _create_llm( + self, + model, + streaming=False, + callbacks=None, + ) -> BaseLLM: """ Override the _create_llm method to enforce the use of a private model. + :param model: Language model name to be used. 
+ :param streaming: Whether to enable streaming of the model + :param callbacks: Callbacks to be used for streaming :return: Language model instance """ - model_path = self.llm_settings.model_path - model_n_ctx = self.llm_settings.model_n_ctx - model_n_batch = self.llm_settings.model_n_batch + model_path = self.model_path - logger.info("Using private model: %s", model_path) + logger.info("Using private model: %s", model) + logger.info("Streaming is set to %s", streaming) return GPT4All( model=model_path, - n_ctx=model_n_ctx, - n_batch=model_n_batch, - backend="gptj", - verbose=True, ) # pyright: ignore reportPrivateUsage=none diff --git a/backend/llm/qa_base.py b/backend/llm/qa_base.py new file mode 100644 index 000000000..9752af44d --- /dev/null +++ b/backend/llm/qa_base.py @@ -0,0 +1,228 @@ +import asyncio +import json +from abc import abstractmethod, abstractproperty +from typing import AsyncIterable, Awaitable + +from langchain.chains import ConversationalRetrievalChain, LLMChain +from langchain.chains.question_answering import load_qa_chain +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.llms.base import BaseLLM +from logger import get_logger +from models.chat import ChatHistory +from repository.chat.format_chat_history import format_chat_history +from repository.chat.get_chat_history import get_chat_history +from repository.chat.update_chat_history import update_chat_history +from repository.chat.update_message_by_id import update_message_by_id +from supabase.client import Client, create_client +from vectorstore.supabase import CustomSupabaseVectorStore + +from .base import BaseBrainPicking +from .prompts.CONDENSE_PROMPT import CONDENSE_QUESTION_PROMPT + +logger = get_logger(__name__) + + +class QABaseBrainPicking(BaseBrainPicking): + """ + Base class for the Brain Picking functionality using the Conversational Retrieval Chain (QA) from Langchain. + It is not designed to be used directly, but to be subclassed by other classes which use the QA chain. + """ + + def __init__( + self, + model: str, + brain_id: str, + chat_id: str, + streaming: bool = False, + **kwargs, + ) -> "QABaseBrainPicking": # pyright: ignore reportPrivateUsage=none + """ + Initialize the QA BrainPicking class by setting embeddings, supabase client, vector store, language model and chains. 
+ :return: QABrainPicking instance + """ + super().__init__( + model=model, + brain_id=brain_id, + chat_id=chat_id, + streaming=streaming, + **kwargs, + ) + + @abstractproperty + def embeddings(self) -> OpenAIEmbeddings: + raise NotImplementedError("This property should be overridden in a subclass.") + + @property + def supabase_client(self) -> Client: + return create_client( + self.brain_settings.supabase_url, self.brain_settings.supabase_service_key + ) + + @property + def vector_store(self) -> CustomSupabaseVectorStore: + return CustomSupabaseVectorStore( + self.supabase_client, + self.embeddings, + table_name="vectors", + brain_id=self.brain_id, + ) + + @property + def question_llm(self): + return self._create_llm(model=self.model, streaming=False) + + @property + def doc_llm(self): + return self._create_llm( + model=self.model, streaming=self.streaming, callbacks=self.callbacks + ) + + @property + def question_generator(self) -> LLMChain: + return LLMChain(llm=self.question_llm, prompt=CONDENSE_QUESTION_PROMPT) + + @property + def doc_chain(self) -> LLMChain: + return load_qa_chain( + llm=self.doc_llm, chain_type="stuff" + ) # pyright: ignore reportPrivateUsage=none + + @property + def qa(self) -> ConversationalRetrievalChain: + return ConversationalRetrievalChain( + retriever=self.vector_store.as_retriever(), + question_generator=self.question_generator, + combine_docs_chain=self.doc_chain, # pyright: ignore reportPrivateUsage=none + verbose=True, + ) + + @abstractmethod + def _create_llm(self, model, streaming=False, callbacks=None) -> BaseLLM: + """ + Determine the language model to be used. + :param model: Language model name to be used. + :param streaming: Whether to enable streaming of the model + :param callbacks: Callbacks to be used for streaming + :return: Language model instance + """ + + def _call_chain(self, chain, question, history): + """ + Call a chain with a given question and history. + :param chain: The chain eg QA (ConversationalRetrievalChain) + :param question: The user prompt + :param history: The chat history from DB + :return: The answer. + """ + return chain( + { + "question": question, + "chat_history": history, + } + ) + + def generate_answer(self, question: str) -> ChatHistory: + """ + Generate an answer to a given question by interacting with the language model. + :param question: The question + :return: The generated answer. + """ + transformed_history = [] + + # Get the history from the database + history = get_chat_history(self.chat_id) + + # Format the chat history into a list of tuples (human, ai) + transformed_history = format_chat_history(history) + + # Generate the model response using the QA chain + model_response = self._call_chain(self.qa, question, transformed_history) + + answer = model_response["answer"] + + # Update chat history + chat_answer = update_chat_history( + chat_id=self.chat_id, + user_message=question, + assistant=answer, + ) + + return chat_answer + + async def _acall_chain(self, chain, question, history): + """ + Call a chain with a given question and history. + :param chain: The chain eg QA (ConversationalRetrievalChain) + :param question: The user prompt + :param history: The chat history from DB + :return: The answer. + """ + return chain.acall( + { + "question": question, + "chat_history": history, + } + ) + + async def generate_stream(self, question: str) -> AsyncIterable: + """ + Generate a streaming answer to a given question by interacting with the language model. 
+ :param question: The question + :return: An async iterable which generates the answer. + """ + + history = get_chat_history(self.chat_id) + callback = self.callbacks[0] + + transformed_history = [] + + # Format the chat history into a list of tuples (human, ai) + transformed_history = format_chat_history(history) + + # Initialize a list to hold the tokens + response_tokens = [] + + # Wrap an awaitable with a event to signal when it's done or an exception is raised. + async def wrap_done(fn: Awaitable, event: asyncio.Event): + try: + await fn + except Exception as e: + logger.error(f"Caught exception: {e}") + finally: + event.set() + + task = asyncio.create_task( + wrap_done( + self.qa._acall_chain( # pyright: ignore reportPrivateUsage=none + self.qa, question, transformed_history + ), + callback.done, # pyright: ignore reportPrivateUsage=none + ) + ) + + streamed_chat_history = update_chat_history( + chat_id=self.chat_id, + user_message=question, + assistant="", + ) + + # Use the aiter method of the callback to stream the response with server-sent-events + async for token in callback.aiter(): # pyright: ignore reportPrivateUsage=none + logger.info("Token: %s", token) + + # Add the token to the response_tokens list + response_tokens.append(token) + streamed_chat_history.assistant = token + + yield f"data: {json.dumps(streamed_chat_history.to_dict())}" + + await task + + # Join the tokens to create the assistant's response + assistant = "".join(response_tokens) + + update_message_by_id( + message_id=streamed_chat_history.message_id, + user_message=question, + assistant=assistant, + ) diff --git a/backend/models/settings.py b/backend/models/settings.py index 8ce45cb4d..68b15662e 100644 --- a/backend/models/settings.py +++ b/backend/models/settings.py @@ -16,9 +16,7 @@ class BrainSettings(BaseSettings): class LLMSettings(BaseSettings): private: bool = False - model_path: str = "gpt2" - model_n_ctx: int = 1000 - model_n_batch: int = 8 + model_path: str = "./local_models/ggml-gpt4all-j-v1.3-groovy.bin" def common_dependencies() -> dict: diff --git a/backend/requirements.txt b/backend/requirements.txt index f0ec3a582..38a7f24ad 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,5 +1,5 @@ pymupdf==1.22.3 -langchain==0.0.207 +langchain==0.0.228 Markdown==3.4.3 openai==0.27.6 pdf2image==1.16.3 diff --git a/backend/routes/chat_routes.py b/backend/routes/chat_routes.py index af9b9edf5..40246bf76 100644 --- a/backend/routes/chat_routes.py +++ b/backend/routes/chat_routes.py @@ -172,7 +172,9 @@ async def create_question_handler( gpt_answer_generator = PrivateGPT4AllBrainPicking( chat_id=str(chat_id), brain_id=str(brain_id), + user_openai_api_key=current_user.user_openai_api_key, streaming=False, + model_path=llm_settings.model_path, ) elif chat_question.model in openai_function_compatible_models: @@ -228,6 +230,7 @@ async def create_stream_question_handler( try: user_openai_api_key = request.headers.get("Openai-Api-Key") + streaming = True check_user_limit(current_user) llm_settings = LLMSettings() @@ -235,7 +238,9 @@ async def create_stream_question_handler( gpt_answer_generator = PrivateGPT4AllBrainPicking( chat_id=str(chat_id), brain_id=str(brain_id), - streaming=False, + user_openai_api_key=user_openai_api_key, + streaming=streaming, + model_path=llm_settings.model_path, ) else: gpt_answer_generator = OpenAIBrainPicking( @@ -245,7 +250,7 @@ async def create_stream_question_handler( temperature=chat_question.temperature, brain_id=str(brain_id), 
            user_openai_api_key=user_openai_api_key,  # pyright: ignore reportPrivateUsage=none
-            streaming=True,
+            streaming=streaming,
         )
 
     return StreamingResponse(
diff --git a/docs/docs/backend/llm/private-llm.md b/docs/docs/backend/llm/private-llm.md
index 5283673ff..f4d32ed51 100644
--- a/docs/docs/backend/llm/private-llm.md
+++ b/docs/docs/backend/llm/private-llm.md
@@ -4,20 +4,20 @@ sidebar_position: 1
 
 # Private LLM
 
-Quivr now has the capability to use a private LLM model powered by GPT4All (other open source models coming soon).
+Quivr now has the capability to use a private LLM model powered by GPT4All (other open source models coming soon). This is similar to the functionality provided by the PrivateGPT project.
 
 This means that your data never leaves the server. The LLM is downloaded to the server and runs inference on your question locally.
 
 ## How to use
+
 Set the 'private' flag to True in the /backend/.env file. You can also set other model parameters in the .env file.
 
-Download the GPT4All model from [here](
-  https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) and place it in the /backend/local_models folder.
+Download the GPT4All model from [here](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) and place it in the /backend/local_models folder. Or you can download any model from their ecosystem on their [website](https://gpt4all.io/index.html).
 
 ## Future Plans
+
 We are planning to add more models to the private LLM feature. We are also planning on using a local embedding model from Hugging Face to reduce our reliance on OpenAI's API.
 
 We will also be adding the ability to use a private LLM model from the frontend and API. Currently it is only available if you self-host the backend.
-
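
For reference, a minimal sketch of how the reworked `PrivateGPT4AllBrainPicking` is expected to be constructed after this patch, mirroring the updated call sites in `backend/routes/chat_routes.py`. The standalone-script framing and the placeholder ids are illustrative assumptions, not part of the change; in the API routes the ids come from the route parameters, the OpenAI key from the request/user context, and a Supabase instance plus the downloaded GPT4All model file are required for the chain to actually run.

```python
from llm import PrivateGPT4AllBrainPicking
from models.settings import LLMSettings

# Reads PRIVATE and MODEL_PATH from the backend .env
# (MODEL_PATH defaults to ./local_models/ggml-gpt4all-j-v1.3-groovy.bin).
llm_settings = LLMSettings()

# Placeholder ids for illustration; the routes pass str(chat_id) / str(brain_id).
gpt_answer_generator = PrivateGPT4AllBrainPicking(
    chat_id="00000000-0000-0000-0000-000000000000",
    brain_id="00000000-0000-0000-0000-000000000000",
    user_openai_api_key=None,  # still used for OpenAI embeddings (see the TODO in the class)
    streaming=False,           # generate_stream() is used instead when streaming=True
    model_path=llm_settings.model_path,
)

# Runs the ConversationalRetrievalChain from QABaseBrainPicking against the local GPT4All model.
chat_answer = gpt_answer_generator.generate_answer("What do my documents say about X?")
```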