Feat/local llm bug fix (#1758)

# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
This commit is contained in:
Stan Girard 2023-11-29 19:17:16 +01:00 committed by GitHub
parent c6d45669b3
commit e1cde0fcb4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 146 additions and 232 deletions

View File

@ -10,6 +10,9 @@ GOOGLE_CLOUD_PROJECT=<change-me>
CELERY_BROKER_URL=redis://redis:6379/0 CELERY_BROKER_URL=redis://redis:6379/0
CELEBRY_BROKER_QUEUE_NAME=quivr-preview.fifo CELEBRY_BROKER_QUEUE_NAME=quivr-preview.fifo
#LOCAL
#OLLAMA_API_BASE_URL=http://host.docker.internal:11434 # local all in one remove comment to use local llm with Ollama
#RESEND #RESEND

View File

@ -3,6 +3,7 @@ from typing import Optional
from uuid import UUID from uuid import UUID
from fastapi import HTTPException from fastapi import HTTPException
from logger import get_logger
from litellm import completion from litellm import completion
from models.chats import ChatQuestion from models.chats import ChatQuestion
from models.databases.supabase.chats import CreateChatHistory from models.databases.supabase.chats import CreateChatHistory
@ -17,6 +18,7 @@ from llm.utils.get_api_brain_definition_as_json_schema import (
get_api_brain_definition_as_json_schema, get_api_brain_definition_as_json_schema,
) )
logger = get_logger(__name__)
class APIBrainQA( class APIBrainQA(
QABaseBrainPicking, QABaseBrainPicking,
@ -53,7 +55,6 @@ class APIBrainQA(
brain_id: UUID, brain_id: UUID,
): ):
yield "🧠<Deciding what to do>🧠" yield "🧠<Deciding what to do>🧠"
response = completion( response = completion(
model=self.model, model=self.model,
temperature=self.temperature, temperature=self.temperature,
@ -73,8 +74,7 @@ class APIBrainQA(
if finish_reason == "stop": if finish_reason == "stop":
break break
if "function_call" in chunk.choices[0].delta and chunk.choices[0].delta["function_call"]:
if "function_call" in chunk.choices[0].delta:
if "name" in chunk.choices[0].delta["function_call"]: if "name" in chunk.choices[0].delta["function_call"]:
function_call["name"] = chunk.choices[0].delta["function_call"][ function_call["name"] = chunk.choices[0].delta["function_call"][
"name" "name"
@ -86,10 +86,12 @@ class APIBrainQA(
elif finish_reason == "function_call": elif finish_reason == "function_call":
try: try:
logger.info(f"Function call: {function_call}")
arguments = json.loads(function_call["arguments"]) arguments = json.loads(function_call["arguments"])
except Exception: except Exception:
arguments = {} arguments = {}
yield f"🧠<Calling API with arguments {arguments} and brain id {brain_id}>🧠" yield f"🧠<Calling {brain_id} with arguments {arguments}>🧠"
try: try:
api_call_response = call_brain_api( api_call_response = call_brain_api(
@ -106,7 +108,7 @@ class APIBrainQA(
messages.append( messages.append(
{ {
"role": "function", "role": "function",
"name": function_call["name"], "name": str(brain_id),
"content": api_call_response, "content": api_call_response,
} }
) )

View File

@ -7,6 +7,7 @@ from langchain.callbacks.streaming_aiter import AsyncIteratorCallbackHandler
from langchain.chains import ConversationalRetrievalChain, LLMChain from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.question_answering import load_qa_chain from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatLiteLLM from langchain.chat_models import ChatLiteLLM
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms.base import BaseLLM from langchain.llms.base import BaseLLM
from langchain.prompts.chat import ( from langchain.prompts.chat import (
@ -84,8 +85,13 @@ class QABaseBrainPicking(BaseModel):
] ]
@property @property
def embeddings(self) -> OpenAIEmbeddings: def embeddings(self):
return OpenAIEmbeddings() # pyright: ignore reportPrivateUsage=none if self.brain_settings.ollama_api_base_url:
return OllamaEmbeddings(
base_url=self.brain_settings.ollama_api_base_url
) # pyright: ignore reportPrivateUsage=none
else:
return OpenAIEmbeddings()
supabase_client: Optional[Client] = None supabase_client: Optional[Client] = None
vector_store: Optional[CustomSupabaseVectorStore] = None vector_store: Optional[CustomSupabaseVectorStore] = None
@ -143,6 +149,11 @@ class QABaseBrainPicking(BaseModel):
:param callbacks: Callbacks to be used for streaming :param callbacks: Callbacks to be used for streaming
:return: Language model instance :return: Language model instance
""" """
api_base = None
if self.brain_settings.ollama_api_base_url and model.startswith("ollama"):
api_base = self.brain_settings.ollama_api_base_url
return ChatLiteLLM( return ChatLiteLLM(
temperature=temperature, temperature=temperature,
max_tokens=self.max_tokens, max_tokens=self.max_tokens,
@ -150,6 +161,7 @@ class QABaseBrainPicking(BaseModel):
streaming=streaming, streaming=streaming,
verbose=False, verbose=False,
callbacks=callbacks, callbacks=callbacks,
api_base= api_base
) # pyright: ignore reportPrivateUsage=none ) # pyright: ignore reportPrivateUsage=none
def _create_prompt_template(self): def _create_prompt_template(self):

View File

@ -7,6 +7,7 @@ from langchain.callbacks.streaming_aiter import AsyncIteratorCallbackHandler
from langchain.chains import LLMChain from langchain.chains import LLMChain
from langchain.chat_models import ChatLiteLLM from langchain.chat_models import ChatLiteLLM
from langchain.chat_models.base import BaseChatModel from langchain.chat_models.base import BaseChatModel
from models import BrainSettings # Importing settings related to the 'brain'
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from logger import get_logger from logger import get_logger
from models.chats import ChatQuestion from models.chats import ChatQuestion
@ -30,6 +31,7 @@ SYSTEM_MESSAGE = "Your name is Quivr. You're a helpful assistant. If you don't k
class HeadlessQA(BaseModel): class HeadlessQA(BaseModel):
brain_settings = BrainSettings()
model: str model: str
temperature: float = 0.0 temperature: float = 0.0
max_tokens: int = 2000 max_tokens: int = 2000
@ -78,13 +80,18 @@ class HeadlessQA(BaseModel):
:param callbacks: Callbacks to be used for streaming :param callbacks: Callbacks to be used for streaming
:return: Language model instance :return: Language model instance
""" """
api_base = None
if self.brain_settings.ollama_api_base_url and model.startswith("ollama"):
api_base = self.brain_settings.ollama_api_base_url
return ChatLiteLLM( return ChatLiteLLM(
temperature=0.1, temperature=temperature,
model=model, model=model,
streaming=streaming, streaming=streaming,
verbose=True, verbose=True,
callbacks=callbacks, callbacks=callbacks,
max_tokens=self.max_tokens, max_tokens=self.max_tokens,
api_base=api_base,
) )
def _create_prompt_template(self): def _create_prompt_template(self):

View File

@ -1,9 +1,13 @@
from langchain.embeddings.openai import OpenAIEmbeddings
from models.databases.supabase.supabase import SupabaseDB from models.databases.supabase.supabase import SupabaseDB
from pydantic import BaseSettings from pydantic import BaseSettings
from supabase.client import Client, create_client from supabase.client import Client, create_client
from vectorstore.supabase import SupabaseVectorStore from vectorstore.supabase import SupabaseVectorStore
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from logger import get_logger
logger = get_logger(__name__)
class BrainRateLimiting(BaseSettings): class BrainRateLimiting(BaseSettings):
max_brain_per_user: int = 5 max_brain_per_user: int = 5
@ -15,6 +19,7 @@ class BrainSettings(BaseSettings):
supabase_service_key: str supabase_service_key: str
resend_api_key: str = "null" resend_api_key: str = "null"
resend_email_address: str = "brain@mail.quivr.app" resend_email_address: str = "brain@mail.quivr.app"
ollama_api_base_url: str = None
class ContactsSettings(BaseSettings): class ContactsSettings(BaseSettings):
@ -39,11 +44,14 @@ def get_supabase_db() -> SupabaseDB:
return SupabaseDB(supabase_client) return SupabaseDB(supabase_client)
def get_embeddings() -> OpenAIEmbeddings: def get_embeddings():
settings = BrainSettings() # pyright: ignore reportPrivateUsage=none settings = BrainSettings() # pyright: ignore reportPrivateUsage=none
embeddings = OpenAIEmbeddings( if settings.ollama_api_base_url:
openai_api_key=settings.openai_api_key embeddings = OllamaEmbeddings(
) # pyright: ignore reportPrivateUsage=none base_url=settings.ollama_api_base_url,
) # pyright: ignore reportPrivateUsage=none
else:
embeddings = OpenAIEmbeddings() # pyright: ignore reportPrivateUsage=none
return embeddings return embeddings

View File

@ -1,8 +1,8 @@
# pymupdf==1.22.3 # pymupdf==1.22.3
langchain==0.0.332 langchain==0.0.341
litellm==0.13.2 litellm==1.7.7
# Markdown==3.4.4 # Markdown==3.4.4
openai==0.27.8 openai==1.1.1
GitPython==3.1.36 GitPython==3.1.36
pdf2image==1.16.3 pdf2image==1.16.3
pypdf==3.9.0 pypdf==3.9.0
@ -36,3 +36,4 @@ python-dotenv
pytest-mock pytest-mock
pytest-celery pytest-celery
pytesseract==0.3.10 pytesseract==0.3.10
async_generator

View File

@ -1,9 +1,9 @@
--- ---
sidebar_position: 2 sidebar_position: 2
title: Run Quivr fully locally title: Run Quivr locally with Ollama
--- ---
# Using Quivr fully locally # Using Quivr fully locally with Ollama
## Headers ## Headers
@ -15,13 +15,7 @@ The following is a guide to set up everything for using Quivr locally:
- [Embeddings](#embeddings) - [Embeddings](#embeddings)
- [LLM for inference](#llm) - [LLM for inference](#llm)
It is a first, working setup, but a lot of work has to be done to e.g. find the appropriate settings for the model. The guide was put together in collaboration with members of the Quivr Discord, **Using Quivr fully locally** thread. That is a good place to discuss it. https://discord.com/invite/HUpRgp2HG8
Importantly, this will currently only work on tag v0.0.46.
The guide was put together in collaboration with members of the Quivr Discord, **Using Quivr fully locally** thread. That is a good place to discuss it.
This worked for me, but I sometimes got strange results (the output contains repeating answers/questions). Maybe because `stopping_criteria=stopping_criteria` must be uncommented in `transformers.pipeline`. Will update this page as I continue learning.
<a name="database"/> <a name="database"/>
@ -37,224 +31,107 @@ Troubleshooting:
<a name="embeddings"/> <a name="embeddings"/>
## Local embeddings ## Ollama
First, let's get local embeddings to work with GPT4All. Instead of relying on OpenAI for generating embeddings of both the prompt and the documents we upload, we will use a local LLM for this. Ollama is a tool that allows you to run LLMs locally. We are using it to run Llama2, MistralAI and others locally.
Remove any existing data from the postgres database: ### Install Ollama
- `supabase/docker $ docker compose down -v` Install Ollama from their [website](https://ollama.ai/).
- `supabase/docker $ rm -rf volumes/db/data/`
- `supabase/docker $ docker compose up -d`
Change the vector dimensions in the necessary Quivr SQL files: Then run the following command to run Ollama in the background:
- Replace all occurrences of 1536 by 768, in Quivr's `scripts\tables.sql` ```bash
- Run tables.sql in the Supabase web ui SQL editor: http://localhost:8000 ollama run llama2
Change the Quivr code to use local LLM (GPT4All) and local embeddings:
- add code to `backend\core\llm\private_gpt4all.py`
```python
from langchain.embeddings import HuggingFaceEmbeddings
...
def embeddings(self) -> HuggingFaceEmbeddings:
emb = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-mpnet-base-v2",
model_kwargs={'device': 'cuda'},
encode_kwargs={'normalize_embeddings': False}
)
return emb
``` ```
Note that there may be better models out there for generating the embeddings: https://huggingface.co/spaces/mteb/leaderboard ### Update Quivr to use Ollama
Update Quivr `backend/core/.env`'s Private LLM Variables: In order to have Quivr use Ollama we need to update the tables in Supabase to support the embedding format that Ollama uses. Ollama uses by default llama 2 that produces 4096 dimensional embeddings while OpenAI API produces 1536 dimensional embeddings.
```
#Private LLM Variables Go to supabase and delete your table vectors and create a new table vectors with the following schema:
PRIVATE=True
MODEL_PATH=./local_models/ggml-gpt4all-j-v1.3-groovy.bin ```sql
CREATE TABLE IF NOT EXISTS vectors (
id UUID DEFAULT uuid_generate_v4() PRIMARY KEY,
content TEXT,
file_sha1 TEXT,
metadata JSONB,
embedding VECTOR(4096)
);
``` ```
Download GPT4All model: Then run the following command to update the table:
- `$ cd backend/core/local_models/` ```sql
- `wget https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin` CREATE OR REPLACE FUNCTION match_vectors(query_embedding VECTOR(4096), match_count INT, p_brain_id UUID)
RETURNS TABLE(
Ensure the Quivr backend docker container has CUDA and the GPT4All package: id UUID,
brain_id UUID,
``` content TEXT,
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel metadata JSONB,
#FROM python:3.11-bullseye embedding VECTOR(4096),
similarity FLOAT
ARG DEBIAN_FRONTEND=noninteractive ) LANGUAGE plpgsql AS $$
ENV DEBIAN_FRONTEND=noninteractive #variable_conflict use_column
BEGIN
RUN pip install gpt4all RETURN QUERY
SELECT
vectors.id,
brains_vectors.brain_id,
vectors.content,
vectors.metadata,
vectors.embedding,
1 - (vectors.embedding <=> query_embedding) AS similarity
FROM
vectors
INNER JOIN
brains_vectors ON vectors.id = brains_vectors.vector_id
WHERE brains_vectors.brain_id = p_brain_id
ORDER BY
vectors.embedding <=> query_embedding
LIMIT match_count;
END;
$$;
``` ```
Modify the docker-compose yml file (for backend container). The following example is for using 2 GPUs: This will update the match_vectors function to use the new embedding format.
```
... ## Add Ollama Model to Quivr
network_mode: host
deploy: Now that you have your model running locally, you need to add it to Quivr.
resources:
reservations: In order to allow the user to choose between the OpenAI API and Ollama, we need to add a new model to the Quivr backend.
devices:
- driver: nvidia Go to supabase and in the table `user_settings` either add by default or to your user the following value to the `models` column:
count: 2
capabilities: [gpu] ```json
[
"gpt-3.5-turbo-1106",
"ollama/llama2"
]
``` ```
Install nvidia container toolkit on the host, https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html: This will add the Ollama model to the list of models that the user can choose from.
By adding this as default, it means that all new users will have this model by default. If you want to add it to your user only, you can add it to the `models` column in the `user_settings` table. In order for the change to take effect if you put as default you need to drop the entire table with the following command:
$ wget https://nvidia.github.io/nvidia-docker/gpgkey --no-check-certificate
$ sudo apt-key add gpgkey
$ distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
$ curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
$ sudo apt-get update
$ sudo apt-get install -y nvidia-container-toolkit ```sql
DROP TABLE user_settings;
$ nvidia-ctk --version
$ sudo systemctl restart docker
``` ```
At this moment, if we try to upload a pdf, we get an error:
``` ## Env Variables
backend-core | 1989-01-01 21:51:41,211 [ERROR] utils.vectors: Error creating vector for document {'code': '22000', 'details': None, 'hint': None, 'message': 'expected 768 dimensions, not 1536'}
In order to have Quivr use Ollama we need to update the env variables.
Go to `backend/.env` and add the following env variables:
```bash
OLLAMA_API_BASE_URL=http://host.docker.internal:11434
``` ```
This can be remedied by using local embeddings for document embeddings. In backend/core/utils/vectors.py, replace: Then go to the Quivr and you are good to go.
```python
# def create_vector(self, doc, user_openai_api_key=None):
# logger.info("Creating vector for document")
# logger.info(f"Document: {doc}")
# if user_openai_api_key:
# self.commons["documents_vector_store"]._embedding = OpenAIEmbeddings(
# openai_api_key=user_openai_api_key
# ) # pyright: ignore reportPrivateUsage=none
# try:
# sids = self.commons["documents_vector_store"].add_documents([doc])
# if sids and len(sids) > 0:
# return sids
# except Exception as e:
# logger.error(f"Error creating vector for document {e}")
def create_vector(self, doc, user_openai_api_key=None):
logger.info("Creating vector for document")
logger.info(f"Document: {doc}")
self.commons["documents_vector_store"]._embedding = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-mpnet-base-v2",
model_kwargs={'device': 'cuda'},
encode_kwargs={'normalize_embeddings': False}
) # pyright: ignore reportPrivateUsage=none
logger.info('||| creating embedding')
try:
sids = self.commons["documents_vector_store"].add_documents([doc])
if sids and len(sids) > 0:
return sids
except Exception as e:
logger.error(f"Error creating vector for document {e}")
```
<a name="llm"/>
## Local LLM
The final step is to use a local model from HuggingFace for inference. (The HF token is optional, only required for certain models on HF.)
Update the Quivr backend dockerfile:
```
ENV HUGGINGFACEHUB_API_TOKEN=hf_XXX
RUN pip install accelerate
```
Update the `private_gpt4all.py` file as follows:
```python
import langchain
langchain.debug = True
langchain.verbose = True
import os
import transformers
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
...
model_id = "stabilityai/StableBeluga-13B"
...
def _create_llm(
self,
model,
streaming=False,
callbacks=None,
) -> BaseLLM:
"""
Override the _create_llm method to enforce the use of a private model.
:param model: Language model name to be used.
:param streaming: Whether to enable streaming of the model
:param callbacks: Callbacks to be used for streaming
:return: Language model instance
"""
model_path = self.model_path
logger.info("Using private model: %s", model)
logger.info("Streaming is set to %s", streaming)
logger.info("--- model %s",model)
logger.info("--- model path %s",model_path)
model_id = "stabilityai/StableBeluga-13B"
llm = transformers.AutoModelForCausalLM.from_pretrained(
model_id,
use_cache=True,
load_in_4bit=True,
device_map='auto',
#use_auth_token=hf_auth
)
logger.info('<<< transformers.AutoModelForCausalLM.from_pretrained')
llm.eval()
logger.info('<<< eval')
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_id,
use_auth_token=hf_auth
)
logger.info('<<< transformers.AutoTokenizer.from_pretrained')
generate_text = transformers.pipeline(
model=llm, tokenizer=tokenizer,
return_full_text=True, # langchain expects the full text
task='text-generation',
# we pass model parameters here too
#stopping_criteria=stopping_criteria, # without this model rambles during chat
temperature=0.5, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
repetition_penalty=1.1 # without this output begins repeating
)
logger.info('<<< generate_text = transformers.pipeline(')
result = HuggingFacePipeline(pipeline=generate_text)
logger.info('<<< generate_text = transformers.pipeline(')
logger.info("<<< created llm HuggingFace")
return result
```

View File

@ -13,12 +13,12 @@ if (SENTRY_DSN) {
dsn: SENTRY_DSN, dsn: SENTRY_DSN,
// Adjust this value in production, or use tracesSampler for greater control // Adjust this value in production, or use tracesSampler for greater control
tracesSampleRate: 1, sampleRate: 0.1,
// Setting this option to true will print useful information to the console while you're setting up Sentry. // Setting this option to true will print useful information to the console while you're setting up Sentry.
debug: false, debug: false,
replaysOnErrorSampleRate: 1.0, replaysOnErrorSampleRate: 0.1,
// This sets the sample rate to be 10%. You may want this to be 100% while // This sets the sample rate to be 10%. You may want this to be 100% while
// in development and sample at a lower rate in production // in development and sample at a lower rate in production

View File

@ -9,7 +9,9 @@ if (SENTRY_DSN) {
dsn: SENTRY_DSN, dsn: SENTRY_DSN,
// Adjust this value in production, or use tracesSampler for greater control // Adjust this value in production, or use tracesSampler for greater control
tracesSampleRate: 1, tracesSampleRate: 0.05,
sampleRate: 0.05,
// Setting this option to true will print useful information to the console while you're setting up Sentry. // Setting this option to true will print useful information to the console while you're setting up Sentry.
debug: false, debug: false,

View File

@ -9,7 +9,9 @@ if (SENTRY_DSN) {
dsn: SENTRY_DSN, dsn: SENTRY_DSN,
// Adjust this value in production, or use tracesSampler for greater control // Adjust this value in production, or use tracesSampler for greater control
tracesSampleRate: 1, tracesSampleRate: 0.1,
sampleRate: 0.1,
// Setting this option to true will print useful information to the console while you're setting up Sentry. // Setting this option to true will print useful information to the console while you're setting up Sentry.
debug: false, debug: false,