mirror of https://github.com/StanGirard/quivr.git (synced 2024-12-24 20:03:41 +03:00)
feat(neurons): added class
This commit is contained in:
parent  d42f14f431
commit  dc6f610b26
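
In short: utils.vectors previously exposed a free function create_vector(commons, ...) that every caller had to thread the commons dependency through. This commit wraps create_vector, create_embedding, and similarity_search in a new Neurons class that holds commons as state, and updates the two call sites accordingly. A minimal before/after sketch of the call-site change, using the same names that appear in the hunks below:

    # before: commons passed explicitly on every call
    create_vector(commons, user.email, doc_with_metadata, user_openai_api_key)

    # after: commons bound once on the instance
    neurons = Neurons(commons=commons)
    neurons.create_vector(user.email, doc_with_metadata, user_openai_api_key)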
@@ -10,7 +10,7 @@ from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from utils.common import CommonsDep
 from utils.file import compute_sha1_from_content, compute_sha1_from_file
-from utils.vectors import create_summary, create_vector
+from utils.vectors import Neurons, create_summary


 async def process_file(commons: CommonsDep, file: UploadFile, loader_class, file_suffix, enable_summarization, user, user_openai_api_key):
@@ -52,7 +52,8 @@ async def process_file(commons: CommonsDep, file: UploadFile, loader_class, file_suffix, enable_summarization, user, user_openai_api_key):
         }
         doc_with_metadata = Document(
             page_content=doc.page_content, metadata=metadata)
-        create_vector(commons, user.email, doc_with_metadata, user_openai_api_key)
+        neurons = Neurons(commons=commons)
+        neurons.create_vector(user.email, doc_with_metadata, user_openai_api_key)
         # add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})

     # Remove the enable_summarization and ids
@@ -7,7 +7,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from parsers.common import file_already_exists_from_content
 from utils.common import CommonsDep
 from utils.file import compute_sha1_from_content
-from utils.vectors import create_vector
+from utils.vectors import Neurons


 async def process_github(commons: CommonsDep, repo, enable_summarization, user, supabase, user_openai_api_key):
@@ -44,7 +44,8 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, user, supabase, user_openai_api_key):
             page_content=doc.page_content, metadata=metadata)
         exist = await file_already_exists_from_content(supabase, doc.page_content.encode("utf-8"), user)
         if not exist:
-            create_vector(commons, user.email, doc_with_metadata, user_openai_api_key)
+            neurons = Neurons(commons=commons)
+            neurons.create_vector(user.email, doc_with_metadata, user_openai_api_key)
             print("Created vector for ", doc.metadata["file_name"])

     return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}
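
Note that both call sites construct the Neurons instance inside their per-document loop. Since the instance merely binds commons, it could just as well be created once and reused across documents; a sketch of that equivalent variant (the loop shape is illustrative, not the file's exact code):

    neurons = Neurons(commons=commons)  # bind commons once per upload
    for doc in documents:
        # ... build doc_with_metadata as in the hunks above ...
        neurons.create_vector(user.email, doc_with_metadata, user_openai_api_key)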
@@ -1,14 +1,43 @@
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.schema import Document
-from llm.qa import BrainPicking
+from llm.qa import BrainPicking, BrainSettings
 from llm.summarization import llm_evaluate_summaries, llm_summerize
 from logger import get_logger
 from models.chats import ChatMessage
+from pydantic import BaseModel
 from utils.common import CommonsDep

 logger = get_logger(__name__)

 # TO DO: Create classes or other to avoid having to specify commons in each one of these functions
+class Neurons(BaseModel):
+    commons: CommonsDep
+    settings = BrainSettings()
+
+    def create_vector(self, user_id, doc, user_openai_api_key=None):
+        logger.info(f"Creating vector for document")
+        logger.info(f"Document: {doc}")
+        if user_openai_api_key:
+            self.commons['documents_vector_store']._embedding = OpenAIEmbeddings(openai_api_key=user_openai_api_key)
+        try:
+            sids = self.commons['documents_vector_store'].add_documents([doc])
+            if sids and len(sids) > 0:
+                self.commons['supabase'].table("vectors").update({"user_id": user_id}).match({"id": sids[0]}).execute()
+        except Exception as e:
+            logger.error(f"Error creating vector for document {e}")
+
+    def create_embedding(self, content):
+        return self.commons['embeddings'].embed_query(content)
+
+    def similarity_search(self, query, table='match_summaries', top_k=5, threshold=0.5):
+        query_embedding = self.create_embedding(query)
+        summaries = self.commons['supabase'].rpc(
+            table, {'query_embedding': query_embedding,
+                    'match_count': top_k, 'match_threshold': threshold}
+        ).execute()
+        return summaries.data
+
+
 def create_summary(commons: CommonsDep, document_id, content, metadata):
     logger.info(f"Summarizing document {content[:100]}")
     summary = llm_summerize(content)
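
The new class, taken on its own, exposes three operations over the shared commons mapping. A minimal usage sketch, assuming commons provides the documents_vector_store, embeddings, and supabase entries the methods index into, and that doc_with_metadata is a langchain Document built as in the parser hunks above (the query string and email are illustrative):

    neurons = Neurons(commons=commons)

    # embed a document and tag the stored vector row with the user
    neurons.create_vector("user@example.com", doc_with_metadata)

    # embed a raw string
    query_embedding = neurons.create_embedding("What is in my second brain?")

    # top-5 matches above a 0.5 similarity threshold, via the match_summaries RPC
    matches = neurons.similarity_search("What is in my second brain?", top_k=5, threshold=0.5)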
@@ -22,77 +51,42 @@ def create_summary(commons: CommonsDep, document_id, content, metadata):
     commons['supabase'].table("summaries").update(
         {"document_id": document_id}).match({"id": sids[0]}).execute()


-def create_vector(commons: CommonsDep, user_id,doc, user_openai_api_key=None):
-    logger.info(f"Creating vector for document")
-    logger.info(f"Document: {doc}")
-    if user_openai_api_key:
-        commons['documents_vector_store']._embedding = OpenAIEmbeddings(openai_api_key=user_openai_api_key)
-    try:
-        sids = commons['documents_vector_store'].add_documents(
-            [doc])
-        if sids and len(sids) > 0:
-            commons['supabase'].table("vectors").update(
-                {"user_id": user_id}).match({"id": sids[0]}).execute()
-            # TODO: create entry in brains_vectors table with brain_id and vector_id
-    except Exception as e:
-        logger.error(f"Error creating vector for document {e}")
-
-
-def create_embedding(commons: CommonsDep, content):
-    return commons['embeddings'].embed_query(content)
-
-
-def similarity_search(commons: CommonsDep, query, table='match_summaries', top_k=5, threshold=0.5):
-    query_embedding = create_embedding(commons, query)
-    summaries = commons['supabase'].rpc(
-        table, {'query_embedding': query_embedding,
-                'match_count': top_k, 'match_threshold': threshold}
-    ).execute()
-    return summaries.data
-
-
-def get_answer(commons: CommonsDep, chat_message: ChatMessage, email: str, user_openai_api_key:str):
+def get_answer(commons: CommonsDep, chat_message: ChatMessage, email: str, user_openai_api_key: str):
     Brain = BrainPicking().init(chat_message.model, email)
     qa = Brain.get_qa(chat_message, user_openai_api_key)
+    neurons = Neurons(commons=commons)

     if chat_message.use_summarization:
         # 1. get summaries from the vector store based on question
-        summaries = similarity_search(commons,
-            chat_message.question, table='match_summaries')
+        summaries = neurons.similarity_search(chat_message.question, table='match_summaries')
         # 2. evaluate summaries against the question
         evaluations = llm_evaluate_summaries(
             chat_message.question, summaries, chat_message.model)
         # 3. pull in the top documents from summaries
         if evaluations:
             response = commons['supabase'].from_('vectors').select(
                 '*').in_('id', values=[e['document_id'] for e in evaluations]).execute()
         # 4. use top docs as additional context
         additional_context = '---\nAdditional Context={}'.format(
             '---\n'.join(data['content'] for data in response.data)
         ) + '\n'
         model_response = qa(
             {"question": additional_context + chat_message.question})
     else:
         transformed_history = []

         # Iterate through pairs in the history (assuming each user message is followed by an assistant message)
         for i in range(0, len(chat_message.history) - 1, 2):
             user_message = chat_message.history[i][1]
             assistant_message = chat_message.history[i + 1][1]
             transformed_history.append((user_message, assistant_message))
-        model_response = qa({"question": chat_message.question, "chat_history":transformed_history})
+        model_response = qa({"question": chat_message.question, "chat_history": transformed_history})

     answer = model_response['answer']

     # append sources (file_name) to answer
     if "source_documents" in answer:
         # logger.debug('Source Documents: %s', answer["source_documents"])
         sources = [
             doc.metadata["file_name"] for doc in answer["source_documents"]
             if "file_name" in doc.metadata]
         # logger.debug('Sources: %s', sources)
         if sources:
             files = dict.fromkeys(sources)
             # # shall provide file links until pages available
             # files = [f"[{f}](/explore/{f})" for f in files]
             answer = answer + "\n\nRef: " + "; ".join(files)

     return answer
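
One behavior worth spelling out in get_answer's non-summarization branch: chat_message.history is treated as a flat, alternating list of (role, text) turns, and the loop folds it into (user, assistant) pairs for the chain's chat_history input. A standalone sketch with illustrative data:

    history = [
        ("user", "Hi"), ("assistant", "Hello!"),
        ("user", "What is quivr?"), ("assistant", "A second-brain app."),
    ]

    transformed_history = []
    for i in range(0, len(history) - 1, 2):
        user_message = history[i][1]           # text of the user turn
        assistant_message = history[i + 1][1]  # text of the paired assistant turn
        transformed_history.append((user_message, assistant_message))

    # transformed_history == [("Hi", "Hello!"), ("What is quivr?", "A second-brain app.")]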