Mirror of https://github.com/StanGirard/quivr.git, synced 2024-11-22 20:09:40 +03:00
feat(backend): add RAG evaluation using Ragas
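For reference, the script would be invoked roughly as follows (run from the backend directory so that celery_worker and the modules packages resolve; paths are placeholders):

    python tests/ragas_evaluation/run_evaluation.py --testset_path ./testset.json --input_folder ./eval_docs --output_folder ./ --metrics answer_correctness faithfulness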
This commit is contained in:
parent
d9d70a7ade
commit
5350e0a071
backend/tests/ragas_evaluation/run_evaluation.py (new file, 142 lines)
@@ -0,0 +1,142 @@
import argparse
import os
from dotenv import load_dotenv
import sys

# Add the current directory to the Python path
sys.path.append(os.getcwd())
# Load environment variables from .env file
load_dotenv()

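# The application imports below follow load_dotenv() so that environment-based
# settings are already loaded when the backend modules are imported.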
import pandas as pd
import uuid
import glob
import ragas
from datasets import Dataset

from celery_worker import process_file_and_notify
from repository.files.upload_file import upload_file_storage
from modules.knowledge.dto.inputs import CreateKnowledgeProperties
from modules.knowledge.service.knowledge_service import KnowledgeService
from modules.brain.service.brain_service import BrainService
from modules.brain.rags.quivr_rag import QuivrRAG
from ragas import evaluate
from ragas.embeddings.base import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.runnables.base import RunnableSerializable


def main(testset_path, input_folder, output_folder, model, context_size, metrics):
    # Create a fake user and brain
    uuid_value = uuid.uuid4()
    brain_service = BrainService()
    knowledge_service = KnowledgeService()
    brain = brain_service.create_brain(user_id=uuid_value, brain=None)
    brain_id = brain.brain_id

    # Upload and process each document from the input folder into the brain
    for document_path in glob.glob(input_folder + '/*'):
        process_document(knowledge_service, brain_id, document_path)

    # Load test data
    test_data = pd.read_json(testset_path)
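    # test_data is expected to contain 'question' and 'ground_truth' columns
    # (e.g. a JSON array of records with those fields).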

    # Create a QuivrRAG chain
    knowledge_qa = QuivrRAG(
        model=model,
        brain_id=str(brain_id),
        chat_id=str(uuid.uuid4()),
        streaming=False,
        max_input=context_size,
        max_tokens=1000,
    )
    brain_chain = knowledge_qa.get_chain()

    # Run the RAG chain over the test questions
    response_dataset = generate_replies(test_data, brain_chain)

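    # evaluate() scores each question/answer/contexts/ground_truth row with the
    # selected Ragas metrics; to_pandas() yields one row per question with a
    # column per metric.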
    ragas_metrics = [getattr(ragas.metrics, metric) for metric in metrics]
    score = evaluate(
        response_dataset,
        metrics=ragas_metrics,
        llm=ChatOpenAI(model="gpt-4", temperature=0.1),
        embeddings=LangchainEmbeddingsWrapper(
            OpenAIEmbeddings(model="text-embedding-3-large", dimensions=3072)
        ),
    ).to_pandas()

    score.to_json(output_folder + "/score.json", orient="records")
    for metric in metrics:
        print(f"{metric} mean score: {score[metric].mean()}")
        print(f"{metric} median score: {score[metric].median()}")


def process_document(knowledge_service: KnowledgeService, brain_id: uuid.UUID, document_path: str) -> None:
    """
    Process a document by uploading it to the file storage, adding knowledge to the knowledge service,
    and then processing the file and sending a notification.

    Args:
        knowledge_service: The knowledge service object used to add knowledge.
        brain_id: The ID of the brain.
        document_path: The path of the document to be processed.

    Returns:
        None
    """
    filename = document_path.split("/")[-1]
    filename_with_brain_id = str(brain_id) + "/" + str(filename)
    file_in_storage = upload_file_storage(document_path, filename_with_brain_id)

    knowledge_to_add = CreateKnowledgeProperties(
        brain_id=brain_id,
        file_name=filename,
        extension=os.path.splitext(
            filename  # pyright: ignore reportPrivateUsage=none
        )[-1].lower(),
    )

    added_knowledge = knowledge_service.add_knowledge(knowledge_to_add)
    print(f"Knowledge {added_knowledge} added successfully")

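    # process_file_and_notify is called directly here (not queued through Celery),
    # so the document is parsed and indexed synchronously before evaluation starts.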
    process_file_and_notify(
        file_name=filename_with_brain_id,
        file_original_name=filename,
        brain_id=brain_id,
        notification_id=None,
    )


def generate_replies(test_data: pd.DataFrame, brain_chain: RunnableSerializable) -> Dataset:
    """
    Generate replies to the test questions using the given brain chain.

    Args:
        test_data (pandas.DataFrame): The test data containing questions and ground truths.
        brain_chain (RunnableSerializable): The brain chain to use for generating replies.

    Returns:
        Dataset: A dataset containing the generated replies, including questions, answers, contexts, and ground truths.
    """
    answers = []
    contexts = []
    test_questions = test_data.question.tolist()
    test_groundtruths = test_data.ground_truth.tolist()

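    # Each chain invocation is expected to return a dict with an "answer" message
    # and the retrieved "docs"; collect the answer text and the source page contents.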
    for question in test_questions:
        response = brain_chain.invoke({"question": question})
        answers.append(response["answer"].content)
        contexts.append([context.page_content for context in response["docs"]])

    return Dataset.from_dict({
        "question": test_questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": test_groundtruths
    })


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Run Ragas evaluation on a test dataset')
    parser.add_argument('--input_folder', type=str, required=True, help='Path to the testset documents folder')
    parser.add_argument('--output_folder', type=str, default='./', help='Path to the output folder')
    parser.add_argument('--testset_path', type=str, required=True, help='Path to the testset JSON file')
    parser.add_argument('--model', type=str, default='gpt-3.5-turbo-0125', help='Model to use')
    parser.add_argument('--context_size', type=int, default=4000, help='Context size for the model')
    parser.add_argument('--metrics', type=str, nargs='+', choices=['answer_correctness', 'context_relevancy', 'context_precision', 'faithfulness', 'answer_similarity'], default=['answer_correctness'], help='Metrics to evaluate')
    args = parser.parse_args()

    main(args.testset_path, args.input_folder, args.output_folder, args.model, args.context_size, args.metrics)