diff --git a/backend/tests/ragas_evaluation/run_evaluation.py b/backend/tests/ragas_evaluation/run_evaluation.py
new file mode 100644
index 000000000..e3d4b834c
--- /dev/null
+++ b/backend/tests/ragas_evaluation/run_evaluation.py
@@ -0,0 +1,142 @@
+import argparse
+import os
+from dotenv import load_dotenv
+import sys
+
+# Add the current directory to the Python path
+sys.path.append(os.getcwd())
+# Load environment variables from .env file
+load_dotenv()
+
+import pandas as pd
+import uuid
+import glob
+import ragas
+from datasets import Dataset
+
+from celery_worker import process_file_and_notify
+from repository.files.upload_file import upload_file_storage
+from modules.knowledge.dto.inputs import CreateKnowledgeProperties
+from modules.knowledge.service.knowledge_service import KnowledgeService
+from modules.brain.service.brain_service import BrainService
+from modules.brain.rags.quivr_rag import QuivrRAG
+from ragas import evaluate
+from ragas.embeddings.base import LangchainEmbeddingsWrapper
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langchain_core.runnables.base import RunnableSerializable
+
+
+def main(testset_path, input_folder, output_folder, model, context_size, metrics):
+    # Create a fake user and brain
+    uuid_value = uuid.uuid4()
+    brain_service = BrainService()
+    knowledge_service = KnowledgeService()
+    brain = brain_service.create_brain(user_id=uuid_value, brain=None)
+    brain_id = brain.brain_id
+
+    for document_path in glob.glob(input_folder + '/*'):
+        # Process each document here
+        process_document(knowledge_service, brain_id, document_path)
+
+    # Load test data
+    test_data = pd.read_json(testset_path)
+
+    # Create a QuivrRAG chain
+    knowledge_qa = QuivrRAG(
+        model=model,
+        brain_id=str(brain_id),
+        chat_id=str(uuid.uuid4()),
+        streaming=False,
+        max_input=context_size,
+        max_tokens=1000
+    )
+    brain_chain = knowledge_qa.get_chain()
+
+    # run langchain RAG
+    response_dataset = generate_replies(test_data, brain_chain)
+
+    ragas_metrics = [getattr(ragas.metrics, metric) for metric in metrics]
+    score = evaluate(response_dataset, metrics=ragas_metrics,
+                     llm=ChatOpenAI(model="gpt-4", temperature=0.1),
+                     embeddings=LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large", dimensions=3072))).to_pandas()
+
+    score.to_json(output_folder + "/score.json", orient="records")
+    for metric in metrics:
+        print(f"{metric} mean score: {score[metric].mean()}")
+        print(f"{metric} median score: {score[metric].median()}")
+
+def process_document(knowledge_service: KnowledgeService, brain_id: uuid.UUID, document_path: str) -> None:
+    """
+    Process a document by uploading it to the file storage, adding knowledge to the knowledge service,
+    and then processing the file and sending a notification.
+
+    Args:
+        knowledge_service: The knowledge service object used to add knowledge.
+        brain_id: The ID of the brain.
+        document_path: The path of the document to be processed.
+
+    Returns:
+        None
+    """
+    filename = document_path.split("/")[-1]
+    filename_with_brain_id = str(brain_id) + "/" + str(filename)
+    file_in_storage = upload_file_storage(document_path, filename_with_brain_id)
+
+    knowledge_to_add = CreateKnowledgeProperties(
+        brain_id=brain_id,
+        file_name=filename,
+        extension=os.path.splitext(
+            filename  # pyright: ignore reportPrivateUsage=none
+        )[-1].lower(),
+    )
+
+    added_knowledge = knowledge_service.add_knowledge(knowledge_to_add)
+    print(f"Knowledge {added_knowledge} added successfully")
+
+    process_file_and_notify(
+        file_name=filename_with_brain_id,
+        file_original_name=filename,
+        brain_id=brain_id,
+        notification_id=None,
+    )
+
+def generate_replies(test_data: pd.DataFrame, brain_chain: RunnableSerializable) -> Dataset:
+    """
+    Generate replies for a given test data using a brain chain.
+
+    Args:
+        test_data (pandas.DataFrame): The test data containing questions and ground truths.
+        brain_chain (RunnableSerializable): The brain chain to use for generating replies.
+
+    Returns:
+        Dataset: A dataset containing the generated replies, including questions, answers, contexts, and ground truths.
+    """
+    answers = []
+    contexts = []
+    test_questions = test_data.question.tolist()
+    test_groundtruths = test_data.ground_truth.tolist()
+
+    for question in test_questions:
+        response = brain_chain.invoke({"question": question})
+        answers.append(response["answer"].content)
+        contexts.append([context.page_content for context in response["docs"]])
+
+    return Dataset.from_dict({
+        "question": test_questions,
+        "answer": answers,
+        "contexts": contexts,
+        "ground_truth": test_groundtruths
+    })
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Run Ragas evaluation on a test dataset')
+    parser.add_argument('--input_folder', type=str, required=True, help='Path to the testset documents folder')
+    parser.add_argument('--output_folder', type=str, default='./', help='Path to the output folder')
+    parser.add_argument('--testset_path', type=str, required=True, help='Path to the testset JSON file')
+    parser.add_argument('--model', type=str, default='gpt-3.5-turbo-0125', help='Model to use')
+    parser.add_argument('--context_size', type=int, default=4000, help='Context size for the model')
+    parser.add_argument('--metrics', type=str, nargs='+', choices=['answer_correctness', 'context_relevancy', 'context_precision', 'faithfulness', 'answer_similarity'], default=['answer_correctness'], help='Metrics to evaluate')
+    args = parser.parse_args()
+
+    main(args.testset_path, args.input_folder, args.output_folder, args.model, args.context_size, args.metrics)
\ No newline at end of file
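
Once the diff is applied, the script is driven entirely by the argparse flags defined above. One possible invocation, assuming it is run from the backend/ directory (so that celery_worker and the modules package resolve via the sys.path.append(os.getcwd()) call) with a .env file providing the required credentials; the ./eval_docs and ./testset.json paths below are placeholders, not part of the change:

    python tests/ragas_evaluation/run_evaluation.py --input_folder ./eval_docs --testset_path ./testset.json --metrics answer_correctness faithfulness

With the defaults, this writes score.json to the current directory and prints the mean and median of each requested Ragas metric.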