feat: ragas improved testing (#2611)

# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
Commit b1fbbc8b02 (parent c287fceb5b)
Stan Girard, 2024-05-23 10:36:10 +02:00, committed by GitHub
8 changed files with 74 additions and 4 deletions


@@ -329,7 +329,7 @@ class QuivrRAG(BaseModel):
             model=self.model,
             temperature=self.temperature,
             api_base=api_base,
-        )
+        )  # pyright: ignore reportPrivateUsage=none
         if self.model_compatible_with_function_calling():
             # And finally, we do the part that returns the answers

@@ -1,4 +1,5 @@
 import argparse
+import json
 import os
 import sys
@@ -78,6 +79,7 @@ def main(
    score.to_json(output_folder + "/score.json", orient="records")
    for metric in metrics:
        print(f"{metric} scores: {score[metric]}")
        print(f"{metric} mean score: {score[metric].mean()}")
        print(f"{metric} median score: {score[metric].median()}")
    # Cleanup if a new brain was created
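For reference, the printing above assumes `score` behaves like a pandas DataFrame with one column per metric (ragas results can be converted with `.to_pandas()`). A self-contained sketch of the same summary step, with an illustrative helper name and sample data that are not part of this PR:

```python
# Hedged sketch: `score` is assumed to be a pandas DataFrame of per-question
# ragas scores; summarize_scores and the sample values are illustrative only.
import pandas as pd


def summarize_scores(score: pd.DataFrame, metrics: list[str], output_folder: str) -> None:
    # Persist raw per-question scores, then print simple aggregates per metric.
    score.to_json(output_folder + "/score.json", orient="records")
    for metric in metrics:
        print(f"{metric} mean score: {score[metric].mean()}")
        print(f"{metric} median score: {score[metric].median()}")


summarize_scores(
    pd.DataFrame({"answer_similarity": [0.91, 0.84, 0.77]}),
    metrics=["answer_similarity"],
    output_folder=".",
)
```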
@@ -143,7 +145,14 @@ def generate_replies(
     for question in test_questions:
         response = brain_chain.invoke({"question": question, "chat_history": []})
-        answers.append(response["answer"].content)
+        cited_answer_data = response["answer"].additional_kwargs["tool_calls"][0][
+            "function"
+        ]["arguments"]
+        cited_answer_obj = json.loads(cited_answer_data)
+        print(f"Answer: {cited_answer_obj['answer']}")
+        answers.append(cited_answer_obj["answer"])
+        print(f"Context: {cited_answer_obj}")
+        print(response)
         contexts.append([context.page_content for context in response["docs"]])
     return Dataset.from_dict(
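The substantive change here is that answers are no longer taken from `response["answer"].content`; instead the structured "cited answer" is pulled out of the model's tool call. A minimal, standalone sketch of that parsing step, using a mocked message in place of the brain chain's real response (the payload shape matches the diff; the helper name is illustrative):

```python
import json
from types import SimpleNamespace


def extract_cited_answer(answer_message) -> str:
    # Function-calling models return structured output as a tool call whose
    # JSON-encoded "arguments" carry the answer field used for evaluation.
    tool_call = answer_message.additional_kwargs["tool_calls"][0]
    cited_answer_obj = json.loads(tool_call["function"]["arguments"])
    return cited_answer_obj["answer"]


# Mocked stand-in for response["answer"] returned by brain_chain.invoke(...).
fake_message = SimpleNamespace(
    additional_kwargs={
        "tool_calls": [
            {"function": {"arguments": json.dumps({"answer": "42", "citations": []})}}
        ]
    }
)
print(extract_cited_answer(fake_message))  # -> 42
```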
@@ -189,7 +198,7 @@ if __name__ == "__main__":
             "faithfulness",
             "answer_similarity",
         ],
-        default=["answer_correctness"],
+        default=["answer_similarity"],
         help="Metrics to evaluate",
     )
     parser.add_argument(
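The default metric moves from `answer_correctness` to `answer_similarity`. For context, a hedged sketch of how such metric names could be mapped onto ragas metric objects and run through `ragas.evaluate`; the `NAME_TO_METRIC` mapping and the `run_eval` helper are illustrative, not part of this PR:

```python
# Hedged sketch, assuming a ragas version that exposes these metric objects.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_correctness, answer_similarity, faithfulness

# Illustrative mapping from CLI metric names to ragas metric objects.
NAME_TO_METRIC = {
    "answer_correctness": answer_correctness,
    "answer_similarity": answer_similarity,
    "faithfulness": faithfulness,
}


def run_eval(dataset: Dataset, metric_names: list[str]):
    # evaluate() returns a Result; to_pandas() yields the per-question scores
    # that the script summarizes with mean/median above.
    result = evaluate(dataset, metrics=[NAME_TO_METRIC[name] for name in metric_names])
    return result.to_pandas()
```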

File diff suppressed because one or more lines are too long